## Read Corpus

In [72]:
import os.path
import glob
import nltk

'''
Reads the RST-DT corpus data.
The function returns a list of sentences tokenized to words and index of an EDU break in each of these sentence

Attention : The internal folder structure of the corpus must not be altered. 
'''
def readCorpus(relDir, dataClass):    
    #go to the corpus dir  
    if((dataClass.upper() != "TRAINING") and (dataClass.upper() != "TEST")):
        raise ValueError("Value of 'dataClass' is incorrect. Select one of the following - 1)'TRAINING' 2)'TEST'")
    else:
        relDir +=  "/data/RSTtrees-WSJ-main-1.0/" + dataClass.upper() + "/"
        fileext = "*.edus"        
 
    absDir = os.path.join(os.getcwd(), relDir)
    
    if(os.path.isdir(absDir)):
        print("Reading corpus from "+ absDir)
    else:
        raise ValueError("The dir " + absDir +" is incorrect or doesnot exist. Please check check the value set in 'relDir' ")
            
    files = os.path.join(absDir, fileext)
    
    tokens = []
    edu_idx = []
    for fname in glob.glob(files):
        with open(fname, 'r', encoding='utf-8') as doc:
            sent=[]
            edu_boundary = []
            for edu in doc:
                # tokenize to words
                edu_tokens = nltk.word_tokenize(edu)
                
                if edu_tokens[-1] not in ["!", "?", ".", "..."]:
                    # join EDUs of a sentence
                    sent.extend(edu_tokens)
                    # remember EDU boundary indices
                    edu_boundary.append(len(sent) - 1)
                else:
                    sent.extend(edu_tokens)
                    tokens.append(sent)
                    edu_idx.append(edu_boundary)
                    # clear for next sentence
                    sent = []
                    edu_boundary = []
            
    return tokens,edu_idx

############################################################################################

tokenized_train_data,train_EDUs = readCorpus("../../RST_corpus", "Training")
tokenized_test_data,test_EDUs = readCorpus("../../RST_corpus", "Test")

### TEST CODE
# for i in range(len(tokenized_test_data)):
#     print("-------------------")
#     print(tokenized_test_data[i])
#     print("word count = {}".format(len(tokenized_test_data[i])))
#     print(test_EDUs[i])

Reading corpus from /media/roshanrane/DATA/Study/Masters/Workspace/ANLP/ANLP_assignmentFinal/repo/RST-parser/../../RST_corpus/data/RSTtrees-WSJ-main-1.0/TRAINING/
Reading corpus from /media/roshanrane/DATA/Study/Masters/Workspace/ANLP/ANLP_assignmentFinal/repo/RST-parser/../../RST_corpus/data/RSTtrees-WSJ-main-1.0/TEST/


## Word2Vec and Training Data

In [73]:
import gensim
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

#constants denoting the n-gram and output vectors used in the whole project 
N_GRAM = 4
B_ARRAY = np.array([1,0])
C_ARRAY = np.array([0,1])

'''Plot the Word2Vec in 2D for visualization'''
def w2v_visualizer(model):
    labels = []
    tokens = []
    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)
        
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)
    x = [value[0] for value in new_values]
    y = [value[1] for value in new_values] 
    
    plt.figure(figsize=(20, 16)) 
    
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i], xy=(x[i], y[i]), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
        
    plt.show()
    

'''Generate Word2Vec for all words in the corpus'''
def gen_w2v(tokenized_data, visualize = False):
    # Word2Vec - Convert each word token to a vector of size 100
    model = gensim.models.Word2Vec(tokenized_data, size= 100 , window=20, min_count=1)
    model.save("word2vec-RSTCorpus")
    if(visualize):
        w2v_visualizer(model)   
    return model
    
    
''' Takes Corpus extractions and EDU boundaries and converts it to input vectors and 
output vectors to feed to the Neural Network.
Concatenate the words in n-grams for input and calculates the Word Embeddings for each word.
Output is a 2-element vector : [1,-1] meaning EDU Break
                             : [-1,1] meaning no EDU Break '''
def extract_nnVec(tokenized_data, edu_lists, model, n_gram):
    nnData = []    
    for sent,EDUs in zip(tokenized_data, edu_lists) :
        
        #Extend the sentences with '...' This will ensure that one n-gram sequence 
        # exists for every word pair in the original sentence, with the word pair as the mid point in it.
        ext = ["." for i in range((n_gram//2) -1)]
        extended_sent = ext + sent + ext
#             print(" ".join(extended_sent))
            
            
        for i in range(len(extended_sent)- n_gram + 1):            
            #get wordvec of each word of n-gram and stack them
            nnDat_in = model.wv[extended_sent[i]]
            for j in range(1,n_gram):                
                nnDat_in = np.hstack((nnDat_in, model.wv[extended_sent[i+j]] + df.loc(i+j))
            
            #check if there is a EDU break in between the n-gram.
            mid_index = (i + n_gram//2 - 1)
            
            if(mid_index in EDUs):
#                 print("{} {} {} {}".format(sent[i],sent[i+1], sent[i+2], sent[i+3]))
                nnDat_out = B_ARRAY # EDU 'Break' class
            else:
                nnDat_out = C_ARRAY # EDU 'Continue' class
                
            nnData.append((nnDat_in, nnDat_out))
            
    return nnData
    
##########################################################################################

# word vectors are generated for the whole corpus 
w2v_model = gen_w2v(tokenized_train_data + tokenized_test_data)
#generate Vectors for training and test data
trainingVecs = extract_nnVec(tokenized_train_data, train_EDUs, w2v_model, n_gram = N_GRAM)
testVecs = extract_nnVec(tokenized_test_data, test_EDUs, w2v_model, n_gram = N_GRAM)

nn_in_dem = trainingVecs[0][0].shape[0]
nn_out_dem = 2

del w2v_model
# print("number of words = {}".format(len([word for sent in tokenized_train_data for word in sent])))
# print("number of sentences = ",len(tokenized_train_data))
print("number of training vecs = {} with {}-gram grouping ".format(len(trainingVecs), N_GRAM))

number of training vecs = 176315 with 4-gram grouping 



## Neural Network

In [91]:

'''Feed forward Neural Network with Stochastic Gradient descent '''

class myNNet():    
    ''' layers: A list containing denoting how many neurons each layer must contain. 
    The size of the list defines the number of layers.
    Example [100,500,200,2] imples the input layer has 100 neurons, there are 2 hidden layers with '''
    
    def __init__(self,layers):        
        #initialise the Weights and biases of the model
        np.random.seed(1)
        self.L = layers
        self.num_layers = len(layers)
        self.w = [] # weights
        self.b = [] # bias
        for i in range(len(layers) - 1):
            self.w.append(2 * np.random.random((layers[i], layers[i+1])) - 1)
            self.b.append(np.zeros((1,layers[i+1])))
    
    def __str__(self):
        
        note = "Neural Network Model containing:"
        for i in range(self.num_layers):
            if (i != self.num_layers-1):    
                note += ("\nlayer{} = {} Nodes {} Weights {} Biases".format(i, self.L[i], self.w[i].shape, self.b[i].shape))
            else:
                note += ("\nlayer{} = {} Nodes".format(i, self.L[i]))
        return note
    
    
    '''returns the softmax values of the layer '''
    def __sftmax(self,nodes):
        exps = np.exp(nodes - np.max(nodes))
        return exps / np.sum(exps)
    
    '''returns the derivative of the sigmoid function - in my case the tanh function '''
    def __dSigma(self,z):
        return (1 - (np.tanh(z)**2))
        
    
    '''Feeds the given input vector into the Neural network and returns the predicted output
    If getAllLayers is set True then all neurons and Z values are returned, this is needed while training. '''
    def predict(self, dat_in, getAllLayers = False):
        # input data is directly the neuron values of the 1st layer
        neurons = [dat_in.reshape(1,self.L[0])]
        z = []

        for L in range(self.num_layers - 1):
            z.append(neurons[L].dot(self.w[L]) + self.b[L])
            if(L != (self.num_layers - 2)):
                neurons.append( np.tanh(z[L]))
            else: #for the last layer perform softmax instead of tanH 
                neurons.append( self.__sftmax(z[L]))
        
        if(getAllLayers):
            return neurons, z
        else: 
            return neurons[-1]
    
    
    ''' Calculates the cost value in the model. This is used for checking the progress in the model training '''
    def calcCost(self, predicted, expected):
        totalCost = 0
        for p,e in zip(predicted, expected):
            totalCost += -(np.log(p)*e).sum()
        return totalCost/len(predicted)
    
    
    '''The main method where the model is trained using back propagation. 
    If debugPrintCycles is set to 0, nothing is printed. If it is set to any other value, 
    the various parameters are printed every debugPrintCycle times '''
    def train(self, data, passes, learn_rate = 0.01, debugPrintCycles = 0):        
        
        print("\nStarting training cycles...") 
        data_size = len(data)
        for i in range(passes):
            
            #create mini batches of randomly shuffled training data for stochastic gradient descent
            np.random.shuffle(data)
            num_of_mini_batches = len(str(data_size))            
            mini_batch_size = data_size//num_of_mini_batches           
            mini_batches = [data[i:i+mini_batch_size] for i in range(num_of_mini_batches)]

            outputs = []
            # matrices to store nudges of backpropogation
            b_nudges = [np.zeros(b.shape) for b in self.b]
            w_nudges = [np.zeros(w.shape) for w in self.w]
            
            if(debugPrintCycles): j = 0
            
            for mini_batch in mini_batches:
                for dat_in, dat_out in mini_batch:  

                    # forward propagation    
                    neurons,z = self.predict( dat_in, getAllLayers = True)                
                    outputs.append(neurons[-1])

                    if(debugPrintCycles):
                        if not(i% debugPrintCycles): 
                            if(j == 0):
                                print("\nResults after {} training cycles : ".format(i))
                            elif(j < 10): # print out the first 10 predictions
                                print("Expected out = ",dat_out)
                                print("Predicted out = ",outputs[-1])                                                   
                            j += 1

                    # back propagation                           
                    # get the partial derivatives for last layer
                    dz = (outputs[-1] - dat_out)         
                    b_nudges[-1] += dz
                    w_nudges[-1] += neurons[-2].T.dot(dz)

                    # get the partial derivatives for the rest of the layers
                    for L in range(2, self.num_layers):
                        dCost = dz.dot(self.w[-L+1].T) 
                        dz = self.__dSigma(z[-L]) * dCost
                        b_nudges[-L] += dz
                        w_nudges[-L] += neurons[-L-1].T.dot(dz)

                # update the gradient descents learned
                self.b = [(b-(learn_rate/mini_batch_size)*nb) for b, nb in zip(self.b, b_nudges)]
                self.w = [(w-(learn_rate/mini_batch_size)*nw) for w, nw in zip(self.w, w_nudges)]
            
            if(debugPrintCycles):
                if not (i% debugPrintCycles):
                    # print loss
                    expected = [dat_out for dat_in, dat_out in data]
                    print("Loss = {}".format(self.calcCost(outputs, expected)))
        
        print("Finished Training ! ")
            
    
    '''Saves the model parameter in out files that can be loaded later using updateModel() '''
    def saveModel(self, suffix):
        saveFile = 'nnModel_'+"_".join(str(l) for l in self.L[0:-1]) +"_" + suffix
        
        np.save(saveFile + "_w", self.w)
        for b_i, b in enumerate(self.b):
            np.save(saveFile + "_b"+ str(b_i), b)
        print(".. saved model params in {} files".format(saveFile))
        
        
    '''Method loads a previously trained model saved in npy files and updates the current parameters. '''
    def updateModel(self, suffix):        
        loadFile = 'nnModel_'+"_".join(str(l) for l in self.L[0:-1]) +"_" + suffix
        
        self.w = np.load(loadFile + "_w" + ".npy")        
        for b_i in range(len(self.b)):
            self.b[b_i] = np.load(loadFile + "_b"+ str(b_i) + ".npy")
        print(".. updated model with params from {} files".format(loadFile))

In [99]:
%%time

# 2 hidden layers with 50 nodes and 10 nodes each
# layers = [nn_in_dem, 50, 10, nn_out_dem]

# segmentor = myNNet(layers)
print(segmentor)

#train model
train_passes = 1000
# update parameters of the model from previously trained model, if any
# segmentor.updateModel(str(11))

# segmentor.train(trainingVecs, train_passes, debugPrintCycles = 333)
# #save model to save re-training time
# segmentor.saveModel(str(train_passes))

Neural Network Model containing:
layer0 = 400 Nodes (400, 50) Weights (1, 50) Biases
layer1 = 50 Nodes (50, 10) Weights (1, 10) Biases
layer2 = 10 Nodes (10, 2) Weights (1, 2) Biases
layer3 = 2 Nodes
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 139 µs


In [13]:
### TEST CODE gen for Neural Network
#generate simple data
# np.random.seed(1)
# data = []
# for i in range(1,500): 
#     in_dat = 2*np.random.random((1,2)) -1 
#     # A non-linear function to test on the Neural network 
#     if(((in_dat[0][0]**9 - np.exp(in_dat[0][1]))) > (in_dat[0][0])):
#         out_dat = np.array([1,0]).reshape(1,2)
#     else:
#         out_dat = np.array([0,1]).reshape(1,2)
        
#     data.append((in_dat,out_dat))

# training = data[:-20]
# test = data[-20:]
# test

In [14]:
### TEST CODE run for Neural Network
# testLayers = [2, 4, 2]
# testModel = myNNet(testLayers)
# print(testModel)
# testModel.train(training, passes = 2000, debugPrintCycles = 0 )
# # test the model's accuracy.
# for in_dat, out_dat in test:
#     print("\nExpected test out : {}\n Predicted test out : {}\n".format(out_dat, testModel.predict(in_dat)))

## Evaluation

In [96]:
'''Using the output vector of the Neural Network, decides if the prediction is a EDU 'Break' or not '''
def extractResult(nn_out):
    threshold  = -0.55
    if((nn_out[0] - nn_out[1]) > threshold ):
        return "B"
    else:
        return "C"

    
# Predict the EDU breaks on the test data
print("Test data size = {}".format(len(testVecs)))
print("Total EDU breaks expected = {}".format(sum([len(edu) for edu in test_EDUs])))

# count the True Positives, False Positives, True Negatives and False Negatives
TP = FP = TN = FN = 0
c=0
for in_vec, expected_vec in testVecs:
    # pass the test data through the neural network model
    predicted_vec = segmentor.predict(in_vec)    
    predicted = extractResult(predicted_vec.reshape(-1))  
    if(np.array_equal(expected_vec, B_ARRAY)):
        expected = 'B'
        c += 1
    else:
        expected = 'C'        
       
#     print("predicted = " , predicted_vec)
#     print("expected = " , expected_vec)  
#     print("----------")
    
    if((expected == 'B') and (predicted == 'B')):
        TP += 1
    elif((expected == 'B') and (predicted == 'C')):
        FN += 1
    elif((expected == 'C') and (predicted == 'B')):
        FP += 1
    elif((expected == 'C') and (predicted == 'C')):
        TN += 1
        
print("EDU breaks count again = ",c)

#recall, Precision and F1-score calculation
print("TP {},FN {},FP {},TN {}".format(TP,FN,FP,TN))

recall = TP/(TP+FN)
precision = TP/(TP+FP)
F1 = 2*recall*precision/(recall + precision)      

print("Recall = {}% \nPrecision = {}% \n F1-Score = {}% \n".format(recall*100, precision*100, F1*100))

Test data size = 20693
Total EDU breaks expected = 1454
EDU breaks count again =  1449
TP 497,FN 952,FP 679,TN 18565
Recall = 34.29951690821256% 
Precision = 42.26190476190476% 
 F1-Score = 37.866666666666674% 



In [19]:
outList[0].sum?

) learn_rate = 0.01 results in local minima too quickly. set to 0.01/num_of_training_data
) added softmax() 
) switch from square mean loss function to cross entropy loss function.
) Stochastic descent - 
without stochastic descent:
10 training cycles complete...
Cost = 0.28354821176544787
Wall time: 2min 33s

with stochastic descent:
10 training cycles complete...
Cost = 0.26668334795269005
Wall time: 2min 25s


Neural Network Model containing:
layer0 = 400 Nodes (400, 50) Weights (1, 50) Biases
layer1 = 50 Nodes (50, 10) Weights (1, 10) Biases
layer2 = 10 Nodes (10, 2) Weights (1, 2) Biases
layer3 = 2 Nodes

1500 training cycles at a learning rate 0.000005 :
F1-score 37%