In [1]:
import tensorflow as tf
import keras 
from tqdm import tqdm
import re
import pandas as pd 
import numpy as np
import random as rn
print(keras.__version__)
print(tf.__version__)

# Use a fixed seed for the random number generator to address randomness problem and get reproducable results with keras. the numbers don't make much difference.
seed = 42
np.random.seed(seed)

2.3.1
1.15.0


Using TensorFlow backend.


## Preprocessing
1. Prepare a dataframe with: word, sentence #, tag
2. Group words with their tags within their sentence
3. Encode words and their tags. Then pad the sequences aka create a multi-label and multi-output classification dataset

In [2]:
# Import the sentences from the data file
f=open('/Users/highsierra/Tech-Skills/Unorganised/conll2014-master/release3.2/data/conll14st-preprocessed.m2')

sentences = []
words = []
Stat = []


word_tags=[] # the target output of each word 
error_tags = ["ArtOrDet","Nn","Vt","Prep","Vform","Wform","SVA"] # the seven grammatical errors' tags 
sen_position=0 


from tqdm import tqdm
        
for line in tqdm(f):
    parts = line.split()
    if(len(parts)>0):
        if line[0]=='S':
        # Initial Inputs Processing 
            # Perform basic cleaning
            cleanSen = re.sub(r"\n", '', line[2:])
            
            # Keep track of the sentences' lengths
            Stat.append(len(cleanSen.split()))
            
            # Create a list of sentences
            sentences.append(cleanSen)
            
            # Create a one-dimensional array of input words 
            words = words + cleanSen.split()
            
            
        # Initial Outputs Processing  
            
            # By default, consider every word as non-erroneous, by creating an array with the tag "Correct" per every word.
            tags=np.empty(shape=(len(parts)-1), dtype=object)
            tags = np.where(tags==None, "Correct", tags)
            # Combine the tags associated with each sentence vertically in order to allign them with the input words
            word_tags.append(tags)
            # Keep track of the sentence's position
            sen_position += 1
            
        elif parts[0]=='A':
            if re.findall("ArtOrDet", parts[2]) or re.findall("Nn", parts[2]) or re.findall("Vt", parts[2]) or re.findall("Prep", parts[2]) or re.findall("Vform", parts[2]) or re.findall("Wform", parts[2]) or re.findall("SVA", parts[2]):
                # Keep track of the erroneous word's position by extracting it from the sentence annotation 
                digit = [int(j) for j in re.findall("[0-9]+", parts[2][:2])]            
                  
                # Extract the erroneous words tag
                for tag in error_tags:
                    if  re.search(tag, parts[2]):
                        err = re.findall(tag, parts[2])
                        
                # Using its extracted position, place the erroneous word's tag in its sentence
                word_tags[sen_position - 1][digit[0]-1] = err[0]
            
print(len(sentences))
print(len(words))

158784it [11:27, 230.80it/s]

57151
1161567





In [3]:
# Convert tags to a one-dimensional array in order to allign with input words
flattened_word_tags = []
for sen in word_tags:
    
    for tag in sen:
        flattened_word_tags.append(tag)
#print(len(flattened_word_tags))

In [4]:
# Create a label of the sentence's number for each word

i = 1
numbers = []
for sen in sentences:
    for word in range(len(sen.split())):
        numbers.append("Sentence: " + str(i))
    i+=1

In [5]:
print(len(numbers))
print(len(words))
print(len(flattened_word_tags))

1161567
1161567
1161567


In [6]:
data = pd.DataFrame(list(zip(numbers, words, flattened_word_tags)), 
               columns =['Sentence #','Word', 'Tag']) 
data 

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,CREATING,Correct
1,Sentence: 1,A,Correct
2,Sentence: 1,HABITABLE,Correct
3,Sentence: 1,ENVIRONMENT,Correct
4,Sentence: 2,Humans,Correct
...,...,...,...
1161562,Sentence: 57151,2009,Correct
1161563,Sentence: 57151,from,Correct
1161564,Sentence: 57151,http,Correct
1161565,Sentence: 57151,:,Correct


In [7]:
# For counting only
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words); 
print(n_words)

tags = list(set(data["Tag"].values))
n_tags = len(tags); 
print(n_tags)

33763
8


In [8]:
#SentenceGetter retrieves sentences with their labels.

class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [9]:
getter = SentenceGetter(data)
sent = getter.get_next()
sentences = getter.sentences
print(len(sentences))

57151


In [10]:
leng = []
for sen in sentences:
    leng.append(len(sen))
print(max(leng))

222


In [11]:
corpus_tokens = list(set(data["Word"].values))
corpus_tokens.append("ENDPAD")

#print(corpus_tokens)
print(len(corpus_tokens))

Idx = range(0, len(corpus_tokens))

33763


In [12]:
word2idx = pd.DataFrame(list(zip(corpus_tokens, Idx)), 
               columns =['Word', 'No']) 
word2idx 

Unnamed: 0,Word,No
0,1.5,0
1,covers,1
2,//www.technologyreview.com/infotech/18537/,2
3,//news.bbc.co.uk/2/hi/programmes/from_our_own_...,3
4,contrast,4
...,...,...
33758,executing,33758
33759,Table,33759
33760,homeowners,33760
33761,out-break,33761


In [13]:
indexed=[(w, n) for w, n in zip(word2idx["Word"].values.tolist(), word2idx["No"].values.tolist())]

In [14]:
from tqdm import tqdm


XX=[]
X=[]

for s in tqdm(sentences):
    XX=[]
    for w in s:
        for I in indexed:
            if I[0] == w[0]:
                XX.append(I[1])
                break
    X.append(XX)

100%|██████████| 57151/57151 [1:36:11<00:00,  9.90it/s]  


In [15]:
print(word2idx.loc[word2idx['Word'] == "ENDPAD"])
#We pad the sentences with the index value of "ENDPAD", which is: 33762

         Word     No
33762  ENDPAD  33762


In [16]:
from keras.preprocessing.sequence import pad_sequences
max_len = max(leng) + 1 # or 223

X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=33762)#pad with last word of index 
#n_words-1 is the index of “ENDPAD” in word2idx. Cleaner would be to use word2idx[“ENDPAD”] as value.

In [17]:
labels= {'ArtOrDet':0, 'Correct':1, 'Nn':2, 'Prep':3, 'SVA':4, 'Vform':5, 'Vt':6, 'Wform':7}

yy=[]
y=[]

for s in tqdm(sentences):
    yy=[]
    #print(s)
    for w in s:
        #print(w)
        for key in labels:
            if w[1] == key:
                yy.append(labels.get(key))
    y.append(yy)
print(len(y))

100%|██████████| 57151/57151 [00:01<00:00, 45068.29it/s]

57151





In [18]:
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=1)

In [19]:
print(X.shape)
print(y.shape)

(57151, 223)
(57151, 223)


## Modeling

In [20]:
# Partition sentences it into training and test sets: 80%, 20%

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=seed)

In [21]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout,Reshape, SimpleRNN, Bidirectional
from keras.utils import to_categorical

### Example of the experiments or runs using a SimpleRNN architecture:

Experiments for: Finding the optimal Dropout value from the list [0, 0.1, 0.2]
#### Run 1:  (a.k.a this one)
###### The Variable: 
    Dropout = 0 
    
###### Defaults:
    Number of layers = 1
    Number of units or neurons within a layer = 50
    Embedding size = 50
    Batch size = 32
    
###### Constants:
       Recurrent Dropout = 0.2
       
#### Run 2:  (a.k.a the next execution)
###### The Variable: 
    Dropout = 0.1 
    
###### Defaults:
    same as previous values
    
###### Constants:
    same as previous values
     
#### Run 3:  (a.k.a the final execution, or the experiment using the last proposed value for Dropout)
###### The Variable: 
    Dropout = 0.2 
    
###### Defaults:
    same as previous values
    
###### Constants:
    same as previous values

In [22]:
def Model_(x_tr, y_tr, epos=3, my_batch_size=32):  
    input = Input(shape=(max_len,)) # This returns a tensor. The comma is necessary when you have only one dimension.
    model = Dropout(0)(model)
    model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len, name="Embedding")(input) # output_dim if 50 then the nw will learn 50-dimentional embeddings for each word. # This embedding layer will encode the input sequence into a sequence of dense 50-dimensional vectors.
    model = SimpleRNN(units=50, return_sequences=True, recurrent_dropout=0.2)(model)
    out = TimeDistributed(Dense(n_tags, activation='softmax'))(model)
    model = Model(input, out)
    
    return model


model = Model_(X_train,  y_train)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [23]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [24]:
#For training and testing the network we also need to change the labels y to categorial.

ycat_train = to_categorical(y_train, num_classes=n_tags)
ycat_test = to_categorical(y_test, num_classes=n_tags)

## Evaluation
Calculate the following metrics:
1. Accuracy
2. Precision
3. Recall
4. F1 

In [26]:
model.fit(X_train, ycat_train, epochs=3, batch_size=32, verbose=1) 
#epoch costs 5 mins

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x64f5fd290>

In [27]:
print(model.metrics_names)

['loss', 'accuracy']


In [28]:
scores = model.evaluate(X_test, ycat_test, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 99.84%


In [29]:
pred = model.predict(X_test, verbose=1) 
print(pred.shape)

(11431, 223, 8)


In [30]:
y_pred = np.argmax(pred,axis=-1)

In [31]:
from sklearn.metrics import precision_score, recall_score, f1_score


pres_score = []

for tru,pred in zip (y_test, y_pred):
    pres_score.append(precision_score(tru,pred,average='macro'))

precision = np.mean(pres_score)
print(precision)

  _warn_prf(average, modifier, msg_start, len(result))


0.8658826787051885


In [32]:
rec_score = []

for tru,pred in zip (y_test, y_pred):
    rec_score.append(recall_score(tru,pred,average='macro'))

recall= np.mean(rec_score)
print(recall)

  _warn_prf(average, modifier, msg_start, len(result))


0.8664763785993718


In [33]:
# Manually, it's calculated according to the formula:  f1 = (2 * precision * recall) / (precision + recall)
f_score = []

for tru,pred in zip (y_test, y_pred):
    f_score.append(f1_score(tru,pred,average='macro'))
f1 = np.mean(f_score)
print(f1)

0.8661666644676528
