In [12]:
import tensorflow as tf
import keras 
from tqdm import tqdm
import re
import pandas as pd 
import numpy as np
import random as rn
import os
print(keras.__version__)
print(tf.__version__)

# Use a fixed seed for the random number generator to address randomness problem and get reproducable results with keras. the numbers don't make much difference.
seed = 42
np.random.seed(seed)

2.3.1
1.15.0


## Preprocessing

In [2]:
from tqdm import tqdm


sentences=[]
words = []
targets=[] 
sen_position=0 

labels= {'Correct':0, 'ArtOrDet':1, 'Nn':2, 'Prep':3, 'SVA':4, 'Vform':5, 'Vt':6, 'Wform':7}

 
# Import the sentences from the data file
f=open('/Users/highsierra/Tech-Skills/Unorganised/conll2014-master/release3.2/data/conll14st-preprocessed.m2')
for line in tqdm(f):
    parts = line.split()
    if(len(parts)>0):
        if line[0]=='S':
            # Cleaning
            line = line[2:].strip()
            
            # List of sentences
            sentences.append(line)

            # List of words, for the vocabulary info
            words = words + line.split()

            
            # Initial Outputs Processing  
            
            # By default, consider every word as non-erroneous, by creating an array with the tag "Correct" per every word.
            tags=np.zeros(shape=(len(parts)-1), dtype='int32')
            # Combine the tags associated with each sentence vertically in order to allign them with the input words
            targets.append(tags)
            # Keep track of the sentence's position
            sen_position += 1
            
        elif parts[0]=='A':
            if re.findall("ArtOrDet", parts[2]) or re.findall("Nn", parts[2]) or re.findall("Vt", parts[2]) or re.findall("Prep", parts[2]) or re.findall("Vform", parts[2]) or re.findall("Wform", parts[2]) or re.findall("SVA", parts[2]):
                # Keep track of the erroneous word's position by extracting it from the sentence annotation 
                digit = [int(j) for j in re.findall("[0-9]+", parts[2][:2])]            
    
                for key in labels:
                    if  re.search(key, parts[2]):
                        err = labels.get(key)
                 
                # Using its extracted position, place the erroneous word's tag in its sentence
                targets[sen_position - 1][digit[0]-1] = err

158784it [12:35, 210.27it/s]


In [3]:
sen_len = []
for sen in sentences:
    sen_len.append(len(sen.split()))
    
MAX_SEQUENCE_LENGTH = max(sen_len)
print(MAX_SEQUENCE_LENGTH) #print(leng.index(max(sen_len)))

222


In [4]:
MAX_VOCAB_SIZE = len(set(words))
print(MAX_VOCAB_SIZE)

33762


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='\t\n')
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
word2idx = tokenizer.word_index

In [7]:
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', value=0)

In [8]:
y = pad_sequences(targets, maxlen=MAX_SEQUENCE_LENGTH, padding='post', value=0)

In [9]:
print(X.shape)
print(y.shape)

(57151, 222)
(57151, 222)


In [None]:
#Expirement Control Parameters:
UNITS=50  #25 #100
EMBEDDING_DIM = 50  #25 #100
BATCH_SIZE = 32   #16 #64
DROPOUT_VAL = 0  #0.1 #0.2 

#Expirement Constants:
EPOCHS = 3
RECURRENT_DROPOUT_VAL = 0.2

In [13]:
word2vec = {}
with open(os.path.join('/Users/highsierra/Tech-Skills/Labortory/ML-Fundamentals/glove.6B/glove.6B.%sd.txt' % EMBEDDING_DIM)) as f:
    for line in tqdm(f):
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec

400000it [00:07, 50594.03it/s]


In [14]:
embedding_matrix = np.zeros((MAX_VOCAB_SIZE,EMBEDDING_DIM))
for word, i in tqdm(word2idx.items()):
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

100%|██████████| 29347/29347 [00:00<00:00, 333882.81it/s]


In [16]:
print("embedding_matrix shape:", embedding_matrix.shape)

embedding_matrix shape: (33762, 50)


## Modeling

In [19]:
# Partition sentences it into training and test sets: 80%, 20%

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=seed)

In [20]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout,Reshape, SimpleRNN, Bidirectional
from keras.utils import to_categorical

In [22]:
def Model_(x_tr, y_tr, epos=3, my_batch_size=BATCH_SIZE):  
    input = Input(shape=(MAX_SEQUENCE_LENGTH,)) # This returns a tensor. The comma is necessary when you have only one dimension.
    model = Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)(input)    
    model = SimpleRNN(units=UNITS, return_sequences=True, recurrent_dropout=RECURRENT_DROPOUT_VAL)(model)
    model = Dropout(DROPOUT_VAL)(model)
    out = TimeDistributed(Dense(8, activation='softmax'))(model)
    model = Model(input, out)
    
    return model

In [23]:
model = Model_(X_train,  y_train)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [24]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [25]:
#For training and testing the network we also need to change the labels y to categorial.

ycat_train = to_categorical(y_train, num_classes=8)
ycat_test = to_categorical(y_test, num_classes=8)

In [26]:
model.fit(X_train, ycat_train, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1) 
#epoch costs 4 min

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x1a8213d650>

## Evaluation
Calculate the following metrics:
1. Accuracy
2. Precision
3. Recall
4. F1 

In [27]:
print(model.metrics_names)

['loss', 'accuracy']


In [28]:
scores = model.evaluate(X_test, ycat_test, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 99.83%


In [29]:
pred = model.predict(X_test, verbose=1) 
print(pred.shape)

(11431, 222, 8)


In [30]:
y_pred = np.argmax(pred,axis=-1)

In [31]:
from sklearn.metrics import precision_score, recall_score, f1_score


pres_score = []

for tru,pred in zip (y_test, y_pred):
    pres_score.append(precision_score(tru,pred,average='macro'))

precision = np.mean(pres_score)
print(precision)

  _warn_prf(average, modifier, msg_start, len(result))


0.8581743871908131


In [32]:
rec_score = []

for tru,pred in zip (y_test, y_pred):
    rec_score.append(recall_score(tru,pred,average='macro'))

recall= np.mean(rec_score)
print(recall)

  _warn_prf(average, modifier, msg_start, len(result))


0.8588234182027363


In [33]:
# Manually, it's calculated according to the formula:  f1 = (2 * precision * recall) / (precision + recall)
f_score = []

for tru,pred in zip (y_test, y_pred):
    f_score.append(f1_score(tru,pred,average='macro'))
f1 = np.mean(f_score)
print(f1)

0.8584975271246401


### Example of the experiments or runs using a SimpleRNN architecture:

Experiments for: Finding the optimal Dropout value from the list [0, 0.1, 0.2]
#### Run 1:  (a.k.a this one)
###### The Variable: 
    Dropout = 0 
    
###### Defaults:
    Number of layers = 1
    Number of units or neurons within a layer = 50
    Embedding size = 50
    Batch size = 32
    
###### Constants:
       Recurrent Dropout = 0.2
       
#### Run 2:  (a.k.a the next execution)
###### The Variable: 
    Dropout = 0.1 
    
###### Defaults:
    same as previous values
    
###### Constants:
    same as previous values
     
#### Run 3:  (a.k.a the final execution, or the experiment using the last proposed value for Dropout)
###### The Variable: 
    Dropout = 0.2 
    
###### Defaults:
    same as previous values
    
###### Constants:
    same as previous values