## Preparing the data

In [1]:
import re
import numpy as np
import spacy

# Set a fixed seed for NumPy's global random number generator so that NumPy-driven randomness is reproducible.
# NOTE(review): this seeds NumPy only; whether it is sufficient for reproducible Keras results
# depends on the backend's own random number generator — confirm.
np.random.seed(42)

In [2]:
# Import the sentences from the data file.
#
# The file is read as M2 text: a line starting with "S" carries one tokenised
# sentence, and a line whose first token is "A" carries one annotation for the
# most recently read sentence, laid out as
#   A <start> <end>|||<error type>|||<correction>|||...
# TODO: this is an absolute, machine-specific path; point it at a configurable data directory.
M2_PATH = '/Users/highsierra/Tech-Skills/Unorganised/conll2014-master/release3.2/data/conll14st-preprocessed.m2'

sentences = []    # cleaned sentence strings, one per "S" line
words = []        # every token of every sentence, flattened in file order
Stat = []         # number of tokens in each sentence

word_tags = []    # one object array per sentence: the target output of each word
error_tags = ["ArtOrDet","Nn","Vt","Prep","Vform","Wform","SVA"] # the seven grammatical errors' tags
sen_position = 0  # number of sentences read so far


def _parse_annotation(line):
    """Return ``(end_offset, error_type)`` for an "A" line, or ``None`` if it is malformed."""
    fields = line.split("|||")
    head = fields[0].split()          # expected: ['A', '<start>', '<end>']
    if len(fields) < 2 or len(head) != 3:
        return None
    try:
        end_offset = int(head[2])
    except ValueError:
        return None
    return end_offset, fields[1].strip()


# The context manager guarantees the file handle is closed, even on error.
with open(M2_PATH) as f:
    for line in f:
        parts = line.split()
        if not parts:
            continue

        if line[0] == 'S':
            # Initial Inputs Processing

            # Perform basic cleaning (drop the "S " marker and the newline)
            cleanSen = re.sub(r"\n", '', line[2:])
            tokens = cleanSen.split()

            # Keep track of the sentences' lengths
            Stat.append(len(tokens))

            # Create a list of sentences
            sentences.append(cleanSen)

            # Extend the one-dimensional list of input words in place
            # (rebuilding the list with `+` on every sentence is quadratic).
            words.extend(tokens)

            # Initial Outputs Processing

            # By default every word is non-erroneous ("Correct").
            # dtype=object is required: a fixed-width string dtype sized for
            # "Correct" would truncate the longer tag "ArtOrDet".
            # The array is sized from the same token list that feeds `words`,
            # so tags and words always stay aligned.
            tags = np.full(len(tokens), "Correct", dtype=object)
            word_tags.append(tags)

            # Keep track of the sentence's position
            sen_position += 1

        elif parts[0] == 'A':
            annotation = _parse_annotation(line)
            if annotation is None or not word_tags:
                # Malformed annotation, or an annotation before any sentence.
                continue

            end_offset, error_type = annotation
            # Compare the error-type field exactly; a substring search over the
            # raw text would also match correction words such as "Preparation".
            if error_type not in error_tags:
                continue

            current_tags = word_tags[-1]
            if len(current_tags) == 0:
                continue

            # Tag the last token of the annotated span (end offset - 1).
            # The full integer offset is used (not just its first two characters),
            # and the index is clamped so an end offset of 0 cannot wrap around
            # to the last word of the sentence.
            target = min(max(end_offset - 1, 0), len(current_tags) - 1)
            current_tags[target] = error_type

In [3]:
# Generate pos tags for the sentences

# Scratch list from the loop-based version; it ends up empty there as well.
POSTags = []

nlp = spacy.load("en_core_web_sm")

# One list per sentence holding spaCy's `pos_` label of every token in it.
sentences_POSTags = [
    [token.pos_ for token in nlp(text)]
    for text in sentences
]

In [4]:
# Find the sentence of longest length (measured in POS tags)

SenStat = [len(pos_tags) for pos_tags in sentences_POSTags]
LongestSen = max(SenStat)


# Right-pad every shorter sentence with the filler tag 'SPACE' up to the max length

for i in range(len(sentences)):
    shortfall = LongestSen - len(sentences_POSTags[i])
    if shortfall > 0:
        sentences_POSTags[i].extend(['SPACE'] * shortfall)

In [5]:
# Convert the textual features (POS Tags) into numerical features by using the one-hot encoding technique

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Rows are sentences, columns are token positions (every row was padded to the same length).
encoded_sen_POSTags = np.array(sentences_POSTags)

# One-hot encode every column. The column list is derived from the data instead of
# being spelled out by hand, so it follows the longest sentence automatically.
pos_columns = list(range(encoded_sen_POSTags.shape[1]))
transformer = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), pos_columns)],
    remainder='passthrough',
)

encoded_sen_POSTags = transformer.fit_transform(encoded_sen_POSTags)

# The transformer may return either a sparse matrix or a dense array;
# densify only when there is something to densify.
if hasattr(encoded_sen_POSTags, "toarray"):
    encoded_sen_POSTags = encoded_sen_POSTags.toarray()

In [6]:
# Duplicate each sentence's encoded POS row once per word of that sentence,
# so the sentence-level features line up row-for-row with the input words.
# `Stat[k]` is the number of words in sentence k; np.repeat does the copying in
# one vectorised call and raises if `Stat` and the encoded rows differ in length.

duplicated_sen_POSTags = np.repeat(
    np.asarray(encoded_sen_POSTags),
    np.asarray(Stat, dtype=int),
    axis=0,
)

In [7]:
# Encode the input words with zeugma's 'glove' EmbeddingTransformer.
# `words` is the flat token list, so every token is passed in as its own item.
# NOTE(review): presumably this returns one embedding row per item of `words`; the later
# column-wise concatenation with the per-word POS features relies on matching row counts — confirm.

from zeugma.embeddings import EmbeddingTransformer

glove = EmbeddingTransformer('glove') 
encoded_words = glove.transform(words)

Using TensorFlow backend.


In [8]:
# Align the input words with their original sentence features:
# join the two arrays column-wise, so each row holds the values from `encoded_words`
# followed by the values from `duplicated_sen_POSTags` for the same row.
# Both arrays must therefore have the same number of rows.

input_ = np.concatenate((encoded_words,duplicated_sen_POSTags),axis=1)

In [9]:
# Flatten the per-sentence tag arrays into a single list, in sentence order,
# so that the tags align with the flat list of input words

flattened_word_tags = [
    tag
    for sentence_tags in word_tags
    for tag in sentence_tags
]

In [10]:
# Encode the target outputs using one-hot encoding technique

from sklearn import preprocessing

# Learn the set of tag classes, then binarize the very same labels
# (one column per class).
lb = preprocessing.LabelBinarizer().fit(flattened_word_tags)
output_ = lb.transform(flattened_word_tags)

In [11]:
# Sanity check: features and targets must have the same number of rows
for matrix in (input_, output_):
    print(matrix.shape)

(1161567, 2110)
(1161567, 8)


In [12]:
# Split the data into training 80% and testing 20%.
# random_state=0 fixes the split so it is the same on every run.
# NOTE(review): each row is a single word, and all words of a sentence share the same
# sentence-level POS features, so a row-wise split can place words of one sentence in
# both the training and the test set — consider splitting by sentence; confirm intent.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(input_, output_, test_size=0.2, random_state=0)

## Training the model

In [13]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import SpatialDropout1D

In [14]:
# Reshape to 3-D (samples, 1, width): every word becomes a sequence of length one.
# The last dimension is taken from each array rather than hard-coded, so a change in
# the number of features or classes cannot silently regroup values across rows.
X_train = X_train.reshape(-1, 1, X_train.shape[-1])
X_test  = X_test.reshape(-1, 1, X_test.shape[-1])
y_train = y_train.reshape(-1, 1, y_train.shape[-1])
y_test = y_test.reshape(-1, 1, y_test.shape[-1])

In [15]:
# The width of the output layer follows the one-hot encoded targets
# (last dimension of y_train) instead of a hard-coded class count.
n_classes = y_train.shape[-1]

# Two stacked LSTM layers followed by a softmax classifier.
# return_sequences=True keeps the time axis, matching the 3-D targets.
model = Sequential()
model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(Dense(n_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [16]:
# Train on the training split for 3 epochs with batches of 64 samples.
model.fit(X_train, y_train, epochs=3, batch_size=64) 

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x1a58322050>

## Testing the model

In [17]:
# Evaluate on the held-out test split. The model was compiled with metrics=['accuracy'],
# and scores[1] is reported below as the accuracy.
scores = model.evaluate(X_test, y_test, verbose=0)
# NOTE(review): every word defaults to the tag "Correct", so overall accuracy is presumably
# dominated by that class — confirm with per-class precision/recall before trusting this number.
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 98.19%
