## Prepare & clean the data

In [1]:
import re

# Extract error types from: "conll14st-preprocessed.conll.ann" file
# Separate each word of a particular error type in its own list.
l1=[]
l4=[]
l5=[]
l6=[]
l7=[]
l11=[]
l14=[]
def extract_err_type(colnn_ann):
    f = open(colnn_ann, 'r')
    for line in f:
        parts = line.split()
        if(len(parts)>0):
            if(line[0:6]=="<TYPE>"):
                type_ = line[6:len(parts[0])-7]
            elif(line[0:12]=="<CORRECTION>"):
                corrected_word = line[12:len(line)-14] 
                if type_ =="Vt":
                    "Vt", line[12:len(line)-14]
                    l1.append(line[12:len(line)-14])   
                if type_ =="Vform":
                    l4.append(line[12:len(line)-14])
                if type_ =="SVA":
                    l5.append(line[12:len(line)-14])
                if type_ =="ArtOrDet":
                    l6.append(line[12:len(line)-14])
                if type_ =="Nn":
                    l7.append(line[12:len(line)-14])
                if type_ =="Prep":
                    l11.append(line[12:len(line)-14])
                if type_ =="Wform":
                    l14.append(line[12:len(line)-14])

In [2]:
# Pass in to the function, the path to conll14st-preprocessed.conll.ann in your system
extract_err_type("/Users/highsierra/Tech-Skills/Labortory/ML-Fundamentals/error_types_conll14/conll14st-preprocessed.conll.ann")

In [3]:
# Create a corresponding list containing the error type of each word
# Map between each word and its error type
ll1=[]
for i in range(len(l1)):
    ll1.append('Vt')

ll4=[]
for i in range(len(l4)):
    ll4.append('Vform')

ll5=[]
for i in range(len(l5)):
    ll5.append('SVA')

ll6=[]
for i in range(len(l6)):
    ll6.append('ArtOrDet')

ll7=[]
for i in range(len(l7)):
    ll7.append('Nn')

ll11=[]
for i in range(len(l11)):
    ll11.append('Prep')


ll14=[]
for i in range(len(l14)):
    ll14.append('Wform')

In [4]:
# Separate the words (input) from the error types (target)
import itertools

targets = list(itertools.chain(ll1, ll4, ll5, ll6, ll7, ll11, ll14))
words = list(itertools.chain(l1, l4, l5, l6, l7, l11, l14))

####  P.S. Up to this point, only the representations of words have been used. No further feature engineering has been done.

## Encode the data

In [5]:
# Encode the target vector in binary 

from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
ybin = lb.fit(targets)
lb.classes_

array(['ArtOrDet', 'Nn', 'Prep', 'SVA', 'Vform', 'Vt', 'Wform'],
      dtype='<U8')

In [6]:
y = lb.transform(targets)
print(y)
print(y.shape)

[[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]
(21138, 7)


In [7]:
# Encode the input using glove embeddings   

from zeugma.embeddings import EmbeddingTransformer

glove = EmbeddingTransformer('glove') 
X = glove.transform(words)

Using TensorFlow backend.


In [8]:
print(X)
print(X.shape)

[[ 9.4602001e-04 -8.4354997e-02  1.9994000e-01 ...  2.7858001e-01
   2.0168000e-01 -7.1859002e-01]
 [ 5.7466000e-02  1.7890000e-01 -6.2778002e-01 ... -7.5557001e-02
   2.4191999e-01  2.2470001e-03]
 [-5.2309000e-01 -6.8026000e-01 -4.6072000e-01 ...  5.2221000e-01
   7.0643997e-01 -2.5231999e-01]
 ...
 [ 5.6629997e-01  2.5286999e-01 -1.1373000e+00 ...  5.4163003e-01
  -1.7024000e+00 -4.1539001e-01]
 [ 3.4362650e-01  9.2356995e-02 -6.9733500e-01 ... -3.7974998e-02
  -1.1478500e+00 -2.5225499e-01]
 [ 1.0357000e-01  5.4285997e-01 -8.8192999e-01 ... -4.9219000e-01
   1.3172001e-01 -8.6390001e-01]]
(21138, 25)


## Create training & testing sets

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Train the model

In [10]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(hidden_layer_sizes=(300,200,100), max_iter=500, activation = 'relu', solver='adam', random_state=21, tol=0.000000001)
clf.fit(X_train, y_train) 

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300, 200, 100), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=21, shuffle=True, solver='adam',
              tol=1e-09, validation_fraction=0.1, verbose=False,
              warm_start=False)

## Make predictions

In [11]:
y_pred = clf.predict(X_test)

## Evaluate the model

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


print(classification_report(y_test,y_pred,target_names=lb.classes_))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

    ArtOrDet       0.78      0.95      0.86      1319
          Nn       0.83      0.75      0.79       760
        Prep       0.94      0.68      0.79       496
         SVA       0.51      0.35      0.42       300
       Vform       0.46      0.29      0.35       266
          Vt       0.65      0.57      0.61       654
       Wform       0.62      0.45      0.52       433

   micro avg       0.74      0.69      0.72      4228
   macro avg       0.68      0.58      0.62      4228
weighted avg       0.73      0.69      0.70      4228
 samples avg       0.69      0.69      0.69      4228

0.6837748344370861


  _warn_prf(average, modifier, msg_start, len(result))
