# A simple feed forward model

```




```
- [Read data](#Read-data)
- [Prepare data](#Prepare-data)
- [Create and train model](#Create-and-train-model)
- [Test on unseen data](#Test-on-unseen-data)

In [1]:
import os
import sys
import json
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, train_test_split
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


In [2]:
# Project-relative locations of the helper modules, saved models and input data.
MODULES_PATH = '../modules'
MODELS_PATH = '../models'
DATA_PATH = '../data'

# Make the project's helper modules importable; this has to run before the
# local imports below.
sys.path.append(MODULES_PATH)
from data import (flatten_data, prepare_training_data, prepare_test_data,
                  raise_one_level)
# parameter_ffn and simple_ffn are called by later cells but were never
# imported, so a fresh "Restart & Run All" stopped with a NameError.
# NOTE(review): assumes both live in `models` next to parameter_ffn_seq — confirm.
from models import parameter_ffn, parameter_ffn_seq, simple_ffn

## Read data

In [3]:
# Load the raw sentence data. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so the read does not depend on the platform default.
sentences_path = os.path.join(DATA_PATH, 'sentences.json')
with open(sentences_path, 'r', encoding='utf-8') as datafile:
    sentences = json.load(datafile)

In [4]:
# Read the training table from the shared data directory (DATA_PATH) instead
# of repeating the '../data' literal, so the location is configured in one place.
documents = pd.read_csv(os.path.join(DATA_PATH, 'training_data.csv'))

## Prepare data

In [5]:
# Pass the loaded JSON through raise_one_level, then tabulate the result.
sentences_flat = raise_one_level(sentences)
sentences_df = pd.DataFrame(data=sentences_flat)

In [6]:
# Hold out 25% of the rows for the final evaluation; the fixed random_state
# keeps the split reproducible between runs.
corpora_train, corpora_test, labels_train, labels_test = train_test_split(
    sentences_df['body'],
    sentences_df['class'],
    test_size=0.25,
    random_state=123,
)

In [7]:
# Collect one prepared training set per loop value; (i, i) is handed to
# prepare_training_data as its third argument.
# NOTE(review): range(1, 2) yields only i = 1 (and the loop prints i + 1),
# while the saved outputs of later cells list three matrices — confirm the
# intended range.
# NOTE(review): document_matrix, labels and pipeline_instance stay bound after
# the loop and are reused by later cells.
training_data = []
for i in range(1, 2):
    print(i + 1)

    document_matrix, labels, pipeline_instance = prepare_training_data(
        corpora_train, labels_train, (i, i))
    training_data.append(dict(document_matrix=document_matrix,
                              labels=labels,
                              pipeline_instance=pipeline_instance))

2


In [30]:
# Inspect the first prepared set: matrix shape, label shape and pipeline object.
first_set = training_data[0]
(first_set['document_matrix'].shape, first_set['labels'].shape, first_set['pipeline_instance'])

((10139, 68), (10139, 11), Pipeline(memory=None,
      steps=[('vect', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True))]))

In [29]:
# Report the feature-matrix dimensions of every prepared set.
for prepared in training_data:
    print(prepared['document_matrix'].shape)

(10139, 68)
(10139, 2161)
(10139, 21082)


In [14]:
# Serialise pipeline_instance (the value left bound by the training-data loop)
# into the data directory.
pipeline_path = os.path.join(DATA_PATH, 'pipeline_instance.pickle')
with open(pipeline_path, 'wb') as datafile:
    pickle.dump(pipeline_instance, datafile)

## Create and train model

In [8]:
# Wrap the parameter_ffn_seq factory in the scikit-learn compatible Keras
# wrapper so it can be used as an estimator.
ffn = KerasClassifier(
    build_fn=parameter_ffn_seq,
    verbose=1,
)

In [9]:
# Hyper-parameter grid for the search: every value is a list of candidates.
# 'layers' pairs each width with depths of one to three equally sized layers,
# in the same order the former nested append loop produced.
# The column counts are read straight from .shape; the previous .iloc[0:5]
# row slices did not affect them.
parameters = {
    'layers': [[width] * depth
               for width in [64, 128, 256, 512, 1024, 2048]
               for depth in range(1, 4)],
    'activations': [['relu']],
    'dropout': [[0.05], [0.15], [0.25]],
    'attention': [128],
    'input_shape': [document_matrix.shape[1]],
    'nb_classes': [labels.shape[1]],
}

In [11]:
# Display the assembled hyper-parameter grid.
parameters

{'activations': [['relu']],
 'attention': [128],
 'dropout': [[0.05], [0.15], [0.25]],
 'input_shape': [68],
 'layers': [[64],
  [64, 64],
  [64, 64, 64],
  [128],
  [128, 128],
  [128, 128, 128],
  [256],
  [256, 256],
  [256, 256, 256],
  [512],
  [512, 512],
  [512, 512, 512],
  [1024],
  [1024, 1024],
  [1024, 1024, 1024],
  [2048],
  [2048, 2048],
  [2048, 2048, 2048]],
 'nb_classes': [11]}

In [12]:
# Grid search over `parameters` for the wrapped estimator, using three
# parallel jobs.
ffn_grid = GridSearchCV(
    estimator=ffn,
    param_grid=parameters,
    n_jobs=3,
    verbose=1,
)

In [13]:
# Run the search on document_matrix / labels; epochs=100 is passed to fit()
# as an extra keyword argument.
grid_result = ffn_grid.fit(document_matrix, labels, epochs=100)

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  4.0min
[Parallel(n_jobs=3)]: Done 162 out of 162 | elapsed: 30.2min finished


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
# Tabulate the search results and rank them, best mean test score first.
cv_results = pd.DataFrame(grid_result.cv_results_)
cv_results.sort_values(by='mean_test_score', ascending=False)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_activations,param_attention,param_dropout,param_input_shape,param_layers,param_nb_classes,...,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
13,42.768974,1.487677,0.797,0.946,[relu],128,[0.05],68,"[1024, 1024]",11,...,0.814371,0.945946,0.783784,0.938531,0.792793,0.953523,0.059467,0.085348,0.012839,0.006121
52,100.736751,4.622157,0.795,0.926,[relu],128,[0.25],68,"[2048, 2048]",11,...,0.820359,0.926426,0.795796,0.917541,0.768769,0.934033,0.294189,0.500056,0.021074,0.006739
17,223.831091,2.026304,0.792,0.966505,[relu],128,[0.05],68,"[2048, 2048, 2048]",11,...,0.808383,0.977477,0.78979,0.961019,0.777778,0.961019,0.348028,0.060322,0.012595,0.007758
16,121.86095,1.74515,0.792,0.963001,[relu],128,[0.05],68,"[2048, 2048]",11,...,0.811377,0.965465,0.795796,0.967016,0.768769,0.956522,0.333983,0.029222,0.017605,0.004625
34,121.558473,3.723957,0.791,0.948996,[relu],128,[0.15],68,"[2048, 2048]",11,...,0.799401,0.941441,0.792793,0.946027,0.780781,0.95952,6.145709,0.740501,0.007708,0.007673
35,214.099664,4.318652,0.789,0.951494,[relu],128,[0.15],68,"[2048, 2048, 2048]",11,...,0.787425,0.93994,0.810811,0.95952,0.768769,0.955022,2.16775,0.806629,0.017191,0.008374
49,24.86237,3.771168,0.788,0.876497,[relu],128,[0.25],68,"[1024, 1024]",11,...,0.802395,0.870871,0.780781,0.88006,0.780781,0.878561,0.54279,0.303048,0.010194,0.004025
53,174.396906,5.391669,0.784,0.909499,[relu],128,[0.25],68,"[2048, 2048, 2048]",11,...,0.790419,0.906907,0.774775,0.911544,0.786787,0.910045,0.195293,0.83715,0.006685,0.001932
32,48.32623,2.711005,0.784,0.907499,[relu],128,[0.15],68,"[1024, 1024, 1024]",11,...,0.793413,0.905405,0.795796,0.904048,0.762763,0.913043,0.101066,0.19289,0.015037,0.00396
14,68.088104,1.612082,0.781,0.958498,[relu],128,[0.05],68,"[1024, 1024, 1024]",11,...,0.778443,0.954955,0.801802,0.953523,0.762763,0.967016,0.264536,0.111811,0.016032,0.006052


In [15]:
# Build one model with hand-picked settings and print its layer summary.
model = parameter_ffn(
    document_matrix,
    labels,
    layers=[128, 128, 128],
    activations=['relu'],
    dropout=[0.15],
    attention=128,
)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 68)           0                                            
__________________________________________________________________________________________________
dense_17 (Dense)                (None, 128)          8832        input_6[0][0]                    
__________________________________________________________________________________________________
activation_17 (Activation)      (None, 128)          0           dense_17[0][0]                   
__________________________________________________________________________________________________
dropout_12 (Dropout)            (None, 128)          0           activation_17[0][0]              
__________________________________________________________________________________________________
batch_norm

In [35]:
# Train one simple_ffn model per prepared training set, keeping 10% of each
# set aside for validation during fitting.
models = []
for prepared in training_data:
    features, targets = prepared['document_matrix'], prepared['labels']
    print(features.shape)
    model = simple_ffn(features, targets)
    model.fit(features, targets, epochs=100, validation_split=0.1)
    models.append(model)

(10139, 68)
Train on 9125 samples, validate on 1014 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100


Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100

Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100


Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [23]:
# Persist `model` under the models directory.
# NOTE(review): `model` is whatever the training loop bound last, while the
# filename says "bi" — confirm the intended model is being saved.
model_path = os.path.join(MODELS_PATH, 'ffn_sample_model_sentences_bi.h5')
model.save(model_path)

## Test on unseen data 

In [36]:
# Prepare the held-out split once per training set, handing
# prepare_test_data that set's own pipeline_instance.
test_data = []
for prepared in training_data:
    X_test, y_test = prepare_test_data(
        corpora_test, labels_test, prepared['pipeline_instance'])
    test_data.append(dict(X_test=X_test, y_test=y_test))

In [38]:
# Preview the first rows of the first prepared test feature matrix.
test_data[0]['X_test'].head()

Unnamed: 0,Unnamed: 1,!,"""",#,$,%,&,',(,),...,u,v,w,x,y,z,{,|,},~
0,0.351427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080407,0.080444,...,0.198643,0.0,0.16834,0.118161,0.178917,0.0,0.0,0.0,0.0,0.0
1,0.531607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.037561,0.0,0.038198,0.268115,0.121792,0.0,0.0,0.0,0.0,0.0
2,0.47093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038955,0.054692,0.066025,0.0,0.014035,0.0,0.0,0.0,0.0,0.0
3,0.231594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.087272,0.0,0.266251,0.0,0.188653,0.151879,0.0,0.0,0.0,0.0
4,0.380765,0.0,0.0,0.0,0.0,0.0,0.0,0.075985,0.06534,0.06537,...,0.336291,0.018886,0.054718,0.0,0.087234,0.07023,0.0,0.0,0.0,0.0


In [39]:
# Preview the first rows of the matching test labels.
test_data[0]['y_test'].head()

Unnamed: 0,af,en,nr,nso,ss,st,tn,ts,ve,xh,zu
34750,0,0,0,0,0,0,0,0,0,1,0
18986,0,0,0,1,0,0,0,0,0,0,0
13655,0,1,0,0,0,0,0,0,0,0,0
15126,0,0,1,0,0,0,0,0,0,0,0
21978,0,0,0,0,1,0,0,0,0,0,0


In [42]:
# Evaluate each trained model on the test set prepared with the same pipeline.
for idx, held_out in enumerate(test_data):
    # evaluate() is unpacked as (score, accuracy); only the accuracy is reported.
    score, accuracy = models[idx].evaluate(held_out['X_test'], held_out['y_test'])
    # Scale to a percentage first, then round: rounding before multiplying by
    # 100 printed float noise such as 75.77000000000001.
    print('Model test accuracy', round(accuracy * 100, 2))

Model test accuracy 77.2
Model test accuracy 75.77000000000001
Model test accuracy 71.95
