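# A simple feed-forward model

This notebook trains simple feed-forward networks on character n-gram TF-IDF features of sentences and evaluates them on held-out data.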

```




```
- [Read data](#Read-data)
- [Prepare data](#Prepare-data)
- [Create and train model](#Create-and-train-model)
- [Test on unseen data](#Test-on-unseen-data)

In [1]:
import os
import sys
import json
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [5]:
# Project-relative locations of the helper modules, saved models and input data.
MODULES_PATH = '../modules'
MODELS_PATH = '../models'
DATA_PATH = '../data'

# Make the local helper modules importable. The membership check keeps this
# cell idempotent: re-running it does not add duplicate entries to sys.path.
if MODULES_PATH not in sys.path:
    sys.path.append(MODULES_PATH)
from data import flatten_data, prepare_training_data, prepare_test_data, \
                    raise_one_level
from models import simple_ffn

Using TensorFlow backend.


## Read data

In [6]:
# Load the sentence corpus. The encoding is passed explicitly so the result
# does not depend on the platform's default locale encoding (JSON text is UTF-8).
sentences_path = os.path.join(DATA_PATH, 'sentences.json')
with open(sentences_path, 'r', encoding='utf-8') as datafile:
    sentences = json.load(datafile)

In [7]:
# Read the document-level training table from the data directory.
training_csv_path = os.path.join(DATA_PATH, 'training_data.csv')
documents = pd.read_csv(training_csv_path)

## Prepare data

In [8]:
# `raise_one_level` comes from the project's data module; presumably it removes
# one level of nesting from the loaded JSON — confirm in modules/data.
sentences_flat = raise_one_level(sentences)
# Tabulate the flattened records; later cells read the 'body' and 'class' columns.
sentences_df = pd.DataFrame(data=sentences_flat)

In [5]:
# Disabled experiment, kept for reference. `flatten_data` is imported above, but
# `single_corpora` is not defined in any cell shown in this notebook, so the
# line cannot be re-enabled as-is.
# flat_corpora, flat_labels = flatten_data(single_corpora[5:9])

In [9]:
# Split sentence bodies and their class labels. test_size=0.75 holds out 75% of
# the rows for testing; random_state is fixed so the split is reproducible.
split_parts = train_test_split(
    sentences_df['body'],
    sentences_df['class'],
    test_size=0.75,
    random_state=123,
)
corpora_train, corpora_test, labels_train, labels_test = split_parts

In [28]:
# Build one prepared training set per setting 1, 2, 3; each setting is passed
# to prepare_training_data as the tuple (n, n).
# NOTE(review): the progress line prints n + 1 (2, 3, 4), not the value that is
# actually passed — confirm which number is meant to be shown.
training_data = []
for ngram_order in range(1, 4):
    print(ngram_order + 1)

    # These three names are deliberately kept at cell level (not folded into the
    # dict directly): they remain bound to the last pass after the loop ends.
    document_matrix, labels, pipeline_instance = prepare_training_data(
        corpora_train, labels_train, (ngram_order, ngram_order))
    training_data.append(dict(
        document_matrix=document_matrix,
        labels=labels,
        pipeline_instance=pipeline_instance,
    ))

2
3
4


In [30]:
# Inspect the first prepared entry: feature shape, label shape and its pipeline.
first_entry = training_data[0]
(
    first_entry['document_matrix'].shape,
    first_entry['labels'].shape,
    first_entry['pipeline_instance'],
)

((10139, 68), (10139, 11), Pipeline(memory=None,
      steps=[('vect', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True))]))

In [29]:
# Show the feature-matrix shape of every prepared training set.
for prepared in training_data:
    matrix_shape = prepared['document_matrix'].shape
    print(matrix_shape)

(10139, 68)
(10139, 2161)
(10139, 21082)


In [33]:
# Preview the first rows of the first feature matrix, transposed so that each
# feature is shown as a row.
first_matrix_head = training_data[0]['document_matrix'].head()
first_matrix_head.transpose()

Unnamed: 0,0,1,2,3,4
,0.435572,0.368490,0.0,0.571279,0.404572
!,0.000000,0.000000,0.0,0.000000,0.000000
"""",0.128529,0.000000,0.0,0.000000,0.000000
#,0.000000,0.000000,0.0,0.000000,0.000000
$,0.000000,0.000000,0.0,0.000000,0.000000
%,0.000000,0.000000,0.0,0.000000,0.000000
&,0.000000,0.000000,0.0,0.000000,0.000000
',0.000000,0.000000,0.0,0.028501,0.000000
(,0.000000,0.000000,0.0,0.000000,0.000000
),0.000000,0.000000,0.0,0.000000,0.000000


In [14]:
# Disabled: would write the pipeline object to DATA_PATH/pipeline_instance.pickle.
# `pipeline_instance` is rebound on every pass of the training-data loop, so
# this would store only the pipeline from the last pass.
# NOTE(review): only load such a file from a trusted location — unpickling can
# execute arbitrary code.
# with open(os.path.join(DATA_PATH, 'pipeline_instance.pickle'),'wb') as datafile:
#         pickle.dump(pipeline_instance, datafile)

## Create and train model

In [44]:
# Candidate per-layer settings; the lists have different lengths.
layers = [128, 128]
activations = ['relu']
dropout = [0.15]
attention = [128]

# Length of the longest settings list (displayed as the cell result).
max(map(len, (layers, activations, dropout, attention)))

2

In [34]:
# Build one model on the last prepared feature set to inspect its architecture.
# The inputs are read from `training_data` explicitly instead of relying on the
# `document_matrix` / `labels` names left bound by the data-preparation loop,
# so this cell no longer depends on leaked loop variables.
summary_entry = training_data[-1]
model = simple_ffn(summary_entry['document_matrix'], summary_entry['labels'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 21082)        0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          2698624     input_1[0][0]                    
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 128)          0           dense_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 128)          0           activation_1[0][0]               
__________________________________________________________________________________________________
batch_norm

In [35]:
# Train one feed-forward model per prepared training set, keeping them in the
# same order as `training_data`.
models = []
for prepared in training_data:
    features, targets = prepared['document_matrix'], prepared['labels']
    print(features.shape)
    # `model` is rebound on each pass; after the loop it refers to the last model.
    model = simple_ffn(features, targets)
    # 10% of the training rows are set aside for validation during fitting.
    model.fit(features, targets, epochs=100, validation_split=0.1)
    models.append(model)

(10139, 68)
Train on 9125 samples, validate on 1014 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100


Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100

Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100


Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [23]:
# Persist the current `model` to the models directory.
# NOTE(review): `model` is rebound on every pass of the training loop, so on a
# top-to-bottom run this saves the model fitted on the last entry of
# `training_data`. The `_bi` suffix presumably refers to bigrams — confirm the
# intended model is the one being saved.
model_file = os.path.join(MODELS_PATH, 'ffn_sample_model_sentences_bi.h5')
model.save(model_file)

## Test on unseen data 

In [36]:
# Prepare the held-out data once per training set, reusing that set's own
# pipeline instance so train and test features come from the same pipeline.
test_data = []
for prepared in training_data:
    X_test, y_test = prepare_test_data(
        corpora_test, labels_test, prepared['pipeline_instance'])
    test_data.append(dict(X_test=X_test, y_test=y_test))

In [38]:
# Preview the first rows of the first test feature matrix.
first_test_features = test_data[0]['X_test']
first_test_features.head()

Unnamed: 0,Unnamed: 1,!,"""",#,$,%,&,',(,),...,u,v,w,x,y,z,{,|,},~
0,0.351427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080407,0.080444,...,0.198643,0.0,0.16834,0.118161,0.178917,0.0,0.0,0.0,0.0,0.0
1,0.531607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.037561,0.0,0.038198,0.268115,0.121792,0.0,0.0,0.0,0.0,0.0
2,0.47093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038955,0.054692,0.066025,0.0,0.014035,0.0,0.0,0.0,0.0,0.0
3,0.231594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.087272,0.0,0.266251,0.0,0.188653,0.151879,0.0,0.0,0.0,0.0
4,0.380765,0.0,0.0,0.0,0.0,0.0,0.0,0.075985,0.06534,0.06537,...,0.336291,0.018886,0.054718,0.0,0.087234,0.07023,0.0,0.0,0.0,0.0


In [39]:
# Preview the first rows of the first test label matrix.
first_test_labels = test_data[0]['y_test']
first_test_labels.head()

Unnamed: 0,af,en,nr,nso,ss,st,tn,ts,ve,xh,zu
34750,0,0,0,0,0,0,0,0,0,1,0
18986,0,0,0,1,0,0,0,0,0,0,0
13655,0,1,0,0,0,0,0,0,0,0,0
15126,0,0,1,0,0,0,0,0,0,0,0
21978,0,0,0,0,1,0,0,0,0,0,0


In [42]:
# Evaluate every trained model on the test set that was built with its own
# pipeline. `models` and `test_data` are both filled in `training_data` order,
# so pairing them positionally matches each model with its features.
for trained_model, test_entry in zip(models, test_data):
    score, accuracy = trained_model.evaluate(test_entry['X_test'], test_entry['y_test'])
    # Scale to a percentage first and round afterwards, so binary floating-point
    # noise (e.g. 75.77000000000001) does not leak into the printed value.
    print('Model test accuracy', round(accuracy * 100, 2))

Model test accuracy 77.2
Model test accuracy 75.77000000000001
Model test accuracy 71.95
