# A simple feed-forward model

```




```
- [Read data](#Read-data)
- [Prepare data](#Prepare-data)
- [Create and train model](#Create-and-train-model)
- [Test on unseen data](#Test-on-unseen-data)

In [1]:
import os
import sys
import json
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
# Relative locations of the helper modules, the saved models and the input
# data. Being relative, they resolve against the kernel's working directory.
MODULES_PATH = '../modules'
MODELS_PATH = '../models'
DATA_PATH = '../data'

# Make the local helper modules importable. The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate sys.path entries.
if MODULES_PATH not in sys.path:
    sys.path.append(MODULES_PATH)
from data import flatten_data, prepare_training_data, prepare_test_data
from models import simple_ffn

Using TensorFlow backend.


## Read data

In [3]:
# Load the raw corpora from JSON. The encoding is stated explicitly so the
# text is decoded as UTF-8 regardless of the platform's default locale
# (without it, open() falls back to a locale-dependent encoding).
with open(os.path.join(DATA_PATH, 'single_corpora.json'), 'r', encoding='utf-8') as datafile:
    single_corpora = json.load(datafile)

In [4]:
# Size of every corpus, to see how balanced the document classes are.
list(map(len, single_corpora))

[38238, 478811, 75634, 69254, 49012, 38206, 40485, 32279, 58398]

## Prepare data

In [5]:
# Only the corpora at indices 5 through 8 are used from here on.
selected_corpora = single_corpora[5:9]
flat_corpora, flat_labels = flatten_data(selected_corpora)

In [6]:
# Seeded split so the partition is reproducible.
# NOTE(review): test_size=0.75 holds out three quarters of the samples for
# testing and trains on the remaining quarter — confirm this is intended.
split_parts = train_test_split(
    flat_corpora,
    flat_labels,
    test_size=0.75,
    random_state=123,
)
corpora_train, corpora_test, labels_train, labels_test = split_parts

In [7]:
# Turn the training texts and labels into model inputs. The helper also hands
# back the pipeline it used, which is reused for the test data later on.
(document_matrix,
 labels,
 pipeline_instance) = prepare_training_data(corpora_train, labels_train)

In [8]:
# Sanity check: shapes of the feature and label matrices, plus the pipeline.
(document_matrix.shape, labels.shape, pipeline_instance)

((42254, 1511), (42254, 4), Pipeline(memory=None,
      steps=[('vect', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(2, 2), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True))]))

In [14]:
# Persist the preprocessing pipeline so it can be reloaded later.
pipeline_path = os.path.join(DATA_PATH, 'pipeline_instance.pickle')
with open(pipeline_path, 'wb') as datafile:
    pickle.dump(pipeline_instance, datafile)

## Create and train model

In [10]:
# Build the network with the project helper. The training arrays are passed in,
# presumably so the helper can size the input and output layers — confirm in
# the models module.
model = simple_ffn(document_matrix, labels)
# Print the layer-by-layer architecture.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1511)         0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          193536      input_1[0][0]                    
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 128)          0           dense_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 128)          0           activation_1[0][0]               
__________________________________________________________________________________________________
batch_norm

In [11]:
# Train for 100 epochs; 10% of the training samples are set aside for
# validation. The returned History object is this cell's displayed value.
model.fit(
    document_matrix,
    labels,
    epochs=100,
    validation_split=0.1,
)

Train on 38028 samples, validate on 4226 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/10

<keras.callbacks.History at 0x7fb727dd86d8>

In [15]:
# Store the trained network in the models directory.
model_path = os.path.join(MODELS_PATH, 'ffn_sample_model.h5')
model.save(model_path)

## Test on unseen data

In [19]:
# Prepare the held-out split, passing in the pipeline obtained while
# preparing the training data.
X_test, y_test = prepare_test_data(
    corpora_test,
    labels_test,
    pipeline_instance,
)

In [20]:
# Preview the first rows of the test feature matrix.
X_test.head()

Unnamed: 0,%,&,*,+,-,/,0,1,2,3,...,‹i,‹n,‹o,‹u,‹w,‹â,€¢,€š,€”,™t
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Preview the first rows of the test labels.
y_test.head()

Unnamed: 0,nr,xh,zu
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,0,1


In [33]:
# Evaluate on the held-out data. The result is unpacked into two values;
# presumably the loss followed by the accuracy metric — confirm against how
# the model is compiled in the models module.
evaluation = model.evaluate(X_test, y_test)
score, accuracy = evaluation



In [35]:
# Use the built-in round() rather than the NumPy-only .round() method, so the
# cell works whether evaluate() returned a NumPy scalar or a plain Python float.
print('Model test accuracy', round(accuracy, 4))

Model test accuracy 0.9867
