# A simple feed forward model

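This notebook trains a simple feed-forward network to classify documents into nine classes from character-bigram TF-IDF features. Along the way it saves the train/test split, the preprocessing pipeline, and the trained model to disk.
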
- [Read data](#Read-data)
- [Prepare data](#Prepare-data)
- [Create and train model](#Create-and-train-model)
- [Test on unseen data](#Test-on-unseen-data)

In [1]:
import os
import sys
import json
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
# Locations of the project's helper modules, saved models and data files.
# These are relative paths, so they resolve against the kernel's working directory.
MODULES_PATH = '../modules'
MODELS_PATH = '../models'
DATA_PATH = '../data'

# Make the project helpers importable. The membership check keeps the cell
# idempotent: re-running it does not pile duplicate entries onto sys.path.
if MODULES_PATH not in sys.path:
    sys.path.append(MODULES_PATH)
from data import flatten_data, prepare_training_data, prepare_test_data
from models import simple_ffn

Using TensorFlow backend.


In [3]:
# Print every device TensorFlow detects, as a quick check that a GPU is
# available before training.
# NOTE(review): `tensorflow.python.client` is not part of TensorFlow's documented
# public API, so this import may break on other TensorFlow versions — confirm.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 10764946797048720480
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 4951913267
locality {
  bus_id: 1
  links {
  }
}
incarnation: 11699701866648019640
physical_device_desc: "device: 0, name: GeForce GTX 1060 with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


## Read data

In [4]:
# Load the raw corpora from JSON.
# The encoding is given explicitly so decoding does not depend on the platform's
# default locale encoding (e.g. cp1252 on Windows), which could garble or reject
# non-ASCII text.
with open(os.path.join(DATA_PATH, 'single_corpora.json'), 'r', encoding='utf-8') as datafile:
    single_corpora = json.load(datafile)

In [5]:
# Class balance: number of items in each entry of single_corpora.
list(map(len, single_corpora))

[38238, 478811, 75634, 69254, 49012, 38206, 40485, 32279, 58398]

## Prepare data

In [6]:
# Collapse the per-class corpora into two flat outputs: the documents and their labels.
# Presumably these are parallel sequences with one label per document — confirm
# against `flatten_data` in the project's `data` module.
flat_corpora, flat_labels = flatten_data(single_corpora)

In [7]:
# Hold out 25% of the documents as a test set.
# The fixed random_state makes the split reproducible across runs.
corpora_train, corpora_test, labels_train, labels_test = train_test_split(
    flat_corpora,
    flat_labels,
    random_state=123,
    test_size=0.25,
)

In [8]:
# Save the training split to disk as one (documents, labels) tuple.
with open(os.path.join(DATA_PATH, 'train_data.pickle'), 'wb') as datafile:
    pickle.dump((corpora_train, labels_train), datafile)

In [9]:
# Save the held-out test split to disk as one (documents, labels) tuple.
with open(os.path.join(DATA_PATH, 'test_data.pickle'), 'wb') as datafile:
    pickle.dump((corpora_test, labels_test), datafile)

In [10]:
# Turn the training documents and labels into the feature matrix and label
# matrix used for training. The helper also returns the preprocessing pipeline,
# presumably already fitted on the training documents — confirm in the project's
# `data` module.
document_matrix, labels, pipeline_instance = prepare_training_data(corpora_train, labels_train)

In [12]:
# Sanity check: show the feature-matrix shape, the label-matrix shape and the
# pipeline's configuration.
document_matrix.shape, labels.shape, pipeline_instance

((653432, 2346), (653432, 9), Pipeline(memory=None,
      steps=[('vect', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(2, 2), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True))]))

In [11]:
# Save the preprocessing pipeline to disk alongside the data splits.
with open(os.path.join(DATA_PATH, 'pipeline_instance.pickle'), 'wb') as datafile:
    pickle.dump(pipeline_instance, datafile)

## Create and train model

In [22]:
# Build the feed-forward network. The feature and label matrices are passed in,
# presumably so the helper can size the input and output layers — confirm in the
# project's `models` module.
model = simple_ffn(document_matrix, labels)
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 2346)         0                                            
__________________________________________________________________________________________________
dense_9 (Dense)                 (None, 128)          300416      input_5[0][0]                    
__________________________________________________________________________________________________
activation_9 (Activation)       (None, 128)          0           dense_9[0][0]                    
__________________________________________________________________________________________________
dropout_5 (Dropout)             (None, 128)          0           activation_9[0][0]               
__________________________________________________________________________________________________
batch_norm

In [23]:
# Training run 1: 25 epochs with mini-batches of 1024 samples.
# A tenth of the samples is held out for validation; verbose=2 logs one line per epoch.
model.fit(
    document_matrix,
    labels,
    batch_size=1024,
    epochs=25,
    validation_split=0.1,
    verbose=2,
)

Train on 588088 samples, validate on 65344 samples
Epoch 1/25
 - 36s - loss: 2.0353 - acc: 0.7093 - val_loss: 1.7836 - val_acc: 0.8133
Epoch 2/25
 - 34s - loss: 1.6477 - acc: 0.7195 - val_loss: 1.4843 - val_acc: 0.8132
Epoch 3/25
 - 33s - loss: 1.4301 - acc: 0.7204 - val_loss: 1.2928 - val_acc: 0.8152
Epoch 4/25
 - 34s - loss: 1.2740 - acc: 0.7230 - val_loss: 1.1412 - val_acc: 0.8163
Epoch 5/25
 - 34s - loss: 1.1463 - acc: 0.7432 - val_loss: 1.0135 - val_acc: 0.8580
Epoch 6/25
 - 34s - loss: 1.0397 - acc: 0.7919 - val_loss: 0.8992 - val_acc: 0.8585
Epoch 7/25
 - 33s - loss: 0.9451 - acc: 0.8081 - val_loss: 0.7960 - val_acc: 0.8591
Epoch 8/25
 - 34s - loss: 0.8593 - acc: 0.8106 - val_loss: 0.7037 - val_acc: 0.8594
Epoch 9/25
 - 34s - loss: 0.7874 - acc: 0.8106 - val_loss: 0.6284 - val_acc: 0.8594
Epoch 10/25
 - 33s - loss: 0.7279 - acc: 0.8107 - val_loss: 0.5668 - val_acc: 0.8598
Epoch 11/25
 - 34s - loss: 0.6726 - acc: 0.8114 - val_loss: 0.5171 - val_acc: 0.8601
Epoch 12/25
 - 34s - lo

<keras.callbacks.History at 0x23fa8b3ae10>

In [24]:
# Training run 2: another 25 epochs with the same settings.
# fit() starts from the model's current weights, so this continues the earlier
# training rather than restarting it.
model.fit(
    document_matrix,
    labels,
    batch_size=1024,
    epochs=25,
    validation_split=0.1,
    verbose=2,
)

Train on 588088 samples, validate on 65344 samples
Epoch 1/25
 - 33s - loss: 0.2544 - acc: 0.8992 - val_loss: 0.1665 - val_acc: 0.9472
Epoch 2/25
 - 33s - loss: 0.2423 - acc: 0.8998 - val_loss: 0.1563 - val_acc: 0.9476
Epoch 3/25
 - 34s - loss: 0.2292 - acc: 0.9028 - val_loss: 0.1491 - val_acc: 0.9485
Epoch 4/25
 - 33s - loss: 0.2097 - acc: 0.9111 - val_loss: 0.1403 - val_acc: 0.9510
Epoch 5/25
 - 34s - loss: 0.1898 - acc: 0.9243 - val_loss: 0.1273 - val_acc: 0.9683
Epoch 6/25
 - 34s - loss: 0.1739 - acc: 0.9407 - val_loss: 0.1124 - val_acc: 0.9777
Epoch 7/25
 - 34s - loss: 0.1618 - acc: 0.9503 - val_loss: 0.1005 - val_acc: 0.9813
Epoch 8/25
 - 33s - loss: 0.1518 - acc: 0.9539 - val_loss: 0.0915 - val_acc: 0.9825
Epoch 9/25
 - 34s - loss: 0.1444 - acc: 0.9556 - val_loss: 0.0850 - val_acc: 0.9835
Epoch 10/25
 - 34s - loss: 0.1374 - acc: 0.9575 - val_loss: 0.0801 - val_acc: 0.9839
Epoch 11/25
 - 33s - loss: 0.1308 - acc: 0.9592 - val_loss: 0.0762 - val_acc: 0.9848
Epoch 12/25
 - 34s - lo

<keras.callbacks.History at 0x23fa8b3ada0>

In [25]:
# Training run 3: 25 more epochs with the same settings, again continuing from
# the model's current weights.
model.fit(
    document_matrix,
    labels,
    batch_size=1024,
    epochs=25,
    validation_split=0.1,
    verbose=2,
)

Train on 588088 samples, validate on 65344 samples
Epoch 1/25
 - 34s - loss: 0.0590 - acc: 0.9838 - val_loss: 0.0533 - val_acc: 0.9860
Epoch 2/25
 - 33s - loss: 0.0579 - acc: 0.9840 - val_loss: 0.0527 - val_acc: 0.9862
Epoch 3/25
 - 34s - loss: 0.0562 - acc: 0.9842 - val_loss: 0.0521 - val_acc: 0.9861
Epoch 4/25
 - 34s - loss: 0.0551 - acc: 0.9844 - val_loss: 0.0515 - val_acc: 0.9863
Epoch 5/25
 - 33s - loss: 0.0537 - acc: 0.9847 - val_loss: 0.0512 - val_acc: 0.9865
Epoch 6/25
 - 33s - loss: 0.0527 - acc: 0.9848 - val_loss: 0.0506 - val_acc: 0.9865
Epoch 7/25
 - 33s - loss: 0.0518 - acc: 0.9851 - val_loss: 0.0503 - val_acc: 0.9866
Epoch 8/25
 - 34s - loss: 0.0504 - acc: 0.9853 - val_loss: 0.0500 - val_acc: 0.9868
Epoch 9/25
 - 34s - loss: 0.0498 - acc: 0.9856 - val_loss: 0.0496 - val_acc: 0.9867
Epoch 10/25
 - 34s - loss: 0.0492 - acc: 0.9858 - val_loss: 0.0495 - val_acc: 0.9869
Epoch 11/25
 - 34s - loss: 0.0482 - acc: 0.9861 - val_loss: 0.0488 - val_acc: 0.9869
Epoch 12/25
 - 33s - lo

<keras.callbacks.History at 0x23fa8b79c88>

In [26]:
# Training run 4: a final 25 epochs with the same settings, continuing from the
# model's current weights.
model.fit(
    document_matrix,
    labels,
    batch_size=1024,
    epochs=25,
    validation_split=0.1,
    verbose=2,
)

Train on 588088 samples, validate on 65344 samples
Epoch 1/25
 - 34s - loss: 0.0389 - acc: 0.9898 - val_loss: 0.0467 - val_acc: 0.9875
Epoch 2/25
 - 34s - loss: 0.0381 - acc: 0.9899 - val_loss: 0.0467 - val_acc: 0.9874
Epoch 3/25
 - 34s - loss: 0.0377 - acc: 0.9901 - val_loss: 0.0467 - val_acc: 0.9875
Epoch 4/25
 - 34s - loss: 0.0373 - acc: 0.9903 - val_loss: 0.0466 - val_acc: 0.9875
Epoch 5/25
 - 33s - loss: 0.0370 - acc: 0.9902 - val_loss: 0.0463 - val_acc: 0.9875
Epoch 6/25
 - 33s - loss: 0.0364 - acc: 0.9906 - val_loss: 0.0460 - val_acc: 0.9878
Epoch 7/25
 - 33s - loss: 0.0360 - acc: 0.9906 - val_loss: 0.0461 - val_acc: 0.9876
Epoch 8/25
 - 33s - loss: 0.0355 - acc: 0.9908 - val_loss: 0.0461 - val_acc: 0.9876
Epoch 9/25
 - 33s - loss: 0.0352 - acc: 0.9909 - val_loss: 0.0461 - val_acc: 0.9876
Epoch 10/25
 - 33s - loss: 0.0350 - acc: 0.9909 - val_loss: 0.0460 - val_acc: 0.9876
Epoch 11/25
 - 33s - loss: 0.0344 - acc: 0.9912 - val_loss: 0.0461 - val_acc: 0.9875
Epoch 12/25
 - 33s - lo

<keras.callbacks.History at 0x23faaca9898>

In [27]:
# Save the trained model to the models directory as an .h5 file so it can be
# reloaded without retraining.
model.save(os.path.join(MODELS_PATH,'ffn_sample_model.h5'))