# Test the simple feed-forward model

In [18]:
import os
import json
import pickle

import numpy as np
import pandas as pd

import nltk

from sklearn.model_selection import train_test_split


In [12]:
import sys

# Project-relative locations used by the cells below.
MODULES_PATH = '../modules'
MODELS_PATH = '../models'
DATA_PATH = '../data'

# Make the project's local modules importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed more than once.
if MODULES_PATH not in sys.path:
    sys.path.append(MODULES_PATH)
from data import flatten_data, prepare_training_data, prepare_test_data
from models import simple_ffn

## Read data

In [15]:
# Load the corpora from JSON. The encoding is stated explicitly because
# open() otherwise falls back to the platform's locale encoding, which makes
# the decoded text machine-dependent.
with open(os.path.join(DATA_PATH, 'single_corpora.json'), 'r', encoding='utf-8') as datafile:
    single_corpora = json.load(datafile)

In [5]:
# Size of each corpus in single_corpora, as a quick look at class balance
list(map(len, single_corpora))

[38238, 478811, 75634, 69254, 49012, 38206, 40485, 32279, 58398]

## Prepare data

In [6]:
# Only the corpora at positions 2, 3 and 4 are used from here on.
selected_corpora = single_corpora[2:5]
flat_corpora, flat_labels = flatten_data(selected_corpora)

In [7]:
# Hold out a quarter of the flattened samples for testing; the fixed
# random_state makes the split repeatable.
corpora_train, corpora_test, labels_train, labels_test = train_test_split(
    flat_corpora,
    flat_labels,
    test_size=0.25,
    random_state=123,
)

In [8]:
# Turn the training split into a feature matrix and a label matrix; the
# pipeline that comes back is reused later for the test split.
document_matrix, labels, pipeline_instance = prepare_training_data(
    corpora_train,
    labels_train,
)

In [9]:
# Inspect the shapes of the training matrices and the returned pipeline
(document_matrix.shape, labels.shape, pipeline_instance)

((145416, 1936), (145416, 3), Pipeline(memory=None,
      steps=[('vect', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(2, 2), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True))]))

In [22]:
# Persist the pipeline returned by prepare_training_data alongside the data.
pipeline_path = os.path.join(DATA_PATH, 'pipeline_instance.pickle')
with open(pipeline_path, 'wb') as datafile:
    pickle.dump(pipeline_instance, datafile)

## Create and train model

In [28]:
# Create the model by passing the training features and label matrix to
# simple_ffn, then print its layer-by-layer summary.
model = simple_ffn(
    document_matrix,
    labels,
)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1936)         0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          247936      input_1[0][0]                    
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 128)          0           dense_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 128)          0           activation_1[0][0]               
__________________________________________________________________________________________________
batch_norm

In [29]:
# Train for five epochs, keeping 10% of the training rows aside for validation.
model.fit(
    document_matrix,
    labels,
    epochs=5,
    validation_split=0.1,
)

Train on 130874 samples, validate on 14542 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ffbf4bf3d68>

## Test on unseen data 

In [30]:
# Transform the held-out split with the pipeline obtained during training.
X_test, y_test = prepare_test_data(
    corpora_test,
    labels_test,
    pipeline_instance,
)

In [31]:
# Peek at the first five rows of the test features
X_test.head(n=5)

Unnamed: 0,$,%,&,*,+,-,/,0,1,2,...,”â,…â,…“,‰,€,€¦,€œ,€š,€”,€€
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Peek at the first five rows of the test labels
y_test.head(n=5)

Unnamed: 0,nr,xh,zu
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,0,1


In [33]:
# Evaluate on the held-out data; the two returned values are unpacked as
# score and accuracy.
score, accuracy = model.evaluate(
    X_test,
    y_test,
)



In [35]:
# np.round accepts both NumPy scalars and plain Python floats, whereas the
# .round() method exists only on NumPy scalars and raises AttributeError when
# evaluate() hands back a built-in float.
print('Model test accuracy', np.round(accuracy, 4))

Model test accuracy 0.9867
