# A simple character-level LSTM sequence model

```




```
- [Read data](#Read-data)
- [Prepare data](#Prepare-data)
- [Create and train model](#Create-and-train-model)
- [Test on unseen data](#Test-on-unseen-data)

In [18]:
import os
import sys
import json
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

In [2]:
# Project-relative locations, resolved against the notebook's working directory.
DATA_PATH = '../data'
MODELS_PATH = '../models'
MODULES_PATH = '../modules'

# The helper modules live outside the package path, so expose their directory
# before importing from them.
sys.path.append(MODULES_PATH)
from data import (
    flatten_data,
    prepare_test_data,
    prepare_training_data,
    raise_one_level,
)
from models import sequence_model

Using TensorFlow backend.


## Read data

In [3]:
# Load the sentence corpus from the data directory.
# JSON text is UTF-8 by specification, so the encoding is pinned instead of
# being left to the platform default (which differs between systems).
with open(os.path.join(DATA_PATH, 'sentences.json'), 'r', encoding='utf-8') as datafile:
    sentences = json.load(datafile)

In [4]:
# Read the training-data CSV. The path is built from DATA_PATH, like the
# sentences file, so relocating the data directory needs a single change.
# NOTE(review): `documents` is not referenced by any later cell visible here.
documents = pd.read_csv(os.path.join(DATA_PATH, 'training_data.csv'))

## Prepare data

In [5]:
# Run the loaded sentences through the project helper `raise_one_level`, then
# tabulate the result; the 'body' and 'class' columns are used further down.
sentences_flat = raise_one_level(sentences)
sentences_df = pd.DataFrame(data=sentences_flat)

In [6]:
# Partition the sentence texts ('body') and their labels ('class').
# test_size=0.75 holds out three quarters of the rows, so only one quarter is
# used for training; random_state pins the shuffle for reproducibility.
corpora_train, corpora_test, labels_train, labels_test = train_test_split(
    sentences_df['body'],
    sentences_df['class'],
    random_state=123,
    test_size=0.75,
)

In [7]:
# Break every training sentence into its list of individual characters.
characters = [list(text) for text in corpora_train]

In [8]:
# Pass the per-sentence character lists through `raise_one_level` and report
# the length of the result.
# NOTE(review): this rebinds `characters` to a function of its previous value,
# so re-running this cell on its own applies `raise_one_level` a second time.
characters = raise_one_level(characters)
print(len(characters))

3043734


In [9]:
# Number of distinct items in `characters`; reused below as the tokenizer's
# `num_words` and as the third argument to `sequence_model`.
num_unique_characters = len(set(characters))
print(num_unique_characters)

94


In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [11]:
# Character-level tokenizer with filtering and lower-casing switched off.
# NOTE(review): Keras documents that only the `num_words - 1` most frequent
# tokens are kept, so passing the exact character count presumably drops the
# rarest character from the encoded sequences -- confirm this is intended
# before changing it, since the one-hot width downstream depends on it.
tokenizer = Tokenizer(
    num_words=num_unique_characters,
    filters=None,
    lower=False,
    char_level=True,
)

# Learn the character index from the training texts, then encode those texts.
tokenizer.fit_on_texts(corpora_train)
sequences = tokenizer.texts_to_sequences(corpora_train)

In [26]:
# with open(os.path.join(DATA_PATH, "character_sequence_tokenizer.pkl"), "wb") as f:
#     pickle.dump(tokenizer, f)

In [12]:
# Force every sequence to exactly 500 entries. Shorter sequences are
# zero-padded at the front and longer ones lose their leading entries; both
# are the Keras defaults, spelled out here so the behaviour is visible.
padded_sequences = pad_sequences(
    sequences,
    maxlen=500,
    padding='pre',
    truncating='pre',
)

In [13]:
# Sanity check: (number of encoded training texts, padded length of 500).
padded_sequences.shape

(10139, 500)

In [14]:
# Per-column count of entries greater than 0, i.e. how many sequences hold a
# real (non-padding) character index at each position.
(padded_sequences > 0).sum(axis=0)

array([ 1394,  1396,  1401,  1405,  1410,  1414,  1418,  1421,  1421,
        1422,  1428,  1431,  1440,  1444,  1449,  1455,  1463,  1467,
        1475,  1480,  1486,  1493,  1498,  1504,  1511,  1516,  1519,
        1525,  1536,  1538,  1544,  1551,  1555,  1562,  1572,  1576,
        1579,  1586,  1588,  1593,  1596,  1602,  1608,  1612,  1614,
        1620,  1625,  1632,  1634,  1638,  1641,  1650,  1656,  1664,
        1671,  1674,  1679,  1682,  1685,  1688,  1691,  1696,  1700,
        1703,  1710,  1715,  1723,  1727,  1734,  1742,  1752,  1758,
        1763,  1768,  1776,  1780,  1786,  1791,  1795,  1799,  1802,
        1806,  1813,  1819,  1824,  1826,  1833,  1838,  1844,  1847,
        1857,  1863,  1875,  1883,  1887,  1893,  1898,  1909,  1913,
        1922,  1929,  1938,  1946,  1956,  1959,  1964,  1973,  1978,
        1985,  1988,  1996,  2008,  2016,  2022,  2034,  2038,  2040,
        2045,  2050,  2056,  2059,  2065,  2074,  2077,  2083,  2088,
        2096,  2108,

## Create and train model

In [16]:
# One-hot encode the training labels: one indicator column per distinct class.
labels = pd.get_dummies(labels_train)

In [15]:
# Build the network with the project helper `sequence_model` and list its layers.
# NOTE(review): the helper is given the 2-D padded index matrix, while the
# recorded summary reports a (None, 500, 94) input -- presumably it only reads
# the sequence length from this argument; confirm against the `models` module.
model = sequence_model(padded_sequences, labels, num_unique_characters)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 500, 94)      0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 500, 128)     114176      input_1[0][0]                    
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 500, 128)     0           lstm_1[0][0]                     
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 500, 128)     0           activation_1[0][0]               
__________________________________________________________________________________________________
batch_norm

In [19]:
# One-hot encode every character index, adding a trailing class axis.
# `num_classes` is pinned to the width the model was built with rather than
# inferred from the largest index that happens to survive padding/truncation,
# so the tensor's last dimension always matches the network's input.
# Assumes every index is below `num_unique_characters`, which the tokenizer's
# `num_words` cap presumably guarantees -- TODO confirm.
sequences_3d = to_categorical(padded_sequences, num_classes=num_unique_characters)

In [20]:
# Sanity check: the last axis is the one-hot width added by `to_categorical`.
sequences_3d.shape

(10139, 500, 94)

In [None]:
# Train for 100 epochs on the one-hot sequences, with 10% of the samples set
# aside for validation (validation_split=0.1).
model.fit(sequences_3d, labels, epochs=100, validation_split=0.1)

Train on 9125 samples, validate on 1014 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

In [23]:
# model.save(os.path.join(MODELS_PATH,'ffn_sample_model_sentences_bi.h5'))

## Test on unseen data 

In [36]:
# Build one test set per entry of `training_data`, using that entry's
# 'pipeline_instance' to prepare the held-out texts and labels.
# NOTE(review): `training_data` does not appear to be defined in any cell
# visible above, so on a fresh kernel (Restart & Run All) this would raise
# NameError -- confirm where it is created and restore that cell.
test_data = []
for i in training_data:
    X_test, y_test = prepare_test_data(corpora_test, labels_test, i['pipeline_instance'])
    test_data.append({'X_test': X_test, 'y_test': y_test})

In [38]:
# Preview the prepared features of the first test set.
# NOTE(review): the recorded output has one float column per character, not
# padded index sequences like the training input above -- confirm this is the
# representation the evaluated model expects.
test_data[0]['X_test'].head()

Unnamed: 0,Unnamed: 1,!,"""",#,$,%,&,',(,),...,u,v,w,x,y,z,{,|,},~
0,0.351427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080407,0.080444,...,0.198643,0.0,0.16834,0.118161,0.178917,0.0,0.0,0.0,0.0,0.0
1,0.531607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.037561,0.0,0.038198,0.268115,0.121792,0.0,0.0,0.0,0.0,0.0
2,0.47093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038955,0.054692,0.066025,0.0,0.014035,0.0,0.0,0.0,0.0,0.0
3,0.231594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.087272,0.0,0.266251,0.0,0.188653,0.151879,0.0,0.0,0.0,0.0
4,0.380765,0.0,0.0,0.0,0.0,0.0,0.0,0.075985,0.06534,0.06537,...,0.336291,0.018886,0.054718,0.0,0.087234,0.07023,0.0,0.0,0.0,0.0


In [39]:
# Preview the matching labels of the first test set (one indicator column per
# class in the recorded output).
test_data[0]['y_test'].head()

Unnamed: 0,af,en,nr,nso,ss,st,tn,ts,ve,xh,zu
34750,0,0,0,0,0,0,0,0,0,1,0
18986,0,0,0,1,0,0,0,0,0,0,0
13655,0,1,0,0,0,0,0,0,0,0,0
15126,0,0,1,0,0,0,0,0,0,0,0
21978,0,0,0,0,1,0,0,0,0,0,0


In [42]:
# Evaluate each model on the test set at the same position and report its
# accuracy as a percentage.
# NOTE(review): `models` does not appear to be defined in any visible cell
# (only the single `model` is) -- confirm where the list comes from.
for idx, i in enumerate(test_data):
    score, accuracy = models[idx].evaluate(i['X_test'], i['y_test'])
    # Scale to percent first and round last: rounding before the multiplication
    # lets float error back in and prints values like 75.77000000000001.
    print('Model test accuracy', round(float(accuracy) * 100, 2))

Model test accuracy 77.2
Model test accuracy 75.77000000000001
Model test accuracy 71.95
