# A simple feed-forward model

```




```
#### contents
- [Read data](#Read-data)
- [Prepare data](#Prepare-data)
- [Create and train models](#Create-and-train-models)
- [Final note](#Final-note)

In [1]:
import os
import sys
import json
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# GPU = False

In [3]:
# import tensorflow as tf
# from keras import backend as K

# num_cores = 10

# if GPU:
#     num_GPU = 1
#     num_CPU = 1
# else:
#     num_CPU = 1
#     num_GPU = 0

# config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
#                         inter_op_parallelism_threads=num_cores, 
#                         allow_soft_placement=True,
#                         device_count = {'CPU' : num_CPU,
#                                         'GPU' : num_GPU}
#                        )

# session = tf.Session(config=config)
# K.set_session(session)

Using TensorFlow backend.


In [2]:
# Project-relative locations used throughout the notebook.
MODULES_PATH = '../modules'
MODELS_PATH = '../models'
DATA_PATH = '../data'

# Make the project modules importable. The guard keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to `sys.path`.
if MODULES_PATH not in sys.path:
    sys.path.append(MODULES_PATH)
from data import flatten_data, prepare_training_data, prepare_test_data, \
                    raise_one_level
from models import parameter_ffn_seq

Using TensorFlow backend.


In [3]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [4]:
# To run the grid search in parallel, the device list below should show only the CPU.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 16924982804729173235
]


## Read data

- [contents](#contents)

In [5]:
# Read the pickled training pair: the corpora and their labels.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
with open(os.path.join(DATA_PATH, 'train_data.pickle'), mode='rb') as datafile:
    corpora_train, labels_train = pickle.load(datafile)

## Prepare data

- [contents](#contents)

In [6]:
# Read the pickled pipeline object that is handed to the data-preparation helper.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
with open(os.path.join(DATA_PATH, 'pipeline_instance.pickle'), mode='rb') as datafile:
    pipeline_instance = pickle.load(datafile)

In [7]:
# Turn the raw corpora/labels into model inputs with the loaded pipeline.
# NOTE(review): `prepare_test_data` is applied to the *training* corpora -
# presumably because the pickled pipeline is already fitted; confirm.
X_train, y_train = prepare_test_data(
    corpora_train, labels_train, pipeline_instance
)

# Only the 25 % "train" side of this split is used for the grid search;
# the 75 % "test" side is discarded right below.
X_train_grid, X_test_grid, y_train_grid, y_test_grid = train_test_split(
    X_train, y_train, test_size=0.75, random_state=123
)

# Release everything that is not needed for the grid search.
del X_train, y_train, corpora_train, labels_train, X_test_grid, y_test_grid

X_train_grid.shape, y_train_grid.shape

((7604, 2692), (7604, 11))

In [8]:
# Second axis of each array: the number of input columns and of label columns.
input_shape, classes = X_train_grid.shape[1], y_train_grid.shape[1]

## Create and train models

- [contents](#contents)

In [9]:
def hyperparameters(x, y, create_model, params, cv=3, n_jobs=3,
                    epochs=10, batch_size=1024, keras_verbose=2):
    """Run one cross-validated grid search per model listed in `params`.

    Parameters
    ----------
    x : array-like
        Feature matrix, one row per sample.
    y : array-like
        Two-dimensional label matrix with one column per class. Keras models
        receive it unchanged; every other model receives the position of the
        row-wise maximum as its class label.
    create_model : callable
        Passed as `build_fn` to models whose name contains "keras".
    params : dict
        Maps a model class name (looked up in `globals()`) to the parameter
        grid handed to `GridSearchCV` for that model.
    cv : int, default 3
        Forwarded to `GridSearchCV`.
    n_jobs : int, default 3
        Forwarded to `GridSearchCV`.
    epochs : int, default 10
        Forwarded to the Keras wrapper only.
    batch_size : int, default 1024
        Forwarded to the Keras wrapper only.
    keras_verbose : int, default 2
        `verbose` argument forwarded to the Keras wrapper only.

    Returns
    -------
    dict
        Maps each model name to its fitted `GridSearchCV` object. The searches
        are created with `refit=False`.

    Raises
    ------
    AssertionError
        If a key of `params` is not a name available in `globals()`.
    """
    # every model name must resolve to something in the notebook namespace
    key_check = all(name in globals() for name in params)
    assert key_check, 'Some of your keys are missing from `globals()`.'

    # run models and store results
    result = {}

    for model, grid in params.items():
        print('Model: \t {} \n'.format(model))
        is_keras = 'keras' in model.lower()

        if is_keras:
            estimator = globals()[model](build_fn=create_model,
                                         epochs=epochs,
                                         batch_size=batch_size,
                                         verbose=keras_verbose)
        else:
            estimator = globals()[model]()

        gscv = GridSearchCV(estimator,
                            grid,
                            cv=cv,
                            verbose=1,
                            n_jobs=n_jobs,
                            refit=False)

        if is_keras:
            gscv.fit(x, y)
        else:
            # Positional argmax on the raw array. `Series.argmax` through
            # `DataFrame.apply` returns labels or positions depending on the
            # pandas version (and emits a FutureWarning on older releases).
            class_index = np.asarray(y).argmax(axis=1)
            gscv.fit(np.asarray(x), class_index)

        result[model] = gscv

    return result

In [14]:
# Minimal one-candidate-per-model grid, used for a quick end-to-end check
# of the search loop.
params_test = {
    'KerasClassifier': {
        'input_shape': [input_shape],
        'classes': [classes],
        'layers': [[32]],
        'activations': [['relu']],
        'dropout': [[0.15]],
    },
    # SVC is disabled because it takes quite a while:
    # 'SVC': {
    #     'kernel': ['linear'],
    #     'degree': [2],
    #     'gamma': [0.99],
    #     'tol': [0.1],
    #     'class_weight': [None],
    #     'random_state': [123],
    # },
    'XGBClassifier': {
        'learning_rate': [1],
        'n_estimators': [10],
        'max_depth': [2],
        'min_child_weight': [3],
        'gamma': [1],
        'subsample': [0.2],
        'colsample_bytree': [1.0],
        'random_state': [123],
    },
    'RandomForestClassifier': {
        'n_estimators': [10],
        'max_depth': [2],
        'random_state': [123],
        'class_weight': [None],
    },
}

In [10]:
# Full search grid. The Keras layer layouts are listed in three groups, in
# this order: one hidden layer, two equal hidden layers, two tapering layers.
params = {
    'KerasClassifier': {
        'input_shape': [input_shape],
        'classes': [classes],
        'layers': (
            [[units] for units in (32, 64, 128, 256, 512, 1024, 2048)]
            + [[units, units] for units in (32, 64, 128, 256, 512, 1024, 2048)]
            + [[128, 32], [128, 64], [256, 128], [512, 256], [1024, 512]]
        ),
        'activations': [['relu']],
        'dropout': [[0.15]],
    },
    # SVC grid, currently disabled:
    # 'SVC': {
    #     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    #     'degree': [2, 3, 5],
    #     'gamma': [0.001, 0.1, 0.5, 0.99],
    #     'tol': [0.1, 0.001, 0.0001],
    #     'class_weight': ['balanced', None],
    #     'random_state': [123],
    # },
    'XGBClassifier': {
        'learning_rate': [0.1, 0.001],
        'n_estimators': [50],
        'max_depth': [5],
        # 'min_child_weight': [3, 5],
        'gamma': [0.2, 0.1],
        'subsample': [0.6],
        'colsample_bytree': [1.0],
        'random_state': [123],
    },
    'RandomForestClassifier': {
        'n_estimators': [500],
        'max_depth': [2, 5, 10],
        'random_state': [123],
        'class_weight': ['balanced', 'balanced_subsample', None],
    },
}

In [11]:
# Set to True for a quick test run of the grid search using `params_test`;
# False runs the full grid defined in `params`.
test = False

In [12]:
# Run every grid search on either the small test grid or the full grid.
grid_searches = hyperparameters(
    x=X_train_grid,
    y=y_train_grid,
    create_model=parameter_ffn_seq,
    params=params_test if test else params,
    cv=3,
    n_jobs=10,
)

Model: 	 KerasClassifier 

Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  1.3min
[Parallel(n_jobs=10)]: Done  57 out of  57 | elapsed:  3.6min finished
will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.


Model: 	 XGBClassifier 

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   6 out of  12 | elapsed:  6.5min remaining:  6.5min
[Parallel(n_jobs=10)]: Done  12 out of  12 | elapsed: 10.2min finished
will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.


Model: 	 RandomForestClassifier 

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  27 out of  27 | elapsed:   45.2s finished


In [13]:
# Build the leaderboard: the best cross-validated candidate of each model.
# Rows are collected in a list and concatenated once. `DataFrame.append` is
# deprecated (and removed in pandas 2.0) and re-copies the frame on every call;
# this also removes the need for the `globals()` / `del` bookkeeping.
best_rows = []
for model, grid in grid_searches.items():
    best_row = (
        pd.DataFrame(grid.cv_results_)
        .sort_values('mean_test_score', ascending=False)
        .head(1)
        .assign(model=model)  # new frame instead of mutating the `.head(1)` slice
    )
    best_rows.append(best_row)

# sort=True is passed explicitly so the columns stay in alphabetical order when
# the models expose different parameter columns, without the FutureWarning.
leaderboard = pd.concat(best_rows, sort=True)
leaderboard

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,model,param_activations,param_class_weight,param_classes,param_colsample_bytree,param_dropout,...,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
13,126.126382,1.260755,0.890452,0.95121,KerasClassifier,[relu],,11.0,,[0.15],...,0.896252,0.949497,0.87929,0.951272,0.895817,0.95286,3.785761,0.115447,0.007896,0.001374
0,386.376796,0.500339,0.881641,0.94687,XGBClassifier,,,,1.0,,...,0.885692,0.946319,0.873767,0.9493,0.885466,0.944992,0.582246,0.026335,0.005569,0.001801
5,15.009525,1.369089,0.885981,0.926289,RandomForestClassifier,,balanced_subsample,,,,...,0.892787,0.926584,0.874951,0.928388,0.890205,0.923896,0.158739,0.170333,0.007871,0.001846


In [14]:
# Compact view: model name with its mean validation and training scores.
leaderboard.loc[:, ['model', 'mean_test_score', 'mean_train_score']]

Unnamed: 0,model,mean_test_score,mean_train_score
13,KerasClassifier,0.890452,0.95121
0,XGBClassifier,0.881641,0.94687
5,RandomForestClassifier,0.885981,0.926289


In [22]:
# Pick the model with the highest mean cross-validated score. The leaderboard
# rows follow the iteration order of `grid_searches`, not the score, so the
# first row is not necessarily the winner. The lookup is positional because the
# leaderboard index holds per-search candidate numbers that may repeat.
best_position = np.nanargmax(leaderboard['mean_test_score'].values)
best_model = leaderboard['model'].iloc[best_position]
best_params = grid_searches[best_model].best_params_

# Persist the winning parameter set next to the other model artefacts.
with open(os.path.join(MODELS_PATH, 'best_params.json'), 'w') as datafile:
    json.dump(best_params, datafile)

best_params

{'activations': ['relu'],
 'classes': 11,
 'dropout': [0.15],
 'input_shape': 2692,
 'layers': [2048, 2048]}

In [23]:
# Write the leaderboard table to the models directory as CSV.
leaderboard.to_csv(
    os.path.join(MODELS_PATH, 'leaderboard.csv')
)

In [25]:
# Pickle the dict of grid-search objects into the models directory.
with open(os.path.join(MODELS_PATH, 'grid_searches.pickle'), mode='wb') as datafile:
    pickle.dump(grid_searches, datafile)

## Final note

- [contents](#contents)