In [31]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt

from data_preprocessing import *
from train_and_test import *
from grid_search import run_search

In [2]:
X_train, Y_train, X_test, Y_test, X_train_norm, Y_train_norm, X_test_norm, Y_test_norm = get_all_data(train_path,
                                                                                                          test_path)

In [3]:
param_grid = {
    'priors': [(i/10, 1-i/10) for i in range(1,10)],
}

search = run_search(GaussianNB(), param_grid, n_jobs=4, norm=True)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed:    1.9s finished


Train score: 0.608316481294
Test score: 0.61057904818
Best params: {'priors': (0.9, 0.09999999999999998)}


In [4]:
param_grid = {
    'priors': [(i/10, 1-i/10) for i in range(1,10)],
}

search = run_search(GaussianNB(), param_grid, n_jobs=4, norm=False)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed:    1.9s finished


Train score: 0.790602881699
Test score: 0.787829674377
Best params: {'priors': (0.9, 0.09999999999999998)}


In [5]:
param_grid = {
    'alpha': [0.01, .5, 1., 1.5, 2, 2.5, 5],
    'fit_prior': [True, False],
    'class_prior': [(i/10, 1-i/10) for i in range(1,10)],
}

search = run_search(MultinomialNB(), param_grid, n_jobs=4, norm=False)

Fitting 3 folds for each of 126 candidates, totalling 378 fits


[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    2.5s


Train score: 0.778880182002
Test score: 0.770369824665
Best params: {'fit_prior': True, 'alpha': 0.01, 'class_prior': (0.8, 0.19999999999999996)}


[Parallel(n_jobs=4)]: Done 378 out of 378 | elapsed:   11.4s finished


In [7]:
param_grid = {
    'alpha': [0.01, .5, 1., 1.5, 2, 2.5, 5],
    'binarize': [0, 1, 5, 10, 50, 100],
    'fit_prior': [True, False],
    'class_prior': [(i/10, 1-i/10) for i in range(1,10)],
}

search = run_search(BernoulliNB(), param_grid, n_jobs=4, norm=False)

Fitting 3 folds for each of 756 candidates, totalling 2268 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   10.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   26.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   49.2s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 2268 out of 2268 | elapsed:  2.2min finished


Train score: 0.787095551062
Test score: 0.780462649182
Best params: {'binarize': 10, 'fit_prior': True, 'alpha': 0.01, 'class_prior': (0.8, 0.19999999999999996)}


In [31]:
param_grid = {
    'criterion': ['gini'],
    'min_samples_split': [6, 8, 10, 12, 14],
    'min_samples_leaf': [6, 8, 10, 12, 14],
    'max_features': ['sqrt', 'log2', None],
    'min_impurity_decrease': [.1, .01, .001, .0001, .00001, .000001]
}

search = run_search(DecisionTreeClassifier(), param_grid, n_jobs=4)

Fitting 3 folds for each of 450 candidates, totalling 1350 fits


[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:   18.1s
[Parallel(n_jobs=4)]: Done 876 tasks      | elapsed:   37.4s
[Parallel(n_jobs=4)]: Done 1350 out of 1350 | elapsed:  1.2min finished


Train score: 0.86716380182
Test score: 0.85243848534
Best params: {'min_samples_split': 8, 'max_features': None, 'criterion': 'gini', 'min_impurity_decrease': 0.0001, 'min_samples_leaf': 14}


In [14]:
# Takes a looooong time
param_grid = {
    
}

search = run_search(KNeighborsClassifier(), param_grid, n_jobs=4)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:  1.5min finished


Train score: 0.881256319515
Test score: 0.829527036982
Best params: {}


In [18]:
param_grid = {
    'max_iter': [100, 200, 500, 1000, 2000],
    'dual': [True, False],
}

search = run_search(LinearSVC(), param_grid, n_jobs=4)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   16.0s finished


Train score: 0.851049039434
Test score: 0.84566082216
Best params: {'dual': True, 'max_iter': 200}


In [21]:
param_grid = {
    'n_estimators': [5, 10, 20, 50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 8, 10, 12],
    'min_samples_leaf': [2, 8, 10, 12],
    'min_impurity_decrease': [0., .1, .01, .001, .0001, .00001]
}

search = run_search(RandomForestClassifier(), param_grid, n_jobs=4)

Fitting 3 folds for each of 1152 candidates, totalling 3456 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   32.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  3.7min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  5.3min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed: 10.5min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed: 17.2min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed: 21.9min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed: 28.9min
[Parallel(n_jobs=4)]: Done 3456 out of 3456 | elapsed: 31.5min finished


Train score: 0.896265166835
Test score: 0.856121997937
Best params: {'criterion': 'gini', 'n_estimators': 200, 'min_impurity_decrease': 0.0, 'min_samples_split': 8, 'min_samples_leaf': 2}


In [22]:
param_grid = {
    'n_estimators': [200],
    'criterion': ['gini'],
    'min_samples_split': [8, 10, 12],
    'min_samples_leaf': [2, 8, 10],
    'min_impurity_decrease': [0., .1, .001, .0001]
}

search = run_search(ExtraTreesClassifier(), param_grid, n_jobs=4)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 108 out of 108 | elapsed:  3.3min finished


Train score: 0.872345803842
Test score: 0.844187417121
Best params: {'criterion': 'gini', 'n_estimators': 200, 'min_impurity_decrease': 0.0, 'min_samples_split': 8, 'min_samples_leaf': 2}


In [30]:
param_grid = {
    'penalty': ['l2'],
    'dual': [True, False],
    'fit_intercept': [True, False],
}

search = run_search(LogisticRegression(), param_grid, n_jobs=4)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=4)]: Done  12 out of  12 | elapsed:    2.5s finished


Train score: 0.851049039434
Test score: 0.844703108885
Best params: {'fit_intercept': True, 'penalty': 'l2', 'dual': False}


In [35]:
param_grid = {
    'hidden_layer_sizes': [(100, 100)],
    'alpha': [.1, .01, .001]
}

search = run_search(MLPClassifier(), param_grid, n_jobs=4)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed:  3.0min finished


Train score: 0.868996461072
Test score: 0.845439811404
Best params: {'alpha': 0.1, 'hidden_layer_sizes': (100, 100)}
