In [1]:
import pandas as pd
import numpy as np
import helper

In [2]:
# Data loading: where the dataset lives on disk.
folder_path = "datasets"
data_file = "train.csv"

# Multiclass datasets: the loader is called once per subset, in the order
# 'train' then 'test', always with the 'multiclass' mode.
multimodel_train_set, multimodel_test_set = (
    helper.data_loader(folder_path, data_file, subset, 'multiclass')
    for subset in ('train', 'test')
)

# Each loaded set unpacks into a pair (named here as features, labels).
x_train, y_train = multimodel_train_set
x_test, y_test = multimodel_test_set

# Fit helper.data_pipeline on the training features and keep the transformed result.
x_prepared = helper.data_pipeline.fit_transform(x_train)

In [3]:
#Selecting a few models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

# Fixed seed for the stochastic estimators (SGD and random forest) so that
# re-running the grid search on a fresh kernel ranks the candidates the same way.
RANDOM_STATE = 42

# Candidate estimators, keyed by the name shown in the "Chosen Model" column
# of the results table.
models = {
    'KNN' : KNeighborsClassifier(),
    'Naive Bayes' : MultinomialNB(),
    'SGD Classifier' : SGDClassifier(random_state = RANDOM_STATE),
    'Random Forest' : RandomForestClassifier(random_state = RANDOM_STATE)
}

# Hyper-parameter grids to search, keyed by the same names as `models`.
# NOTE(review): the SGD grid is wrapped in a list while the others are plain
# dicts; kept as-is because how helper.multiModelSelection consumes it is not
# visible here.
parameters = {
    'KNN' : {'weights' : ['uniform', 'distance'], 'n_neighbors' : [3,5,7]},
    'Naive Bayes' : {'alpha' : [1]},
    'SGD Classifier' : [{'penalty': ['l2', 'l1', 'elasticnet'], 'fit_intercept': [False, True]}],
    'Random Forest' : {'n_estimators': [1, 10, 50, 100, 200]}
}

In [4]:
# Hand the candidate estimators and their hyper-parameter grids to
# helper.multiModelSelection; the resulting object is fitted in the next cell.
model_eval = helper.multiModelSelection(models, parameters)

In [5]:
# Run the search over every candidate on the prepared training data,
# with cv=5, two parallel jobs and accuracy as the scoring metric.
model_eval.fit(
    x_prepared,
    y_train,
    cv=5,
    n_jobs=2,
    scoring='accuracy',
)

Gridsearch for KNN model
Gridsearch for Naive Bayes model
Gridsearch for SGD Classifier model
Gridsearch for Random Forest model
Done.


In [6]:
# Show the search results table; as the last expression of the cell, its
# return value is what the notebook renders as output.
model_eval.display_results()

Unnamed: 0,Chosen Model,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,param_alpha,param_fit_intercept,param_penalty,param_n_estimators
0,Random Forest,3.646635,0.316673,0.061249,0.019067,,,{'n_estimators': 200},0.937333,0.946667,0.94,0.94,0.928,0.9384,0.006046,,,,200.0
1,Random Forest,2.27029,0.387274,0.054358,0.02013,,,{'n_estimators': 100},0.930667,0.936,0.944,0.94,0.918667,0.933867,0.008788,,,,100.0
2,Random Forest,0.736632,0.011608,0.016612,0.002301,,,{'n_estimators': 50},0.933333,0.925333,0.933333,0.925333,0.916,0.926667,0.006422,,,,50.0
3,KNN,0.877701,0.141393,5.027279,0.160144,3.0,distance,"{'n_neighbors': 3, 'weights': 'distance'}",0.924,0.94,0.933333,0.918667,0.912,0.9256,0.010028,,,,
4,KNN,1.212765,0.171859,5.515242,0.182587,7.0,distance,"{'n_neighbors': 7, 'weights': 'distance'}",0.924,0.945333,0.933333,0.917333,0.902667,0.924533,0.014425,,,,
5,KNN,0.893986,0.231028,5.223886,0.135379,5.0,distance,"{'n_neighbors': 5, 'weights': 'distance'}",0.928,0.942667,0.936,0.913333,0.902667,0.924533,0.014669,,,,
6,KNN,0.644862,0.061474,5.335443,0.300959,3.0,uniform,"{'n_neighbors': 3, 'weights': 'uniform'}",0.92,0.941333,0.933333,0.909333,0.908,0.9224,0.013129,,,,
7,KNN,1.293251,0.159423,5.047007,0.313891,5.0,uniform,"{'n_neighbors': 5, 'weights': 'uniform'}",0.921333,0.941333,0.932,0.910667,0.906667,0.9224,0.012938,,,,
8,KNN,0.857876,0.176352,5.450766,0.161403,7.0,uniform,"{'n_neighbors': 7, 'weights': 'uniform'}",0.918667,0.945333,0.928,0.914667,0.902667,0.921867,0.014276,,,,
9,Random Forest,0.169176,0.004171,0.006013,0.000635,,,{'n_estimators': 10},0.88,0.886667,0.882667,0.885333,0.861333,0.8792,0.009222,,,,10.0


In [10]:
#Models to be further evaluated, each refit on the full prepared training set.
# Random forest: best row of the grid-search results (n_estimators = 200).
# random_state is fixed so the persisted model can be reproduced.
r_forest = RandomForestClassifier(n_estimators = 200, random_state = 42).fit(x_prepared, y_train)
# KNN: use the best-scoring KNN configuration from the grid search
# (n_neighbors = 3, weights = 'distance', mean accuracy 0.9256). The previous
# choice (n_neighbors = 7 with default uniform weights) was the lowest-ranked
# KNN row in the results table (0.921867).
knn = KNeighborsClassifier(n_neighbors = 3, weights = 'distance').fit(x_prepared, y_train)
# SGD: NOTE(review): no SGD row appears in the displayed results, so these
# hyper-parameters could not be checked against the search; confirm.
SGD = SGDClassifier(fit_intercept= True, penalty = 'l1', random_state = 42).fit(x_prepared, y_train)

In [11]:
import joblib

# Persist each fitted model to its own pickle file (paths are relative to the
# current working directory).
# NOTE(review): 'forrest' in the first filename looks like a typo for 'forest';
# left unchanged because other code may load the file by this exact name.
joblib.dump(r_forest, 'multin_random_forrest.pkl')
joblib.dump(knn, 'multin_knn.pkl')
# Last expression of the cell: its return value is what the notebook displays.
joblib.dump(SGD, 'multin_sgd.pkl')

['multin_sgd.pkl']