In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import *
import numpy as np
from itertools import combinations
from sklearn.model_selection import PredefinedSplit
pd.set_option('display.max_rows', None)

In [2]:
# Initialize the estimators.
# A single seed is shared by every estimator that accepts random_state,
# so repeated runs of the notebook build identical models.
SEED = 42

# Tree-based and linear models.
clf1 = RandomForestClassifier(random_state=SEED)
clf3 = LogisticRegression(random_state=SEED)
clf4 = DecisionTreeClassifier(random_state=SEED)
clf7 = GradientBoostingClassifier(random_state=SEED)

# Support-vector machines, one per kernel.
clf2 = SVC(kernel='sigmoid', probability=True, random_state=SEED)
clf9 = SVC(kernel='poly', probability=True, random_state=SEED)
clf10 = SVC(kernel='rbf', probability=True, random_state=SEED)

# Estimators constructed without a seed argument.
clf5 = KNeighborsClassifier()
clf6 = MultinomialNB()

# Multi-layer perceptron with a raised iteration cap.
clf8 = MLPClassifier(max_iter=2000, random_state=SEED)


In [3]:
class ColumnExtractor(object):
    """Pipeline step that keeps only a chosen subset of feature columns.

    Exposing ``cols`` through ``get_params``/``set_params`` lets a
    hyperparameter search treat the feature subset as just another
    parameter (e.g. ``feat_select__cols`` in a ``Pipeline``).

    Parameters
    ----------
    cols : tuple, list, array, int or None, default=None
        Positional column indices to keep. ``None`` means "keep every
        column", so a freshly constructed extractor is a pass-through.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, X, y=None):
        """No-op fit: the selection is fully described by ``cols``."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns listed in ``self.cols``.

        Works for both NumPy arrays and pandas DataFrames; columns are
        always addressed by position, never by label.
        """
        if self.cols is None:
            # Indexing with None would insert a new axis instead of
            # selecting columns, so treat it as "no selection".
            return X

        # A tuple inside ``.iloc[:, ...]`` is not a valid positional indexer,
        # so normalise tuples to lists (equivalent for NumPy indexing).
        cols = list(self.cols) if isinstance(self.cols, tuple) else self.cols

        if hasattr(X, "iloc"):
            # pandas objects do not support ``X[:, cols]``.
            return X.iloc[:, cols]
        return X[:, cols]

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict."""
        return {"cols": self.cols}

    def set_params(self, **parameters):
        """Set parameters in place and return ``self`` so calls can be chained."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [4]:
# Initialize the hyperparameter grid for each classifier.
# Keys follow the Pipeline convention "<step name>__<parameter name>".

# Number of input columns the feature-subset search ranges over.
N_FEATURES = 4

# Every non-empty subset of the column positions 0..N_FEATURES-1, smallest
# subsets first (15 subsets for 4 columns). Plain Python ints are used so
# the tuples print cleanly in cv_results_.
FEATURE_SUBSETS = [
    subset
    for size in range(1, N_FEATURES + 1)
    for subset in combinations(range(N_FEATURES), size)
]

# Random forest.
param1 = {}
param1['feat_select__cols'] = list(FEATURE_SUBSETS)
param1['classifier__n_estimators'] = [10, 20, 30, 40]
param1['classifier__max_depth'] = [2, 3, 4]
param1['classifier__class_weight'] = [None, 'balanced']

# Support-vector machines. This grid is reused for every SVC kernel;
# `degree` only influences the polynomial kernel and is ignored by the others.
param2 = {}
param2['feat_select__cols'] = list(FEATURE_SUBSETS)
param2['classifier__C'] = [10**-2, 10**-1, 10**0, 10**1, 10**2]
param2['classifier__class_weight'] = [None, 'balanced']
param2['classifier__degree'] = [2, 3]

# Logistic regression. The default 'lbfgs' solver rejects the l1 penalty,
# so pin a solver that supports both l1 and l2; otherwise every l1
# candidate fails to fit.
param3 = {}
param3['feat_select__cols'] = list(FEATURE_SUBSETS)
param3['classifier__C'] = [10**-2, 10**-1, 10**0, 10**1, 10**2]
param3['classifier__penalty'] = ['l1', 'l2']
param3['classifier__solver'] = ['liblinear']
param3['classifier__class_weight'] = [None, 'balanced']

# Decision tree.
param4 = {}
param4['feat_select__cols'] = list(FEATURE_SUBSETS)
param4['classifier__max_depth'] = [2, 3, 4, None]
param4['classifier__min_samples_split'] = [2, 3, 4]
param4['classifier__class_weight'] = [None, 'balanced']

# k-nearest neighbours.
param5 = {}
param5['feat_select__cols'] = list(FEATURE_SUBSETS)
param5['classifier__n_neighbors'] = [2, 3, 4, 5]

# Multinomial naive Bayes.
param6 = {}
param6['feat_select__cols'] = list(FEATURE_SUBSETS)
param6['classifier__alpha'] = [10**0, 10**1, 10**2]

# Gradient boosting.
param7 = {}
param7['feat_select__cols'] = list(FEATURE_SUBSETS)
param7['classifier__n_estimators'] = [10, 20, 30, 40]
param7['classifier__max_depth'] = [2, 3, 4]

# Multi-layer perceptron. Note the trailing comma: (10) is just the int 10,
# whereas (10,) is the intended one-hidden-layer tuple.
param8 = {}
param8['feat_select__cols'] = list(FEATURE_SUBSETS)
param8['classifier__hidden_layer_sizes'] = [(10,), (10, 10), (10, 10, 10)]
param8['classifier__solver'] = ['adam', 'sgd']
param8['classifier__alpha'] = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]

In [5]:
def _with_column_selector(classifier):
    """Build a two-step pipeline: a fresh column selector, then ``classifier``."""
    steps = [
        ('feat_select', ColumnExtractor()),
        ('classifier', classifier),
    ]
    return Pipeline(steps)


# One pipeline per estimator; the numbering mirrors clf1..clf10.
pipeline1 = _with_column_selector(clf1)
pipeline2 = _with_column_selector(clf2)
pipeline3 = _with_column_selector(clf3)
pipeline4 = _with_column_selector(clf4)
pipeline5 = _with_column_selector(clf5)
pipeline6 = _with_column_selector(clf6)
pipeline7 = _with_column_selector(clf7)
pipeline8 = _with_column_selector(clf8)
pipeline9 = _with_column_selector(clf9)
pipeline10 = _with_column_selector(clf10)


In [177]:
# Load the pre-split feature tables and label files from rootPath.
rootPath = "./"
X_train = pd.read_csv(rootPath + "X_train_tenengrad.csv")
X_test = pd.read_csv(rootPath + "X_test_tenengrad.csv")
# Flatten each label file into a 1-D array. The training line previously
# read reshape(-a1), which raised NameError because `a1` is undefined.
y_train = pd.read_csv(rootPath + "y_train_tenengrad.csv").values.reshape(-1)
y_test = pd.read_csv(rootPath + "y_test_tenengrad.csv").values.reshape(-1)

In [178]:
## Make binary labels
def _as_binary(labels):
    """Return a copy of ``labels`` in which every value above zero is set to 1."""
    binary = np.array(labels)  # copy, so the original labels stay untouched
    binary[binary > 0] = 1
    return binary


y_train1 = _as_binary(y_train)

y_test1 = _as_binary(y_test)

In [None]:
# Fold assignment for PredefinedSplit: -1 marks a sample that is never used
# for validation (the training rows), 0 puts a sample in the single
# validation fold (the test rows).
indices = np.zeros(len(y_train1) + len(y_test1))
indices[:len(y_train1)] = -1

# Stack train on top of test so row order matches `indices`.
# DataFrame.append was removed in pandas 2.0; pd.concat with
# ignore_index=True yields the same frame with a fresh 0..n-1 index.
Data_X = pd.concat([X_train, X_test], ignore_index=True)
Data_Y = np.hstack([y_train1, y_test1])

ps = PredefinedSplit(indices)
# Display name -> scikit-learn scorer. Note that "Accuracy" is the
# *balanced* accuracy scorer, not plain accuracy.
score = { "Accuracy":"balanced_accuracy" ,"Recall":"recall","Precision":"precision","F1_Score":"f1"}

In [20]:
def _make_search(pipeline, params):
    """Create an exhaustive search over ``params`` for ``pipeline``.

    All searches share the predefined split ``ps`` and the multi-metric
    ``score`` mapping; the winning candidate is chosen (and refitted) by
    the "F1_Score" metric.
    """
    return GridSearchCV(
        pipeline,
        param_grid=[params],
        scoring=score,
        refit="F1_Score",
        cv=ps,
        n_jobs=-1,
    )


grid1 = _make_search(pipeline1, param1)
grid2 = _make_search(pipeline2, param2)
grid3 = _make_search(pipeline3, param3)
grid4 = _make_search(pipeline4, param4)
grid5 = _make_search(pipeline5, param5)
grid6 = _make_search(pipeline6, param6)
grid7 = _make_search(pipeline7, param7)
grid8 = _make_search(pipeline8, param8)
# The remaining two SVC kernels reuse the SVC grid (param2).
grid9 = _make_search(pipeline9, param2)
grid10 = _make_search(pipeline10, param2)

In [21]:
# All searches in classifier order, so they can be fitted and inspected by position.
grids = [
    grid1, grid2, grid3, grid4, grid5,
    grid6, grid7, grid8, grid9, grid10,
]

In [None]:
# Run every search on the stacked data; the cv object attached to each
# search decides which rows train and which rows validate.
for search in grids:
    search.fit(Data_X, Data_Y)

In [None]:
# Choose which fitted search to inspect (position 0 is the first entry of `grids`).
grid = grids[0]
best_model = grid.best_estimator_
print(best_model)
# For the winner's full score row: pd.DataFrame(grid.cv_results_).iloc[grid.best_index_]

In [221]:
# Reduce cv_results_ to the searched parameters plus the per-split scores:
# drop timing columns, the raw params dict, and the mean/std/rank aggregates.
timing_and_params = ["mean_fit_time", "std_fit_time", "mean_score_time", "std_score_time", "params"]
aggregate_scores = [
    f"{stat}_test_{metric}"
    for metric in ("Accuracy", "Recall", "Precision", "F1_Score")
    for stat in ("mean", "std", "rank")
]
df = pd.DataFrame(grid.cv_results_).drop(columns=timing_and_params + aggregate_scores)

In [223]:
# Combined ranking key: the four validation-fold metrics added together,
# in the same left-to-right order as a plain `+` chain.
df["sum"] = (
    df["split0_test_Accuracy"]
    .add(df["split0_test_Recall"])
    .add(df["split0_test_Precision"])
    .add(df["split0_test_F1_Score"])
)

In [224]:
# Ten candidates with the largest combined score, best first:
# sort ascending, keep the last ten rows, then reverse them.
ranked_by_sum = df.sort_values("sum")
ranked_by_sum.tail(10).iloc[::-1]

Unnamed: 0,param_classifier__C,param_classifier__class_weight,param_classifier__degree,param_feat_select__cols,split0_test_Accuracy,split0_test_Recall,split0_test_Precision,split0_test_F1_Score,sum
32,0.01,balanced,2,"(2,)",0.732143,0.892857,0.892857,0.892857,3.410714
39,0.01,balanced,2,"(2, 3)",0.732143,0.892857,0.892857,0.892857,3.410714
159,1.0,balanced,2,"(2, 3)",0.732143,0.892857,0.892857,0.892857,3.410714
272,100.0,balanced,2,"(2,)",0.732143,0.892857,0.892857,0.892857,3.410714
92,0.1,balanced,2,"(2,)",0.732143,0.892857,0.892857,0.892857,3.410714
152,1.0,balanced,2,"(2,)",0.732143,0.892857,0.892857,0.892857,3.410714
212,10.0,balanced,2,"(2,)",0.732143,0.892857,0.892857,0.892857,3.410714
102,0.1,balanced,2,"(0, 2, 3)",0.732143,0.892857,0.892857,0.892857,3.410714
171,1.0,balanced,3,"(0, 3)",0.678571,0.928571,0.866667,0.896552,3.370361
44,0.01,balanced,2,"(0, 1, 2, 3)",0.678571,0.928571,0.866667,0.896552,3.370361
