In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV


#from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight

import numpy as np
import pandas as pd



# Load the training table used by every cell below. The path is relative, so the
# CSV must sit in the notebook's working directory.
# NOTE(review): presumably the already-cleaned training set, going by the file
# name — confirm where Train_clean.csv is produced.
data = pd.read_csv('Train_clean.csv')

In [2]:
class EstimatorSelectionHelper:
    """Run one ``GridSearchCV`` per estimator and tabulate the per-split test scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator instance.
    params : dict
        Maps the same display names to the parameter grid handed to
        ``GridSearchCV``. Every key of ``models`` must also be a key here.

    Raises
    ------
    ValueError
        If at least one estimator in ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        missing_params = list(set(models.keys()) - set(params.keys()))
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Fitted GridSearchCV objects, keyed by estimator name; filled by fit().
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Fit a ``GridSearchCV`` for every registered estimator.

        ``cv``, ``n_jobs``, ``verbose``, ``scoring`` and ``refit`` are forwarded
        unchanged to ``GridSearchCV``. Each fitted search is stored in
        ``self.grid_searches`` under the estimator's name. Returns ``None``.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter combination) with score statistics.

        For every candidate, the min, max, mean and standard deviation of its
        ``split<i>_test_score`` values are computed. The statistic columns come
        first, followed by one column per hyper-parameter; a parameter that does
        not apply to a row is left missing.

        Parameters
        ----------
        sort_by : str
            Column used to sort the rows in descending order.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError("No grid searches to summarize; call fit() first.")

        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k, gs in self.grid_searches.items():
            print(k)
            results = gs.cv_results_
            params = results['params']
            # Use the fold count recorded by the fitted search rather than the
            # `cv` argument: `cv` may be None or a splitter object, neither of
            # which can be passed to range().
            all_scores = np.column_stack([
                results["split{}_test_score".format(i)]
                for i in range(gs.n_splits_)
            ])
            # Each row of all_scores holds one candidate's scores across folds.
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [9]:
# Candidate classifiers, keyed by the name that appears in the score summary.
# random_state is pinned on the randomized ensembles so that re-running the
# notebook reproduces the same grid-search scores; SVC keeps its defaults.
models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(random_state=0),
    'RandomForestClassifier': RandomForestClassifier(random_state=0),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=0),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=0),
    'SVC': SVC(),
}

# Parameter grids, one entry per key of models1. A list of dicts (SVC) is
# searched as the union of the individual grids.
# NOTE(review): the second SVC grid crosses 'linear' with every gamma value, and
# the stored summary shows identical scores for linear at different gamma, so
# those candidates look redundant with the first grid — consider dropping
# 'linear' from the second grid to save fits.
params1 = {
    'ExtraTreesClassifier': {'n_estimators': [16, 32] },
    'RandomForestClassifier': {'n_estimators': [16, 32] },
    'AdaBoostClassifier':  {'n_estimators': [16, 32] },
    'GradientBoostingClassifier': {'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] },
    'SVC': [
        {'kernel': ['linear'], 'C': [1, 10, 100]},
        {'kernel': ['rbf', 'poly', 'sigmoid', 'linear'], 'C':[0.1, 1, 10, 100], 'gamma': [1,0.1,0.01,0.001]},
    ]
}

In [4]:
# Target is the 'Survived' column; the features are every other column except
# "Unnamed: 0".
y = data['Survived']
X = data.drop(columns=['Survived', 'Unnamed: 0'])

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)


In [5]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test rows do not influence the statistics.
scaler = StandardScaler().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [10]:
# Grid-search every candidate in models1 on the scaled training data, scoring
# each fold with F1 and using two parallel workers.
helper1 = EstimatorSelectionHelper(models=models1, params=params1)
helper1.fit(X=X_train_norm, y=y_train, n_jobs=2, scoring='f1')

Running GridSearchCV for ExtraTreesClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Running GridSearchCV for AdaBoostClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for GradientBoostingClassifier.
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=2)]: Done  12 out of  12 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Running GridSearchCV for SVC.
Fitting 3 folds for each of 67 candidates, totalling 201 fits


[Parallel(n_jobs=2)]: Done 183 tasks      | elapsed:    6.1s
[Parallel(n_jobs=2)]: Done 201 out of 201 | elapsed:   37.1s finished


In [14]:
# Show the first 20 candidates, ranked by their best single-fold score.
display(helper1.score_summary(sort_by='max_score').iloc[:20])

ExtraTreesClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
SVC


Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,n_estimators,learning_rate,C,kernel,gamma
33,SVC,0.747126,0.779254,0.795455,0.0227178,,,1.0,rbf,0.1
53,SVC,0.742857,0.771172,0.786127,0.0200319,,,10.0,rbf,0.01
73,SVC,0.734463,0.768374,0.786127,0.0239871,,,100.0,rbf,0.001
39,SVC,0.696133,0.748792,0.78453,0.0380202,,,1.0,sigmoid,0.01
37,SVC,0.730337,0.76077,0.78453,0.0226218,,,1.0,rbf,0.01
59,SVC,0.696133,0.748792,0.78453,0.0380202,,,10.0,sigmoid,0.001
69,SVC,0.739884,0.764808,0.782609,0.0181546,,,100.0,rbf,0.01
5,AdaBoostClassifier,0.725275,0.758183,0.782609,0.0241629,32.0,,,,
56,SVC,0.722222,0.75826,0.781609,0.0258517,,,10.0,linear,0.01
48,SVC,0.722222,0.75826,0.781609,0.0258517,,,10.0,linear,1.0


In [8]:
# Further candidate estimators, collected in a dict so the cell parses: bare
# `'name': Estimator()` lines outside a dict literal are a SyntaxError.
# NOTE(review): LinearRegression is a regressor — confirm it belongs in a
# classifier comparison before passing this dict to a grid search.
models2 = {
    'LogReg': LogisticRegression(),
    'LinReg': LinearRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(),
}