In [47]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
#import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import explore_r as ex
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
df = pd.read_csv('prepped_data.csv')

In [3]:
df = df.drop(columns=['date', 'day_of_week', 'start_time','home_score',
       'home_wins', 'away_score', 'away_wins','total_scores'])
df['spread'] = abs(df['spread'])

In [4]:
df.head()

Unnamed: 0,week_num,stadium,temp,humidity,wind,spread,ou,is_under,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
0,19,State Farm Stadium,72,0,0,1.0,51.0,0,0,1,1,0,0
1,19,Lincoln Financial Field,52,48,14,2.5,45.5,1,0,1,1,0,1
2,19,GEHA Field at Arrowhead Stadium,22,55,13,1.5,48.0,1,0,1,1,0,1
3,19,Highmark Stadium,32,10,0,5.5,49.0,1,0,1,1,1,1
4,19,Levi's Stadium,55,47,19,4.0,47.0,1,0,1,1,0,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10811 entries, 0 to 10810
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   week_num              10811 non-null  int64  
 1   stadium               10811 non-null  object 
 2   temp                  10811 non-null  int64  
 3   humidity              10811 non-null  int64  
 4   wind                  10811 non-null  int64  
 5   spread                10811 non-null  float64
 6   ou                    10811 non-null  float64
 7   is_under              10811 non-null  int64  
 8   abnormal_start        10811 non-null  int64  
 9   is_playoff            10811 non-null  int64  
 10  playoff_implications  10811 non-null  int64  
 11  is_turf               10811 non-null  int64  
 12  is_outdoor            10811 non-null  int64  
dtypes: float64(2), int64(10), object(1)
memory usage: 1.1+ MB


## ESTABLISH BASELINE 50.7%

In [6]:
df.is_under.value_counts(normalize=True)

1    0.507354
0    0.492646
Name: is_under, dtype: float64

In [7]:
df['baseline'] = 1

In [8]:
baseline_accuracy = (df.baseline == df.is_under).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline accuracy: 50.74%


In [9]:
subset = df[df.is_under == 1]
baseline_recall = (subset.baseline == subset.is_under).mean()
print(f'baseline recall: {baseline_recall:.2%}')

baseline recall: 100.00%


In [10]:
subset = df[df.baseline == 1]
baseline_precision = (subset.baseline == subset.is_under).mean()
print(f'baseline precision: {baseline_precision:.2%}')

baseline precision: 50.74%


In [11]:
df.drop(columns='baseline',inplace=True)

In [12]:
df_no_stadium = df.drop(columns='stadium')

In [35]:
df_no_stadium.head()

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,is_under,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
0,19,72,0,0,1.0,51.0,0,0,1,1,0,0
1,19,52,48,14,2.5,45.5,1,0,1,1,0,1
2,19,22,55,13,1.5,48.0,1,0,1,1,0,1
3,19,32,10,0,5.5,49.0,1,0,1,1,1,1
4,19,55,47,19,4.0,47.0,1,0,1,1,0,1


In [13]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df_no_stadium,'is_under')

In [14]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 11), (6471,), (2394, 11), (2394,), (1946, 11), (1946,))

In [15]:
X_train.head()

Unnamed: 0,week_num,temp,humidity,wind,spread,ou,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
1713,12,72,0,0,4.0,49.0,0,0,1,1,0
2164,17,72,0,0,7.5,38.0,0,0,1,0,0
2554,9,72,0,0,1.0,42.5,1,0,0,0,0
3117,6,58,52,10,12.5,40.0,0,0,0,0,1
9819,13,51,49,14,5.0,39.5,0,0,1,0,1


# MODELING

## DTC VANILLA

In [16]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier()

In [17]:
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)

In [18]:
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3180,20
1,0,3271


In [19]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.9969
Accuracy-Validate 0.5092
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3180
           1       1.00      0.99      1.00      3291

    accuracy                           1.00      6471
   macro avg       1.00      1.00      1.00      6471
weighted avg       1.00      1.00      1.00      6471

              precision    recall  f1-score   support

           0       0.50      0.50      0.50      1178
           1       0.52      0.51      0.52      1216

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394



## DTC maxDepth = X

In [20]:
# create the Decision Tree Classifier model 
dtc = DecisionTreeClassifier(max_depth=5,min_samples_leaf=5)

In [21]:
# fit the model to the TRAIN dataset:
dtc.fit(X_train, y_train)

In [22]:
# use the model by calling for the predictions made via the TRAIN dataset
dtc_preds = dtc.predict(X_train)
pd.crosstab(dtc_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1604,1365
1,1576,1926


In [23]:
print(f'Accuracy-Train {round(dtc.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(dtc.score(X_validate,y_validate),4)}')
print(classification_report(y_train,dtc_preds))
print(classification_report(y_validate,dtc.predict(X_validate)))

Accuracy-Train 0.5455
Accuracy-Validate 0.5029
              precision    recall  f1-score   support

           0       0.54      0.50      0.52      3180
           1       0.55      0.59      0.57      3291

    accuracy                           0.55      6471
   macro avg       0.55      0.54      0.54      6471
weighted avg       0.55      0.55      0.54      6471

              precision    recall  f1-score   support

           0       0.49      0.46      0.48      1178
           1       0.51      0.55      0.53      1216

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



## DTC grid_search CROSS_VALIDATION k=5

In [24]:
cross_val_score(dtc, X_train, y_train, cv=10)

array([0.50308642, 0.52550232, 0.46367852, 0.47449768, 0.50695518,
       0.51931994, 0.51931994, 0.51468315, 0.51468315, 0.5007728 ])

In [25]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'min_samples_leaf': [1, 5, 10, 20],
    'criterion': ['gini', 'entropy'],
}

In [26]:
gr_search = GridSearchCV(DecisionTreeClassifier(),
                      param_grid)

In [27]:
gr_search

In [28]:
gr_search.fit(X_train, y_train)

In [29]:
results = gr_search.cv_results_

In [30]:
results_df_init = pd.DataFrame(results)

In [31]:
results_df_init.shape

(40, 16)

In [32]:
params = pd.DataFrame(results['params'])

In [33]:
params

Unnamed: 0,criterion,max_depth,min_samples_leaf
0,gini,,1
1,gini,,5
2,gini,,10
3,gini,,20
4,gini,18.0,1
5,gini,18.0,5
6,gini,18.0,10
7,gini,18.0,20
8,gini,12.0,1
9,gini,12.0,5


In [36]:
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
37,0.004845,3e-05,0.000458,5e-06,entropy,5.0,5,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.491892,0.510046,0.522411,0.520093,0.508501,0.510589,0.01081,1
38,0.004828,4.2e-05,0.000459,9e-06,entropy,5.0,10,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.492664,0.504637,0.520866,0.51932,0.507728,0.509043,0.010342,2
17,0.004728,3.5e-05,0.000456,6e-06,gini,5.0,5,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.494981,0.506955,0.510046,0.520093,0.510046,0.508424,0.008054,3
2,0.010684,0.000396,0.000544,3e-06,gini,,10,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.503475,0.49459,0.495363,0.515456,0.53323,0.508423,0.0145,4
36,0.004808,5.1e-05,0.000457,6e-06,entropy,5.0,1,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.491892,0.506955,0.523184,0.520093,0.497682,0.507961,0.012197,5


## RFC grid_search CROSS_VALIDATION k=5

In [37]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'n_estimators': [55, 101, 201],
    'criterion': ['gini', 'entropy'],
}
gr_search = GridSearchCV(RandomForestClassifier(),
                      param_grid)

In [39]:
gr_search

In [40]:
gr_search.fit(X_train, y_train)

In [41]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

In [46]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
27,0.082785,0.00151,0.003956,4.2e-05,entropy,5,55,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.543629,0.509274,0.534003,0.510819,0.545595,0.528664,0.015707,1
29,0.299151,0.003616,0.01308,0.000357,entropy,5,201,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.530502,0.520866,0.515456,0.520093,0.541731,0.525729,0.009379,2
12,0.076387,0.000545,0.003898,5.8e-05,gini,5,55,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.539768,0.518547,0.516229,0.503864,0.542504,0.524182,0.014741,3
13,0.138785,0.000758,0.006663,0.000122,gini,5,101,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.535907,0.507728,0.522411,0.527048,0.527048,0.524028,0.009248,4
9,0.118859,0.000543,0.005737,0.000115,gini,10,55,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.514286,0.523957,0.516229,0.523184,0.540958,0.523723,0.009409,5


## NBC grid_search CROSS_VALIDATION k=5

In [48]:
param_grid = {
    'var_smoothing': [.000000001,.000000002,.000000003]
}
gr_search = GridSearchCV(GaussianNB(),
                      param_grid)

In [49]:
gr_search

In [50]:
gr_search.fit(X_train, y_train)

In [51]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(3, 14)

In [52]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003311,0.000968,0.001571,0.000277,0.0,{'var_smoothing': 1e-09},0.51583,0.503091,0.522411,0.512365,0.532457,0.517231,0.009841,1
1,0.002139,0.000155,0.001109,0.000119,0.0,{'var_smoothing': 2e-09},0.51583,0.503091,0.522411,0.512365,0.532457,0.517231,0.009841,1
2,0.00172,0.000127,0.001021,0.000305,0.0,{'var_smoothing': 3e-09},0.51583,0.503091,0.522411,0.512365,0.532457,0.517231,0.009841,1


## GBC grid_search CROSS_VALIDATION k=5

In [53]:
param_grid = {
    'learning_rate': [0.1,0.2,0.5,1.0, 5.0],
    'n_estimators': [55, 101, 201],
    'min_samples_leaf': [1,5,10,20],
}
gr_search = GridSearchCV(GradientBoostingClassifier(),
                      param_grid)

In [54]:
gr_search

In [55]:
gr_search.fit(X_train, y_train)

In [56]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(60, 16)

In [57]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.158427,0.000409,0.00112,3e-05,0.1,5,55,"{'learning_rate': 0.1, 'min_samples_leaf': 5, ...",0.528958,0.527821,0.512365,0.509274,0.532457,0.522175,0.009448,1
0,0.165198,0.014771,0.00113,8.9e-05,0.1,1,55,"{'learning_rate': 0.1, 'min_samples_leaf': 1, ...",0.520463,0.522411,0.510819,0.51391,0.529366,0.519394,0.006531,2
15,0.157105,0.000467,0.001115,1.2e-05,0.2,5,55,"{'learning_rate': 0.2, 'min_samples_leaf': 5, ...",0.500386,0.531685,0.517774,0.518547,0.52473,0.518624,0.010404,3
4,0.290151,0.000481,0.001572,2.6e-05,0.1,5,101,"{'learning_rate': 0.1, 'min_samples_leaf': 5, ...",0.52278,0.522411,0.51391,0.517774,0.51391,0.518157,0.003891,4
7,0.287086,0.000218,0.001618,4.6e-05,0.1,10,101,"{'learning_rate': 0.1, 'min_samples_leaf': 10,...",0.52973,0.51932,0.513138,0.502318,0.521638,0.517229,0.009156,5


## CATboost grid_search CROSS_VALIDATION k=5

In [58]:
param_grid = {
    'verbose': [False],
    'depth': [5, 10, 12,15],
    'learning_rate': [None,0.1, 0.3,0.7,1.0],
}
gr_search = GridSearchCV(CatBoostClassifier(),
                      param_grid)

In [59]:
# gr_search

In [None]:
# gr_search.fit(X_train, y_train)

In [None]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

In [None]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

## SCALE for Logistic regression, MLP, etc.

In [61]:
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_validate_scaled = sc_X.transform(X_validate)
X_test_scaled = sc_X.transform(X_test)

In [68]:
mlp = MLPClassifier(hidden_layer_sizes=(1024,512,128),activation="relu",random_state=2013,
                    batch_size=200,solver='adam')
mlp.fit(X_train_scaled, y_train)
mlp_preds = mlp.predict(X_train_scaled)
print(mlp.score(X_train_scaled, y_train))
pd.crosstab(mlp_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.9313861845155308


is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3025,289
1,155,3002


In [69]:
mlp.n_layers_

5

In [70]:
print(f'Accuracy-Train {round(mlp.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(mlp.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,mlp_preds))
print(classification_report(y_validate,mlp.predict(X_validate_scaled)))

Accuracy-Train 0.9314
Accuracy-Validate 0.5004
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      3180
           1       0.95      0.91      0.93      3291

    accuracy                           0.93      6471
   macro avg       0.93      0.93      0.93      6471
weighted avg       0.93      0.93      0.93      6471

              precision    recall  f1-score   support

           0       0.49      0.53      0.51      1178
           1       0.51      0.47      0.49      1216

    accuracy                           0.50      2394
   macro avg       0.50      0.50      0.50      2394
weighted avg       0.50      0.50      0.50      2394



## KNN grid_search CROSS_VALIDATION k=5

In [122]:
param_grid = {
    'n_neighbors': [70],
    'weights': ['uniform','distance'],
    'algorithm':['ball_tree','brute'],
    'leaf_size': [3,4,5,6,7,8,9],
}
gr_search = GridSearchCV(KNeighborsClassifier(),
                      param_grid)

In [123]:
gr_search

In [124]:
gr_search.fit(X_train, y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [125]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(28, 17)

In [127]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head(11)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
27,0.000792,1.6e-05,0.024601,0.000388,brute,9,70,distance,"{'algorithm': 'brute', 'leaf_size': 9, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
25,0.001014,0.000441,0.02484,0.00056,brute,8,70,distance,"{'algorithm': 'brute', 'leaf_size': 8, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
23,0.000789,6e-06,0.024214,0.000874,brute,7,70,distance,"{'algorithm': 'brute', 'leaf_size': 7, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
21,0.000792,2e-05,0.024356,0.001268,brute,6,70,distance,"{'algorithm': 'brute', 'leaf_size': 6, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
19,0.000809,3.2e-05,0.023858,0.000577,brute,5,70,distance,"{'algorithm': 'brute', 'leaf_size': 5, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
17,0.000793,1.5e-05,0.023798,0.000604,brute,4,70,distance,"{'algorithm': 'brute', 'leaf_size': 4, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
15,0.000807,2.1e-05,0.024903,0.000383,brute,3,70,distance,"{'algorithm': 'brute', 'leaf_size': 3, 'n_neig...",0.535135,0.510819,0.509274,0.545595,0.53323,0.526811,0.014329,1
1,0.002291,0.00014,0.045472,0.000787,ball_tree,3,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 3, 'n_...",0.534363,0.510819,0.510046,0.544822,0.531685,0.526347,0.013718,8
3,0.002246,3.6e-05,0.0466,0.000619,ball_tree,4,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 4, 'n_...",0.534363,0.510819,0.510046,0.544822,0.531685,0.526347,0.013718,8
5,0.002229,2.2e-05,0.046048,0.000577,ball_tree,5,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 5, 'n_...",0.534363,0.510819,0.510046,0.544822,0.531685,0.526347,0.013718,8


## LOG grid_search CROSS_VALIDATION k=5

In [128]:
param_grid = {
    'penalty': ['none','l1','l2','elasticnet'],
    'class_weight': ['balanced',None],
    'max_iter':[100,50,200]
}
gr_search = GridSearchCV(LogisticRegression(),
                      param_grid)

In [129]:
gr_search

In [130]:
gr_search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [131]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(24, 16)

In [132]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head(11)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_iter,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
22,0.06594,0.008926,0.001243,0.000369,,200,l2,"{'class_weight': None, 'max_iter': 200, 'penal...",0.511969,0.517774,0.528594,0.510046,0.527821,0.519241,0.007754,1
20,0.067601,0.006042,0.000941,0.000561,,200,none,"{'class_weight': None, 'max_iter': 200, 'penal...",0.517375,0.51932,0.521638,0.513138,0.520866,0.518467,0.003038,2
16,0.018389,0.005139,0.000915,0.000512,,50,none,"{'class_weight': None, 'max_iter': 50, 'penalt...",0.515058,0.509274,0.517002,0.512365,0.530912,0.516922,0.007463,3
10,0.07249,0.020006,0.001078,0.000446,balanced,200,l2,"{'class_weight': 'balanced', 'max_iter': 200, ...",0.518919,0.502318,0.523957,0.518547,0.51932,0.516612,0.007412,4
4,0.016331,0.001114,0.000809,0.000317,balanced,50,none,"{'class_weight': 'balanced', 'max_iter': 50, '...",0.515058,0.514683,0.517774,0.516229,0.517774,0.516304,0.001305,5
18,0.015036,0.00122,0.000774,0.000355,,50,l2,"{'class_weight': None, 'max_iter': 50, 'penalt...",0.511969,0.506955,0.517774,0.513138,0.530912,0.51615,0.008144,6
0,0.038746,0.016653,0.00146,0.000944,balanced,100,none,"{'class_weight': 'balanced', 'max_iter': 100, ...",0.522008,0.503091,0.513138,0.513138,0.528594,0.515994,0.008692,7
8,0.063947,0.00921,0.000801,0.000379,balanced,200,none,"{'class_weight': 'balanced', 'max_iter': 200, ...",0.514286,0.504637,0.520093,0.522411,0.513138,0.514913,0.0062,8
14,0.030316,0.003389,0.000822,0.000402,,100,l2,"{'class_weight': None, 'max_iter': 100, 'penal...",0.51583,0.504637,0.516229,0.514683,0.51932,0.51414,0.004993,9
12,0.034119,0.0076,0.001429,0.001148,,100,none,"{'class_weight': None, 'max_iter': 100, 'penal...",0.515058,0.503864,0.51932,0.510046,0.520866,0.513831,0.006242,10


## SVM grid_search CROSS_VALIDATION k=5

In [137]:
param_grid = {
    'shrinking': [True,False],
    'C':[1,5,10]
}
gr_search = GridSearchCV(SVC(),
                      param_grid)

In [138]:
gr_search

In [139]:
gr_search.fit(X_train, y_train)

In [140]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(6, 15)

In [141]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head(11)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_shrinking,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.412906,0.007322,0.21903,0.001073,10,True,"{'C': 10, 'shrinking': True}",0.528185,0.508501,0.525502,0.502318,0.544822,0.521866,0.015106,1
5,0.415259,0.005162,0.218586,0.001055,10,False,"{'C': 10, 'shrinking': False}",0.528185,0.508501,0.525502,0.502318,0.544049,0.521711,0.014872,2
2,0.402128,0.002019,0.219436,0.000844,5,True,"{'C': 5, 'shrinking': True}",0.532046,0.500773,0.523184,0.5,0.539413,0.519083,0.01611,3
3,0.403469,0.001924,0.218835,0.001024,5,False,"{'C': 5, 'shrinking': False}",0.532046,0.500773,0.523184,0.5,0.539413,0.519083,0.01611,3
0,0.403897,0.012204,0.220637,0.000745,1,True,"{'C': 1, 'shrinking': True}",0.528958,0.495363,0.504637,0.506955,0.539413,0.515065,0.016422,5
1,0.396335,0.000644,0.220261,0.001244,1,False,"{'C': 1, 'shrinking': False}",0.528958,0.495363,0.504637,0.506955,0.539413,0.515065,0.016422,5
