In [1]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
#import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix,balanced_accuracy_score,roc_auc_score,make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import explore_r as ex
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import xgboost as xgb

### This notebook builds upon the previous notebook (model_r_round2) and now will incorporate feature selection in addition to GridSearchCV in an attempt to find optimal features/hyperparams for each algorithm.  However, let's first add XGBoost to the algorithm lineup and see if it can help push the needle.

In [2]:
df = pd.read_csv('prepped_data_stad.csv')

In [3]:
df = df.drop(columns=['date','home_score',
       'home_wins', 'away_score', 'away_wins','total_scores'])
df['spread'] = abs(df['spread'])

In [4]:
# cols = ['day_of_week','start_time','stadium']
# # recast STADIUM,Day,start_time as 'categories' dtype:
# for col in cols:
#     X_train[col] = X_train[col].astype('category')

In [5]:
#df = pd.get_dummies(df,columns=['day_of_week','start_time','stadium'])

In [6]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df,'is_under')

In [7]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 14), (6471,), (2394, 14), (2394,), (1946, 14), (1946,))

In [8]:
X_train.head()

Unnamed: 0,day_of_week,start_time,week_num,stadium,temp,humidity,wind,spread,ou,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
1713,Sunday,1,12,Georgia Dome,72,0,0,4.0,49.0,0,0,1,1,0
2164,Sunday,1,17,NRG Stadium,72,0,0,7.5,38.0,0,0,1,0,0
2554,Sunday,8,9,Reliant Stadium,72,0,0,1.0,42.5,1,0,0,0,0
3117,Sunday,1,6,Heinz Field,58,52,10,12.5,40.0,0,0,0,0,1
9819,Sunday,1,13,Shea Stadium,51,49,14,5.0,39.5,0,0,1,0,1


In [9]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1946 entries, 4842 to 3441
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   day_of_week           1946 non-null   object 
 1   start_time            1946 non-null   int64  
 2   week_num              1946 non-null   int64  
 3   stadium               1946 non-null   object 
 4   temp                  1946 non-null   int64  
 5   humidity              1946 non-null   int64  
 6   wind                  1946 non-null   int64  
 7   spread                1946 non-null   float64
 8   ou                    1946 non-null   float64
 9   abnormal_start        1946 non-null   int64  
 10  is_playoff            1946 non-null   int64  
 11  playoff_implications  1946 non-null   int64  
 12  is_turf               1946 non-null   int64  
 13  is_outdoor            1946 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 228.0+ KB


In [10]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 14), (6471,), (2394, 14), (2394,), (1946, 14), (1946,))

# MODELING
## CATBoost

In [11]:
# Create and fit the thing
CATb = CatBoostClassifier(verbose=False,depth=5)
CATb.fit(X_train,y_train,cat_features=['day_of_week','start_time','stadium'])
CATb_preds = CATb.predict(X_train)
pd.crosstab(CATb_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2111,1137
1,1069,2154


In [12]:
print(f'Accuracy-Train {round(CATb.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(CATb.score(X_validate,y_validate),4)}')
print(classification_report(y_train,CATb_preds))
print(classification_report(y_validate,CATb.predict(X_validate)))

Accuracy-Train 0.6591
Accuracy-Validate 0.5063
              precision    recall  f1-score   support

           0       0.65      0.66      0.66      3180
           1       0.67      0.65      0.66      3291

    accuracy                           0.66      6471
   macro avg       0.66      0.66      0.66      6471
weighted avg       0.66      0.66      0.66      6471

              precision    recall  f1-score   support

           0       0.50      0.51      0.50      1178
           1       0.51      0.51      0.51      1216

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394



## CATboost grid_search CROSS_VALIDATION k=5

In [13]:
param_grid = {
    'verbose': [False],
    'depth': [5, 10,15]
}
gr_search = GridSearchCV(CatBoostClassifier(),
                      param_grid)

In [14]:
gr_search

In [15]:
gr_search.fit(X_train, y_train,cat_features=['day_of_week','start_time','stadium'])

In [16]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(3, 15)

In [17]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_verbose,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.787649,0.013299,0.002408,6e-05,5,False,"{'depth': 5, 'verbose': False}",0.528185,0.512365,0.51391,0.515456,0.525502,0.519084,0.006467,1
1,5.54575,0.042731,0.003826,0.000158,10,False,"{'depth': 10, 'verbose': False}",0.498842,0.503864,0.518547,0.507728,0.513138,0.508424,0.006896,2
2,74.430315,0.54386,0.012836,0.010571,15,False,"{'depth': 15, 'verbose': False}",0.495753,0.513138,0.50541,0.512365,0.510046,0.507342,0.00639,3


## XGBoost

In [18]:
#pd.get_dummies(X_train,columns=['day_of_week','start_time','stadium']).columns

In [19]:
y_train.unique(),y_validate.unique(),y_test.unique()

(array([0, 1]), array([0, 1]), array([0, 1]))

In [20]:
xgbc = xgb.XGBClassifier(n_estimators=2000,objective='binary:logistic',eval_metric='aucpr',missing=None,
                         random_state=2013,learning_rate=0.005, early_stopping_rounds=300,max_depth=12,gamma=0.1,
                         reg_lambda=1.0)

In [None]:
# xgbc.fit(X_train,y_train,
#        verbose=True,
#        eval_set=[(X_validate,y_validate)])

In [None]:
# plot_confusion_matrix(xgbc,X_validate,y_validate)

In [22]:
param_grid = {
    'max_depth': [5, 10, 12, 16],
    'n_estimators': [1000, 2000, 500],
    'eta': [0.01, 0.05,0.1],
    'gamma':[0,0.25,1.0],
    'reg_lambda':[0,1.0,10.0]
}
gr_search = GridSearchCV(xgb.XGBClassifier(),
                      param_grid)

In [23]:
gr_search

In [24]:
# too much compute req'd no run
#gr_search.fit(X_train, y_train)

In [25]:
# results = gr_search.cv_results_
# results_df_init = pd.DataFrame(results)
# results_df_init.shape

In [26]:
# params = pd.DataFrame(results['params'])
# results_df_init.sort_values(by='rank_test_score').head()

# OK, enough of that...
## Let's work with a hi-speed,low-drag version of the df
### Cols: wind, ou, is_outdoor, is_turf, abnormal_start, playoff

In [27]:
df = pd.read_csv('prepped_data_stad.csv')

In [29]:
df = df.drop(columns=['date','home_score',
       'home_wins', 'away_score', 'away_wins','total_scores',
                     'day_of_week','start_time', 'week_num','stadium', 'temp', 'humidity','spread',
                      'playoff_implications'])
#df['spread'] = abs(df['spread'])

In [30]:
df.head()

Unnamed: 0,wind,ou,is_under,abnormal_start,is_playoff,is_turf,is_outdoor
0,0,51.0,0,0,1,0,0
1,14,45.5,1,0,1,0,1
2,13,48.0,1,0,1,0,1
3,0,49.0,1,0,1,1,1
4,19,47.0,1,0,1,0,1


In [31]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df,'is_under')

In [32]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 6), (6471,), (2394, 6), (2394,), (1946, 6), (1946,))

In [33]:
X_train.head()

Unnamed: 0,wind,ou,abnormal_start,is_playoff,is_turf,is_outdoor
1713,0,49.0,0,0,1,0
2164,0,38.0,0,0,0,0
2554,0,42.5,1,0,0,0
3117,10,40.0,0,0,0,1
9819,14,39.5,0,0,0,1


In [34]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1946 entries, 4842 to 3441
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   wind            1946 non-null   int64  
 1   ou              1946 non-null   float64
 2   abnormal_start  1946 non-null   int64  
 3   is_playoff      1946 non-null   int64  
 4   is_turf         1946 non-null   int64  
 5   is_outdoor      1946 non-null   int64  
dtypes: float64(1), int64(5)
memory usage: 106.4 KB


# MODELING with CV on slim df

## DTC with CV

In [35]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'min_samples_leaf': [1, 5, 10, 20],
    'criterion': ['gini', 'entropy'],
}

gr_search = GridSearchCV(DecisionTreeClassifier(),param_grid)

In [36]:
gr_search

In [37]:
gr_search.fit(X_train, y_train)

In [38]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(40, 16)

In [39]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
19,0.002135,5.6e-05,0.000426,9e-06,gini,5,20,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.522008,0.528594,0.51391,0.520093,0.530139,0.522949,0.005904,1
39,0.002132,2.9e-05,0.000424,5e-06,entropy,5,20,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.522008,0.528594,0.512365,0.520093,0.530139,0.52264,0.00639,2
38,0.00217,3.6e-05,0.000433,6e-06,entropy,5,10,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.511197,0.51932,0.517002,0.520093,0.523184,0.518159,0.004004,3
18,0.00214,5.7e-05,0.000434,1.4e-05,gini,5,10,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.513514,0.51932,0.51391,0.520093,0.523184,0.518004,0.003738,4
16,0.00213,3.7e-05,0.000421,8e-06,gini,5,1,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.50888,0.520093,0.516229,0.516229,0.523957,0.517077,0.005001,5


## RFC with CV

In [40]:
param_grid = {
    'max_depth': [None,18, 12, 10, 5],
    'n_estimators': [55, 101, 201],
    'criterion': ['gini', 'entropy'],
}
gr_search = GridSearchCV(RandomForestClassifier(),
                      param_grid)

In [41]:
gr_search

In [42]:
gr_search.fit(X_train, y_train)

In [43]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(30, 16)

In [44]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
28,0.099771,0.000906,0.006755,6.2e-05,entropy,5,101,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.526641,0.509274,0.527821,0.518547,0.529366,0.52233,0.007524,1
13,0.102405,0.0017,0.006957,7.7e-05,gini,5,101,"{'criterion': 'gini', 'max_depth': 5, 'n_estim...",0.530502,0.501546,0.517002,0.525502,0.532457,0.521402,0.011274,2
27,0.055118,0.000588,0.003917,3.3e-05,entropy,5,55,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.525869,0.496136,0.518547,0.527048,0.530912,0.519702,0.012445,3
29,0.198051,0.000968,0.012933,0.000135,entropy,5,201,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",0.526641,0.503864,0.517774,0.522411,0.522411,0.51862,0.007893,4
18,0.112522,0.000625,0.008942,0.000111,entropy,18,55,"{'criterion': 'entropy', 'max_depth': 18, 'n_e...",0.513514,0.507728,0.528594,0.513138,0.525502,0.517695,0.007967,5


## NBC with CV

In [45]:
param_grid = {
    'var_smoothing': [.000000001,.000000002,.000000003]
}
gr_search = GridSearchCV(GaussianNB(),
                      param_grid)

In [46]:
gr_search

In [47]:
gr_search.fit(X_train, y_train)

In [48]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(3, 14)

In [49]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00348,0.001022,0.001417,0.000185,0.0,{'var_smoothing': 1e-09},0.51583,0.513138,0.517002,0.516229,0.527048,0.517849,0.00478,1
1,0.002035,0.000191,0.000954,8.5e-05,0.0,{'var_smoothing': 2e-09},0.51583,0.513138,0.517002,0.516229,0.527048,0.517849,0.00478,1
2,0.001883,0.000315,0.000905,0.000101,0.0,{'var_smoothing': 3e-09},0.51583,0.513138,0.517002,0.516229,0.527048,0.517849,0.00478,1


## GBC with CV

In [50]:
param_grid = {
    'learning_rate': [0.1,0.2,0.5,1.0, 5.0],
    'n_estimators': [55, 101, 201],
    'min_samples_leaf': [1,5,10,20],
}
gr_search = GridSearchCV(GradientBoostingClassifier(),
                      param_grid)

In [51]:
gr_search

In [52]:
gr_search.fit(X_train, y_train)

In [53]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(60, 16)

In [54]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
19,0.1353,0.000441,0.001528,4.9e-05,0.2,10,101,"{'learning_rate': 0.2, 'min_samples_leaf': 10,...",0.528958,0.52473,0.528594,0.508501,0.517774,0.521711,0.007733,1
46,0.139342,0.000484,0.001731,5.8e-05,1.0,20,101,"{'learning_rate': 1.0, 'min_samples_leaf': 20,...",0.504247,0.51932,0.522411,0.516229,0.536321,0.519706,0.01034,2
35,0.274147,0.002032,0.002701,5.6e-05,0.5,20,201,"{'learning_rate': 0.5, 'min_samples_leaf': 20,...",0.511969,0.542504,0.522411,0.5,0.520866,0.51955,0.013975,3
8,0.271505,0.001629,0.002531,6.9e-05,0.1,10,201,"{'learning_rate': 0.1, 'min_samples_leaf': 10,...",0.525097,0.516229,0.520866,0.506182,0.527821,0.519239,0.007618,4
21,0.074132,0.00026,0.001081,1.8e-05,0.2,20,55,"{'learning_rate': 0.2, 'min_samples_leaf': 20,...",0.528958,0.515456,0.51932,0.496909,0.535549,0.519238,0.013217,5


## CATb with CV

In [None]:
# param_grid = {
#     'verbose': [False],
#     'depth': [5, 10, 12,15],
#     'learning_rate': [None,0.1, 0.3,0.7,1.0],
# }
# gr_search = GridSearchCV(CatBoostClassifier(),
#                       param_grid)

In [55]:
param_grid = {
    'verbose': [False],
    'depth': [5, 10, 12],
    'learning_rate': [None,0.1,0.2, 0.5],
}
gr_search = GridSearchCV(CatBoostClassifier(),
                      param_grid)

In [56]:
gr_search

In [57]:
gr_search.fit(X_train, y_train)

In [58]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(12, 16)

In [59]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_learning_rate,param_verbose,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,1.199895,0.00743,0.001046,0.000173,5,0.1,False,"{'depth': 5, 'learning_rate': 0.1, 'verbose': ...",0.503475,0.527821,0.520866,0.526275,0.520866,0.51986,0.00866,1
0,1.197172,0.017975,0.001052,0.000164,5,,False,"{'depth': 5, 'learning_rate': None, 'verbose':...",0.510425,0.522411,0.51391,0.520866,0.523184,0.518159,0.00507,2
6,2.248317,0.01381,0.001592,9e-06,10,0.2,False,"{'depth': 10, 'learning_rate': 0.2, 'verbose':...",0.515058,0.516229,0.520093,0.516229,0.520866,0.517695,0.002326,3
7,2.260833,0.018347,0.001595,1.3e-05,10,0.5,False,"{'depth': 10, 'learning_rate': 0.5, 'verbose':...",0.507336,0.512365,0.526275,0.512365,0.525502,0.516769,0.007674,4
11,4.931926,0.035767,0.002045,0.000229,12,0.5,False,"{'depth': 12, 'learning_rate': 0.5, 'verbose':...",0.50888,0.51932,0.522411,0.510046,0.523184,0.516768,0.006114,5


## SCALE for Logistic regression, MLP, etc.

In [60]:
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_validate_scaled = sc_X.transform(X_validate)
X_test_scaled = sc_X.transform(X_test)

## MLP

In [63]:
mlp = MLPClassifier(hidden_layer_sizes=(1024,512,128),activation="relu",random_state=2013,
                    batch_size=500,solver='adam')
mlp.fit(X_train_scaled, y_train)
mlp_preds = mlp.predict(X_train_scaled)
print(mlp.score(X_train_scaled, y_train))
pd.crosstab(mlp_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

0.6158244475351569




is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1817,1123
1,1363,2168


In [64]:
print(f'Accuracy-Train {round(mlp.score(X_train_scaled,y_train),4)}')
print(f'Accuracy-Validate {round(mlp.score(X_validate_scaled,y_validate),4)}')
print(classification_report(y_train,mlp_preds))
print(classification_report(y_validate,mlp.predict(X_validate_scaled)))

Accuracy-Train 0.6158
Accuracy-Validate 0.4879
              precision    recall  f1-score   support

           0       0.62      0.57      0.59      3180
           1       0.61      0.66      0.64      3291

    accuracy                           0.62      6471
   macro avg       0.62      0.62      0.61      6471
weighted avg       0.62      0.62      0.62      6471

              precision    recall  f1-score   support

           0       0.48      0.46      0.47      1178
           1       0.50      0.52      0.51      1216

    accuracy                           0.49      2394
   macro avg       0.49      0.49      0.49      2394
weighted avg       0.49      0.49      0.49      2394



## KNN with CV

In [65]:
param_grid = {
    'n_neighbors': [70],
    'weights': ['uniform','distance'],
    'algorithm':['ball_tree','brute'],
    'leaf_size': [3,4,5,6,7,8,9],
}
gr_search = GridSearchCV(KNeighborsClassifier(),
                      param_grid)

In [66]:
gr_search

In [67]:
gr_search.fit(X_train, y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [68]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(28, 17)

In [69]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.001875,2e-05,0.026484,0.000692,ball_tree,3,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 3, 'n_...",0.511969,0.517002,0.517774,0.511592,0.527821,0.517232,0.005865,1
3,0.002004,0.000213,0.026496,0.000621,ball_tree,4,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 4, 'n_...",0.511969,0.517002,0.517774,0.511592,0.527821,0.517232,0.005865,1
5,0.001927,0.000121,0.026334,0.000665,ball_tree,5,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 5, 'n_...",0.511969,0.517002,0.517774,0.511592,0.527821,0.517232,0.005865,1
13,0.001757,2.2e-05,0.024556,0.000841,ball_tree,9,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 9, 'n_...",0.509653,0.51932,0.517002,0.508501,0.528594,0.516614,0.007284,4
7,0.001819,6e-05,0.024745,0.000947,ball_tree,6,70,distance,"{'algorithm': 'ball_tree', 'leaf_size': 6, 'n_...",0.509653,0.51932,0.517002,0.508501,0.528594,0.516614,0.007284,4


## LOG with CV

In [75]:
param_grid = {
    'penalty': ['none','l1','l2','elasticnet'],
    'class_weight': ['balanced',None],
    'max_iter':[100,50,200,500]
}
gr_search = GridSearchCV(LogisticRegression(),
                      param_grid)

In [76]:
gr_search

In [77]:
gr_search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/anaconda3

In [78]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(32, 16)

In [79]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_iter,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
18,0.019897,0.002053,0.000442,1.4e-05,,100,l2,"{'class_weight': None, 'max_iter': 100, 'penal...",0.516602,0.510819,0.528594,0.515456,0.529366,0.520167,0.007455,1
28,0.0202,0.005519,0.00089,0.000342,,500,none,"{'class_weight': None, 'max_iter': 500, 'penal...",0.516602,0.509274,0.527821,0.517002,0.530139,0.520167,0.007738,1
16,0.016883,0.005203,0.000729,0.000316,,100,none,"{'class_weight': None, 'max_iter': 100, 'penal...",0.516602,0.509274,0.527821,0.517002,0.530139,0.520167,0.007738,1
30,0.017651,0.00135,0.000441,1.9e-05,,500,l2,"{'class_weight': None, 'max_iter': 500, 'penal...",0.516602,0.510819,0.528594,0.515456,0.529366,0.520167,0.007455,1
26,0.017199,0.002465,0.000948,0.000418,,200,l2,"{'class_weight': None, 'max_iter': 200, 'penal...",0.516602,0.510819,0.528594,0.515456,0.529366,0.520167,0.007455,1


## SVM with CV

In [80]:
param_grid = {
    'shrinking': [True,False],
    'C':[1,5,10]
}
gr_search = GridSearchCV(SVC(),
                      param_grid)

In [81]:
gr_search

In [82]:
gr_search.fit(X_train, y_train)

In [83]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(6, 15)

In [84]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_shrinking,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.363679,0.019941,0.211439,0.000551,1,True,"{'C': 1, 'shrinking': True}",0.528958,0.510046,0.526275,0.522411,0.526275,0.522793,0.006706,1
1,0.351247,0.001153,0.211317,0.000843,1,False,"{'C': 1, 'shrinking': False}",0.528958,0.510046,0.526275,0.522411,0.526275,0.522793,0.006706,1
4,0.363932,0.006286,0.210401,0.000807,10,True,"{'C': 10, 'shrinking': True}",0.523552,0.503091,0.526275,0.529366,0.523957,0.521248,0.009311,3
5,0.357517,0.002236,0.210013,0.000804,10,False,"{'C': 10, 'shrinking': False}",0.523552,0.503091,0.526275,0.529366,0.523957,0.521248,0.009311,3
2,0.356448,0.003494,0.210921,0.000606,5,True,"{'C': 5, 'shrinking': True}",0.521236,0.503091,0.523184,0.52473,0.517774,0.518003,0.00781,5
