In [1]:
import pandas as pd
import numpy as np
import os
import csv
from tqdm import tqdm, trange
import random
import re

# Scikit-learn
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier # RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score # mean_squared error, roc_auc_score, accuracy_score, f1 score
from sklearn.linear_model import LogisticRegression
#

import warnings
warnings.filterwarnings('ignore')

In [3]:
dataframes = []
total_wanted_files = 1000 #1000 files to be used
files_used = []
teams = []

for file_name in tqdm(os.listdir('Cleaned_data'), desc='Read Files'):
  if total_wanted_files == 0:
    break

  _data = pd.read_csv(f'./Cleaned_data/{file_name}')
  _data['Win'] = 0
  files_used.append(file_name)
  total_wanted_files -= 1

  _data.replace(np.nan, 0, inplace=True)

  home_df = _data[_data['Home Team'] == 1]
  away_df = _data[_data['Home Team'] == 0]
  teams.append(away_df)
  teams.append(home_df)


  home_stats = []
  away_stats = []

  terms = ['Q1', 'Q2', 'H1', 'Q3', 'Q4', 'H2']
  columns = [4, 5, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]

  home_q1 = [0] * 16
  home_q2 = [0] * 16
  home_h1 = [0] * 16
  home_q3 = [0] * 16
  home_q4 = [0] * 16
  home_h2 = [0] * 16

  home_stats.append(home_q1)
  home_stats.append(home_q2)
  home_stats.append(home_h1)
  home_stats.append(home_q3)
  home_stats.append(home_q4)
  home_stats.append(home_h2)

  away_q1 = [0] * 16
  away_q2 = [0] * 16
  away_h1 = [0] * 16
  away_q3 = [0] * 16
  away_q4 = [0] * 16
  away_h2 = [0] * 16

  away_stats.append(away_q1)
  away_stats.append(away_q2)
  away_stats.append(away_h1)
  away_stats.append(away_q3)
  away_stats.append(away_q4)
  away_stats.append(away_h2)


  for term in range(6): #add team stats together and add term last
    for col in range(len(columns)):
      for row in range(len(home_df)):
        if home_df.iloc[row, 1] == terms[term]:
          home_stats[term][col] += float(home_df.iloc[row, columns[col]])
    home_stats[term].append(terms[term])

  for term in range(6):
    for col in range(len(columns)):
      for row in range(len(away_df)):
        if away_df.iloc[row, 1] == terms[term]:
          away_stats[term][col] += float(away_df.iloc[row, columns[col]])
    away_stats[term].append(terms[term])

  


  for arr in home_stats: #add column identifying home team
    arr.append(1)
  for arr in away_stats:
    arr.append(0)


  total_points = 0
  for arr in home_stats:
    total_points += arr[-4]

  for arr in away_stats:
    total_points -= arr[-4]

  


  for i in range(6): #add column identifying winner
    if total_points > 0:
      home_stats[i].append(1)
      away_stats[i].append(0)
    else:
      home_stats[i].append(0)
      away_stats[i].append(1)

  home = pd.DataFrame(home_stats, 
  columns=['FG', 'FGA', '3P', '3PA', 'FT', 'FTA',
          'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
          'TOV', 'PF', 'PTS', '+/-', 'Term', 'Home Team', 'Win'])

  away = pd.DataFrame(away_stats, 
  columns=['FG', 'FGA', '3P', '3PA', 'FT', 'FTA',
          'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
          'TOV', 'PF', 'PTS', '+/-', 'Term', 'Home Team', 'Win'])


  dataframes.append(home)
  dataframes.append(away)


dataframe = pd.concat(dataframes)
#dataframe = dataframe.stack(level=-1).reset_index(drop=True)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
  print(dataframe)
#display(dataframe) #a 3-d dataframe

Read Files:   3%|▎         | 1000/32881 [03:47<2:00:49,  4.40it/s]


     FG   FGA    3P   3PA    FT   FTA   ORB   DRB   TRB   AST   STL   BLK   
0  12.0  20.0   3.0   5.0   1.0   4.0   2.0   6.0   8.0   7.0   4.0   1.0  \
1  13.0  21.0   3.0   6.0   2.0   4.0   2.0   7.0   9.0   9.0   3.0   1.0   
2  25.0  41.0   6.0  11.0   3.0   8.0   4.0  13.0  17.0  16.0   7.0   2.0   
3   5.0  19.0   1.0   4.0   7.0   8.0   4.0   3.0   7.0   1.0   1.0   0.0   
4   8.0  24.0   1.0   5.0   4.0   7.0   6.0   6.0  12.0   3.0   2.0   0.0   
5  13.0  43.0   2.0   9.0  11.0  15.0  10.0   9.0  19.0   4.0   3.0   0.0   
0  11.0  19.0   1.0   5.0   4.0   7.0   1.0   5.0   6.0   7.0   1.0   3.0   
1   7.0  16.0   0.0   3.0   7.0  10.0   2.0   5.0   7.0   3.0   4.0   1.0   
2  18.0  35.0   1.0   8.0  11.0  17.0   3.0  10.0  13.0  10.0   5.0   4.0   
3  14.0  21.0   0.0   1.0   4.0   5.0   2.0   9.0  11.0  12.0   1.0   3.0   
4  10.0  18.0   0.0   4.0   7.0  10.0   3.0  10.0  13.0   6.0   1.0   1.0   
5  24.0  39.0   0.0   5.0  11.0  15.0   5.0  19.0  24.0  18.0   2.0   4.0   

Functions that converts information

In [2]:
def convert_pos(s):
  if len(s) == 1:
    if s == 'G':
      return 1
    elif s == 'F':
      return 3
    elif s == 'C':
      return 5
    
  elif len(s) == 2:
    if s == 'PG':
      return 1
    elif s == 'SG' or s == 'GF':
      return 2
    elif s == 'SF':
      return 3
    elif s == 'PF' or s == 'FC':
      return 4
    
  elif len(s) == 3:
    if s == 'G-F' or s == 'F-G':
      return 2
    elif s == 'F-C' or s == 'C-F':
      return 4


def convert_term(term):
  if term == 'Q1':
    return 1
  elif term == 'Q2':
    return 2
  elif term == 'H1':
    return 3
  elif term == 'Q3':
    return 4
  elif term == 'Q4':
    return 5
  elif term == 'H2':
    return 6

Insert position for every player

In [None]:
temp = dataframes.copy(deep=True)


#for row_idx in range(len(temp)):
  #print(temp.loc[row_idx, :])

#remove null rows, percentages
temp = temp.drop(['FG%', '3P%', 'FT%', 'Name'], axis=1)
temp.dropna(axis=0, inplace=True)
display(temp)

Find the winning team and identify the players on that team

In [None]:
for file_name in tqdm(files_used, desc='Files Adjusted'):
  _data = pd.read_csv(f'./Cleaned_data/{file_name}').copy(deep=True)
  _data['Win'] = 0
  _temp = data.loc[:, 'Home Team' : ]
  away_total_pts = 0
  home_total_pts = 0

  away_players = []
  home_players = []

  for i in range(len(_data)):
    #if (i + 1) % 3 == 0:
      #continue
    try:
      if temp.iloc[i,0] == 0:
        away_total_pts += int(temp.iloc[i,-3])
        away_players.append(i)
        #print(f'total points so far : {away_total_pts}')
      else:
        home_total_pts += int(temp.iloc[i,-3])
        home_players.append(i)
      #print(data.iloc[i,1])
    except:
      continue
  #print(f'away points total : {away_total_pts}')
  #print(f'home points total : {home_total_pts}')
  
  if away_total_pts > home_total_pts:
    for idx in away_players:
      temp.iloc[idx, -1] = 1
    #temp.loc[away_total_pts > home_total_pts, 'Win'] = 1
    #players_won = temp[temp['Home Team'] == 0]
    #print(players_won)
    #players_won['Win'] = 1
  elif home_total_pts > away_total_pts:
    for idx in home_players:
      temp.iloc[idx, -1] = 1
    #temp.loc[home_total_pts > away_total_pts, 'Win'] = 1
    #players_won = temp[temp['Home Team'] == 1]
    #print(players_won)
    #players_won['Win'] = 1
print(temp)

Convert terms into usuable data

In [None]:
for i in trange(len(temp), desc='Term adjusted rows'):
  temp.iloc[i, 1] = convert_term(temp.iloc[i, 1])
  #if pd.isna(temp.iloc[i, 3]):
    #temp.drop(i)

print(temp)

In [4]:
temp = dataframe.copy(deep=True)
temp.drop('PTS', axis=1, inplace=True)
#temp.drop('+/-', axis=1, inplace=True)
temp.drop('TRB', axis=1, inplace=True)

print(temp)

temp.dropna(inplace=True)
print(temp.columns)

temp['Term'] = temp['Term'].apply(convert_term)
'''

rand_data = temp.copy(deep=True)
num_rand_features = 10
all_features = len(rand_data.columns) - 1
rand_indices = random.sample(range(all_features), num_rand_features)


rand_features = rand_data.iloc[:, rand_indices]
print(rand_features)
rand_arr = rand_features.to_numpy()

'''
arr = temp.to_numpy()

x = arr[:, :-1]
y = arr[:, -1].astype('int')

print(arr)



      FG   FGA   3P   3PA   FT  FTA  ORB   DRB   AST  STL  BLK   TOV    PF   
0   12.0  20.0  3.0   5.0  1.0  4.0  2.0   6.0   7.0  4.0  1.0   5.0   8.0  \
1   13.0  21.0  3.0   6.0  2.0  4.0  2.0   7.0   9.0  3.0  1.0   4.0   8.0   
2   25.0  41.0  6.0  11.0  3.0  8.0  4.0  13.0  16.0  7.0  2.0   9.0  16.0   
3    5.0  19.0  1.0   4.0  7.0  8.0  4.0   3.0   1.0  1.0  0.0   2.0   7.0   
4    8.0  24.0  1.0   5.0  4.0  7.0  6.0   6.0   3.0  2.0  0.0   1.0  10.0   
..   ...   ...  ...   ...  ...  ...  ...   ...   ...  ...  ...   ...   ...   
1    9.0  19.0  0.0   0.0  3.0  4.0  3.0   5.0   5.0  3.0  2.0   5.0   5.0   
2   17.0  40.0  1.0   4.0  4.0  7.0  6.0  14.0  12.0  5.0  3.0  11.0   7.0   
3   13.0  21.0  4.0   8.0  0.0  0.0  0.0   5.0   9.0  3.0  1.0   1.0   6.0   
4    8.0  18.0  0.0   1.0  3.0  5.0  2.0   8.0   6.0  0.0  2.0   1.0   4.0   
5   21.0  39.0  4.0   9.0  3.0  5.0  2.0  13.0  15.0  3.0  3.0   2.0  10.0   

     +/- Term  Home Team  Win  
0    5.0   Q1          1    0  

In [33]:

#print(temp.isna().sum())
y = y.ravel()
print(y)


[0 0 0 ... 0 0 0]


In [9]:
'''
temp.drop('PTS', axis=1, inplace=True)
temp.dropna(inplace=True)

for _ in trange(100, desc='Combinations Tried'):
    rand_data = temp.copy(deep=True)
    num_rand_features = 15
    all_features = len(rand_data.columns) - 1
    rand_indices = random.sample(range(all_features), num_rand_features)
    rand_features = rand_data.iloc[:, rand_indices]
    print(rand_features)

    rand_arr = rand_features.to_numpy()
    arr = temp.to_numpy()
    x = rand_arr[:, :]
    y = arr[:, -1:].astype('int')

    y = y.ravel()

    cv_inner = KFold(n_splits=2, shuffle=True, random_state=42)          #1
    cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)
    # 10x2 nested cross validation
    # 9 folds for training, 1 fold for testing
    # 9 folds --> 50% (training) - 50% (validation) split 
    regressor = GradientBoostingClassifier()       # RandomForestClassifier, GradientBoostingClassifier, LogisticRegression        #2

    p_grid = {#'bootstrap': [True, False],
            'max_depth': [20, 40, 60, None],
            'max_features': [None, 'sqrt'],
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'n_estimators': [200, 1000, 1600]}


    #Gradient Boosting Classifier, roc_auc
    #p_grid = {'max_depth': [40, 80, 120, 160, None],
    #        'max_features': [None, 'sqrt'],
    #        'min_samples_leaf': [1, 2, 4, 6, 8],
    #        'min_samples_split': [1, 2, 5, 8, 10],
    #        'n_estimators': [500, 1000, 1500, 1800, 2000]}

    history = []
    pointer = 1
    for train_index, test_index in tqdm(cv_outer.split(x, y), desc='Times Looped'):                          #3
        print('\nNestedCV: {} of outer fold {}'.format(pointer, cv_outer.get_n_splits()))
        x_train, x_test = x[train_index], x[test_index] # x (features)
        y_train, y_test = y[train_index], y[test_index] # y (target)

        model = GridSearchCV(regressor, param_grid=p_grid,
                            scoring='accuracy', cv=cv_inner, n_jobs=-1) # classification --> roc_auc, f1, accuracy   # Grid search  #4
        model.fit(x_train, y_train) # trains ML model
        
        pred_test = model.predict(x_test) # predict test data
        pred_training = model.predict(x_train) # predict training data
        
        # Evaluate model performance
        # scikit learn accuracy
        auc_train = accuracy_score(y_train, pred_training)     #roc_auc_score, f1, accuracy       #5
        auc_test = accuracy_score(y_test, pred_test)       #roc_auc_score
        
        print("""
        Best set of parameters: {}
        Best MSE              : {:.2f}

        Training
            MSE: {:.3f}
        Test
            MSE: {:.3f}
        """.format(
            model.best_params_,
            model.best_score_,
            auc_train,
            auc_test,
            )
        )
        history.append(auc_test)
        pointer += 1
        
    print('Overall test performance: {:.2f}'.format(np.mean(history)))
'''

'\ntemp.drop(\'PTS\', axis=1, inplace=True)\ntemp.dropna(inplace=True)\n\nfor _ in trange(100, desc=\'Combinations Tried\'):\n    rand_data = temp.copy(deep=True)\n    num_rand_features = 15\n    all_features = len(rand_data.columns) - 1\n    rand_indices = random.sample(range(all_features), num_rand_features)\n    rand_features = rand_data.iloc[:, rand_indices]\n    print(rand_features)\n\n    rand_arr = rand_features.to_numpy()\n    arr = temp.to_numpy()\n    x = rand_arr[:, :]\n    y = arr[:, -1:].astype(\'int\')\n\n    y = y.ravel()\n\n    cv_inner = KFold(n_splits=2, shuffle=True, random_state=42)          #1\n    cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)\n    # 10x2 nested cross validation\n    # 9 folds for training, 1 fold for testing\n    # 9 folds --> 50% (training) - 50% (validation) split \n    regressor = GradientBoostingClassifier()       # RandomForestClassifier, GradientBoostingClassifier, LogisticRegression        #2\n\n    p_grid = {#\'bootstrap\': 

In [5]:
cv_inner = KFold(n_splits=2, shuffle=True, random_state=42)          #1
cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)
# 10x2 nested cross validation
# 9 folds for training, 1 fold for testing
# 9 folds --> 50% (training) - 50% (validation) split 
classifier = GradientBoostingClassifier()       # RandomForestClassifier, GradientBoostingClassifier, LogisticRegression        #2

p_grid = {#'bootstrap': [True, False],
        'max_depth': [40, 60],
        'max_features': [None, 'sqrt'],
        'min_samples_leaf': [4, 6, 8],
        'min_samples_split': [4, 6, 8],
        'n_estimators': [1600, 2000]}


#Gradient Boosting Classifier, roc_auc
#p_grid = {'max_depth': [40, 80, 120, 160, None],
#        'max_features': [None, 'sqrt'],
#        'min_samples_leaf': [1, 2, 4, 6, 8],
#        'min_samples_split': [1, 2, 5, 8, 10],
#        'n_estimators': [500, 1000, 1500, 1800, 2000]}

history = []
pointer = 1
for train_index, test_index in tqdm(cv_outer.split(x, y), desc='Times Looped'):                          #3
    print('\nNestedCV: {} of outer fold {}'.format(pointer, cv_outer.get_n_splits()))
    x_train, x_test = x[train_index], x[test_index] # x (features)
    y_train, y_test = y[train_index], y[test_index] # y (target)

    model = GridSearchCV(classifier, param_grid=p_grid,
                        scoring='roc_auc', cv=cv_inner, n_jobs=-1) # classification --> roc_auc, f1, accuracy   # Grid search  #4
    model.fit(x_train, y_train) # trains ML model
    
    pred_test = model.predict(x_test) # predict test data
    pred_training = model.predict(x_train) # predict training data

    
    # Evaluate model performance
    # scikit learn accuracy
    auc_train = roc_auc_score(y_train, pred_training)     #roc_auc_score, f1, accuracy       #5
    auc_test = roc_auc_score(y_test, pred_test)       #roc_auc_score

    
    print("""
    Best set of parameters: {}
    Best MSE              : {:.2f}

    Training
        MSE: {:.3f}
    Test
        MSE: {:.3f}
    """.format(
        model.best_params_,
        model.best_score_,
        auc_train,
        auc_test,
        )
    )
    history.append(auc_test)
    pointer += 1

print('Overall test performance: {:.2f}'.format(np.mean(history)))

Times Looped: 0it [00:00, ?it/s]


NestedCV: 1 of outer fold 10


Times Looped: 1it [12:36, 756.76s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 2000}
    Best MSE              : 0.74

    Training
        MSE: 1.000
    Test
        MSE: 0.676
    

NestedCV: 2 of outer fold 10


Times Looped: 2it [24:56, 746.91s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 1600}
    Best MSE              : 0.74

    Training
        MSE: 1.000
    Test
        MSE: 0.672
    

NestedCV: 3 of outer fold 10


Times Looped: 3it [37:25, 747.75s/it]


    Best set of parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 1600}
    Best MSE              : 0.74

    Training
        MSE: 1.000
    Test
        MSE: 0.679
    

NestedCV: 4 of outer fold 10


Times Looped: 4it [49:46, 745.26s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 1600}
    Best MSE              : 0.74

    Training
        MSE: 1.000
    Test
        MSE: 0.671
    

NestedCV: 5 of outer fold 10


Times Looped: 5it [1:02:06, 743.11s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 2000}
    Best MSE              : 0.73

    Training
        MSE: 1.000
    Test
        MSE: 0.689
    

NestedCV: 6 of outer fold 10


Times Looped: 6it [1:14:28, 742.94s/it]


    Best set of parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 2000}
    Best MSE              : 0.74

    Training
        MSE: 1.000
    Test
        MSE: 0.670
    

NestedCV: 7 of outer fold 10


Times Looped: 7it [1:26:12, 730.16s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 1600}
    Best MSE              : 0.74

    Training
        MSE: 1.000
    Test
        MSE: 0.665
    

NestedCV: 8 of outer fold 10


Times Looped: 8it [1:38:01, 723.39s/it]


    Best set of parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 2000}
    Best MSE              : 0.74

    Training
        MSE: 1.000
    Test
        MSE: 0.683
    

NestedCV: 9 of outer fold 10


Times Looped: 9it [1:50:00, 722.13s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 1600}
    Best MSE              : 0.74

    Training
        MSE: 1.000
    Test
        MSE: 0.677
    

NestedCV: 10 of outer fold 10


Times Looped: 10it [2:01:49, 730.92s/it]


    Best set of parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 1600}
    Best MSE              : 0.74

    Training
        MSE: 1.000
    Test
        MSE: 0.673
    
Overall test performance: 0.68





In [6]:
cv_inner = KFold(n_splits=2, shuffle=True, random_state=42)          #1
cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)
# 10x2 nested cross validation
# 9 folds for training, 1 fold for testing
# 9 folds --> 50% (training) - 50% (validation) split 
classifier = GradientBoostingClassifier()       # RandomForestClassifier, GradientBoostingClassifier, LogisticRegression        #2

p_grid = {#'bootstrap': [True, False],
        'max_depth': [40, 60],
        'max_features': [None, 'sqrt'],
        'min_samples_leaf': [4, 6, 8],
        'min_samples_split': [4, 6, 8],
        'n_estimators': [1600, 2000]}


#Gradient Boosting Classifier, roc_auc
#p_grid = {'max_depth': [40, 80, 120, 160, None],
#        'max_features': [None, 'sqrt'],
#        'min_samples_leaf': [1, 2, 4, 6, 8],
#        'min_samples_split': [1, 2, 5, 8, 10],
#        'n_estimators': [500, 1000, 1500, 1800, 2000]}

history = []
pointer = 1
for train_index, test_index in tqdm(cv_outer.split(x, y), desc='Times Looped'):                          #3
    print('\nNestedCV: {} of outer fold {}'.format(pointer, cv_outer.get_n_splits()))
    x_train, x_test = x[train_index], x[test_index] # x (features)
    y_train, y_test = y[train_index], y[test_index] # y (target)

    model = GridSearchCV(classifier, param_grid=p_grid,
                        scoring='f1', cv=cv_inner, n_jobs=-1) # classification --> roc_auc, f1, accuracy   # Grid search  #4
    model.fit(x_train, y_train) # trains ML model
    
    pred_test = model.predict(x_test) # predict test data
    pred_training = model.predict(x_train) # predict training data

    
    # Evaluate model performance
    # scikit learn accuracy
    auc_train = f1_score(y_train, pred_training)     #roc_auc_score, f1, accuracy       #5
    auc_test = f1_score(y_test, pred_test)       #roc_auc_score

    
    print("""
    Best set of parameters: {}
    Best MSE              : {:.2f}

    Training
        MSE: {:.3f}
    Test
        MSE: {:.3f}
    """.format(
        model.best_params_,
        model.best_score_,
        auc_train,
        auc_test,
        )
    )
    history.append(auc_test)
    pointer += 1

print('Overall test performance: {:.2f}'.format(np.mean(history)))

Times Looped: 0it [00:00, ?it/s]


NestedCV: 1 of outer fold 10


Times Looped: 1it [12:43, 763.50s/it]


    Best set of parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 1600}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.678
    

NestedCV: 2 of outer fold 10


Times Looped: 2it [24:59, 747.43s/it]


    Best set of parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 2000}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.676
    

NestedCV: 3 of outer fold 10


Times Looped: 3it [36:59, 734.94s/it]


    Best set of parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 1600}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.671
    

NestedCV: 4 of outer fold 10


Times Looped: 4it [49:07, 732.25s/it]


    Best set of parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 2000}
    Best MSE              : 0.68

    Training
        MSE: 1.000
    Test
        MSE: 0.675
    

NestedCV: 5 of outer fold 10


Times Looped: 5it [1:01:24, 733.81s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 2000}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.686
    

NestedCV: 6 of outer fold 10


Times Looped: 6it [1:14:26, 750.21s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 2000}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.677
    

NestedCV: 7 of outer fold 10


Times Looped: 7it [1:26:45, 746.65s/it]


    Best set of parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 2000}
    Best MSE              : 0.68

    Training
        MSE: 1.000
    Test
        MSE: 0.665
    

NestedCV: 8 of outer fold 10


Times Looped: 8it [1:39:06, 744.65s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 1600}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.673
    

NestedCV: 9 of outer fold 10


Times Looped: 9it [1:51:00, 735.26s/it]


    Best set of parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 8, 'n_estimators': 1600}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.674
    

NestedCV: 10 of outer fold 10


Times Looped: 10it [2:02:58, 737.89s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 2000}
    Best MSE              : 0.68

    Training
        MSE: 1.000
    Test
        MSE: 0.684
    
Overall test performance: 0.68





In [7]:
cv_inner = KFold(n_splits=2, shuffle=True, random_state=42)          #1
cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)
# 10x2 nested cross validation
# 9 folds for training, 1 fold for testing
# 9 folds --> 50% (training) - 50% (validation) split 
classifier = GradientBoostingClassifier()       # RandomForestClassifier, GradientBoostingClassifier, LogisticRegression        #2

p_grid = {#'bootstrap': [True, False],
        'max_depth': [40, 60],
        'max_features': [None, 'sqrt'],
        'min_samples_leaf': [4, 6, 8],
        'min_samples_split': [4, 6, 8],
        'n_estimators': [1600, 2000]}


#Gradient Boosting Classifier, roc_auc
#p_grid = {'max_depth': [40, 80, 120, 160, None],
#        'max_features': [None, 'sqrt'],
#        'min_samples_leaf': [1, 2, 4, 6, 8],
#        'min_samples_split': [1, 2, 5, 8, 10],
#        'n_estimators': [500, 1000, 1500, 1800, 2000]}

history = []
pointer = 1
for train_index, test_index in tqdm(cv_outer.split(x, y), desc='Times Looped'):                          #3
    print('\nNestedCV: {} of outer fold {}'.format(pointer, cv_outer.get_n_splits()))
    x_train, x_test = x[train_index], x[test_index] # x (features)
    y_train, y_test = y[train_index], y[test_index] # y (target)

    model = GridSearchCV(classifier, param_grid=p_grid,
                        scoring='accuracy', cv=cv_inner, n_jobs=-1) # classification --> roc_auc, f1, accuracy   # Grid search  #4
    model.fit(x_train, y_train) # trains ML model
    
    pred_test = model.predict(x_test) # predict test data
    pred_training = model.predict(x_train) # predict training data

    
    # Evaluate model performance
    # scikit learn accuracy
    auc_train = accuracy_score(y_train, pred_training)     #roc_auc_score, f1, accuracy       #5
    auc_test = accuracy_score(y_test, pred_test)       #roc_auc_score

    
    print("""
    Best set of parameters: {}
    Best MSE              : {:.2f}

    Training
        MSE: {:.3f}
    Test
        MSE: {:.3f}
    """.format(
        model.best_params_,
        model.best_score_,
        auc_train,
        auc_test,
        )
    )
    history.append(auc_test)
    pointer += 1

print('Overall test performance: {:.2f}'.format(np.mean(history)))

Times Looped: 0it [00:00, ?it/s]


NestedCV: 1 of outer fold 10


Times Looped: 1it [12:30, 750.32s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 2000}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.678
    

NestedCV: 2 of outer fold 10


Times Looped: 2it [24:52, 745.79s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 1600}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.680
    

NestedCV: 3 of outer fold 10


Times Looped: 3it [37:21, 747.25s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 2000}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.678
    

NestedCV: 4 of outer fold 10


Times Looped: 4it [50:13, 756.72s/it]


    Best set of parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 2000}
    Best MSE              : 0.68

    Training
        MSE: 1.000
    Test
        MSE: 0.677
    

NestedCV: 5 of outer fold 10


Times Looped: 5it [1:02:24, 747.57s/it]


    Best set of parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 1600}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.673
    

NestedCV: 6 of outer fold 10


Times Looped: 6it [1:14:49, 746.83s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 1600}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.674
    

NestedCV: 7 of outer fold 10


Times Looped: 7it [1:27:06, 743.36s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 2000}
    Best MSE              : 0.68

    Training
        MSE: 1.000
    Test
        MSE: 0.666
    

NestedCV: 8 of outer fold 10


Times Looped: 8it [1:39:31, 743.88s/it]


    Best set of parameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 2000}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.676
    

NestedCV: 9 of outer fold 10


Times Looped: 9it [1:51:29, 735.94s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 1600}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.678
    

NestedCV: 10 of outer fold 10


Times Looped: 10it [2:03:23, 740.32s/it]


    Best set of parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 1600}
    Best MSE              : 0.67

    Training
        MSE: 1.000
    Test
        MSE: 0.674
    
Overall test performance: 0.68



