In [2]:
import pandas as pd
import os
import chardet
import numpy as np
import glob
import xgboost
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.feature_selection import VarianceThreshold

from sklearn.preprocessing import  StandardScaler, RobustScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier


In [3]:

from plotly.subplots import make_subplots
import plotly.graph_objs as go
import pprint
from scipy import stats
from IPython.display import display

__author__ = "Juanma Hernández"
__copyright__ = "Copyright 2019"
__credits__ = ["Juanma Hernández", "George Fisher"]
__license__ = "GPL"
__maintainer__ = "Juanma Hernández"
__email__ = "https://twitter.com/juanmah"
__status__ = "Utility script"


def plot_grid_search(clf):
    """Plot as many graphs as parameters are in the grid search results.

    Each graph has the values of each parameter in the X axis and the Score in the Y axis.

    Parameters
    ----------
    clf: estimator object result of a GridSearchCV
        This object contains all the information of the cross validated results for all the parameters combinations.
    """
    # Convert the cross validated results in a DataFrame ordered by `rank_test_score` and `mean_fit_time`.
    # As it is frequent to have more than one combination with the same max score,
    # the one with the least mean fit time SHALL appear first.
    cv_results = pd.DataFrame(clf.cv_results_).sort_values(by=['rank_test_score', 'mean_fit_time'])

    # Get parameters
    parameters=cv_results['params'][0].keys()

    # Calculate the number of rows and columns necessary
    rows = -(-len(parameters) // 2)
    columns = min(len(parameters), 2)
    # Create the subplot
    fig = make_subplots(rows=rows, cols=columns)
    # Initialize row and column indexes
    row = 1
    column = 1

    # For each of the parameters
    for parameter in parameters:

        # As all the graphs have the same traces, and by default all traces are shown in the legend,
        # the description appears multiple times. Then, only show legend of the first graph.
        if row == 1 and column == 1:
            show_legend = True
        else:
            show_legend = False

        # Mean test score
        mean_test_score = cv_results[cv_results['rank_test_score'] != 1]
        fig.add_trace(go.Scatter(
            name='Mean test score',
            x=mean_test_score['param_' + parameter],
            y=mean_test_score['mean_test_score'],
            mode='markers',
            marker=dict(size=mean_test_score['mean_fit_time'],
                        color='SteelBlue',
                        sizeref=2. * cv_results['mean_fit_time'].max() / (40. ** 2),
                        sizemin=4,
                        sizemode='area'),
            text=mean_test_score['params'].apply(
                lambda x: pprint.pformat(x, width=-1).replace('{', '').replace('}', '').replace('\n', '<br />')),
            showlegend=show_legend),
            row=row,
            col=column)

        # Best estimators
        rank_1 = cv_results[cv_results['rank_test_score'] == 1]
        fig.add_trace(go.Scatter(
            name='Best estimators',
            x=rank_1['param_' + parameter],
            y=rank_1['mean_test_score'],
            mode='markers',
            marker=dict(size=rank_1['mean_fit_time'],
                        color='Crimson',
                        sizeref=2. * cv_results['mean_fit_time'].max() / (40. ** 2),
                        sizemin=4,
                        sizemode='area'),
            text=rank_1['params'].apply(str),
            showlegend=show_legend),
            row=row,
            col=column)

        fig.update_xaxes(title_text=parameter, row=row, col=column)
        fig.update_yaxes(title_text='Score', row=row, col=column)

        # Check the linearity of the series
        # Only for numeric series
        if pd.to_numeric(cv_results['param_' + parameter], errors='coerce').notnull().all():
            x_values = cv_results['param_' + parameter].sort_values().unique().tolist()
            r = stats.linregress(x_values, range(0, len(x_values))).rvalue
            # If not so linear, then represent the data as logarithmic
            if r < 0.86:
                fig.update_xaxes(type='log', row=row, col=column)

        # Increment the row and column indexes
        column += 1
        if column > columns:
            column = 1
            row += 1

            # Show first the best estimators
    fig.update_layout(legend=dict(traceorder='reversed'),
                      width=columns * 360 + 100,
                      height=rows * 360,
                      title='Best score: {:.6f} with {}'.format(cv_results['mean_test_score'].iloc[0],
                                                                str(cv_results['params'].iloc[0]).replace('{',
                                                                                                          '').replace(
                                                                    '}', '')),
                      hovermode='closest',
                      template='none')
    fig.show()


def table_grid_search(clf, all_columns=False, all_ranks=False, save=True):
    """Show tables with the grid search results.

    Parameters
    ----------
    clf: estimator object result of a GridSearchCV
        This object contains all the information of the cross validated results for all the parameters combinations.

    all_columns: boolean, default: False
        If true all columns are returned. If false, the following columns are dropped:

        - params. As each parameter has a column with the value.
        - std_*. Standard deviations.
        - split*. Split scores.

    all_ranks: boolean, default: False
        If true all ranks are returned. If false, only the rows with rank equal to 1 are returned.

    save: boolean, default: True
        If true, results are saved to a CSV file.
    """
    # Convert the cross validated results in a DataFrame ordered by `rank_test_score` and `mean_fit_time`.
    # As it is frequent to have more than one combination with the same max score,
    # the one with the least mean fit time SHALL appear first.
    cv_results = pd.DataFrame(clf.cv_results_).sort_values(by=['rank_test_score', 'mean_fit_time'])

    # Reorder
    columns = cv_results.columns.tolist()
    # rank_test_score first, mean_test_score second and std_test_score third
    columns = columns[-1:] + columns[-3:-1] + columns[:-3]
    cv_results = cv_results[columns]

    if save:
        cv_results.to_csv('--'.join(cv_results['params'][0].keys()) + '.csv', index=True, index_label='Id')

    # Unless all_columns are True, drop not wanted columns: params, std_* split*
    if not all_columns:
        cv_results.drop('params', axis='columns', inplace=True)
        cv_results.drop(list(cv_results.filter(regex='^std_.*')), axis='columns', inplace=True)
        cv_results.drop(list(cv_results.filter(regex='^split.*')), axis='columns', inplace=True)

    # Unless all_ranks are True, filter out those rows which have rank equal to one
    if not all_ranks:
        cv_results = cv_results[cv_results['rank_test_score'] == 1]
        cv_results.drop('rank_test_score', axis = 'columns', inplace = True)        

    display(cv_results)

In [5]:
# Define the path where your CSV files are located
csv_path = "Data/*.csv"

# Get a list of all CSV file paths matching the pattern
csv_files = glob.glob(csv_path)

# Initialize an empty dictionary to store the DataFrames
dataframes = {}
# Read each CSV file into a DataFrame and store it in the dictionary
for file in csv_files:
    # Extract the file name without the extension as the dictionary key
    file_name = file.split(".",1)[0]
    file_name = file_name.split("Data/")[1]

        # Read the CSV file into a DataFrame
    with open(file, 'rb') as f:
        result = chardet.detect(f.read())
    encoding = result['encoding']
    df = pd.read_csv(file,encoding=encoding)
        
        # Store the DataFrame in the dictionary
    dataframes[file_name] = df

# Access the individual DataFrames by their respective keys


In [7]:
# Put all of the male games into one folder
maleGames = pd.DataFrame()
maleGames = pd.concat([dataframes["MRegularSeasonDetailedResults"],dataframes["MNCAATourneyDetailedResults"]], ignore_index=True)
maleGames

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108877,2022,146,1242,76,1274,50,N,0,29,58,...,21,9,13,5,21,7,14,7,4,20
108878,2022,146,1314,69,1389,49,N,0,25,61,...,16,9,10,4,25,11,7,4,7,18
108879,2022,152,1242,81,1437,65,N,0,29,54,...,31,8,10,12,17,12,9,3,0,11
108880,2022,152,1314,81,1181,77,N,0,27,64,...,22,12,20,13,25,12,4,7,4,18


In [8]:

# Set the maximum number of columns to display when using pandas to 500.
pd.set_option('display.max_columns', 500)

# Initialize a new column "CityID" in the "maleGames" DataFrame and assign all values to 0.
maleGames["CityID"] = 0

# Loop through each row in the "maleGames" DataFrame, starting from index 34074 to the end.
for x in range(34074, len(maleGames)):
    # Initialize a flag to indicate if the matching CityID is found for the current game.
    done = 0
    
    # Extract the current row as a pandas Series and store it in the variable "currGame".
    currGame = maleGames.iloc[x]
    
    # Get the highest and lowest indices of the "dataframes["MGameCities"]" DataFrame.
    hi = len(dataframes["MGameCities"])
    lo = 0
    
    # Check if the "Season" value of the current game is greater than 2009.
    if currGame["Season"] > 2009:
        # Binary search loop to find the matching CityID for the current game.
        while not done and hi != lo:
            # Calculate the middle index of the "dataframes["MGameCities"]" DataFrame.
            mid = int((hi + lo) / 2)
            
            # Compare the "Season" values of the current game and the middle row.
            if currGame["Season"] > dataframes["MGameCities"].iloc[mid]["Season"]:
                lo = mid
            elif currGame["Season"] < dataframes["MGameCities"].iloc[mid]["Season"]:
                hi = mid
            else:
                # If the "Season" values are equal, compare the "DayNum" values.
                if currGame["DayNum"] > dataframes["MGameCities"].iloc[mid]["DayNum"]:
                    lo = mid
                elif currGame["DayNum"] < dataframes["MGameCities"].iloc[mid]["DayNum"]:
                    hi = mid
                else:
                    # If the "DayNum" values are equal, compare the "WTeamID" values.
                    if currGame["WTeamID"] > dataframes["MGameCities"].iloc[mid]["WTeamID"]:
                        lo = mid
                    elif currGame["WTeamID"] < dataframes["MGameCities"].iloc[mid]["WTeamID"]:
                        hi = mid
                    else:
                        # If all the values match, assign the CityID from the matching row.
                        maleGames.loc[x, "CityID"] = dataframes["MGameCities"].iloc[mid]["CityID"]
                        done = 1

In [9]:
# Remove all of the games without a city
maleGames = maleGames[maleGames["CityID"]!= 0]
maleGames

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,CityID
34074,2010,7,1143,75,1293,70,H,0,24,52,5,12,22,32,13,19,15,12,8,1,14,26,52,8,21,10,15,11,20,11,17,7,3,24,4027
34075,2010,7,1314,88,1198,72,H,0,34,61,4,13,16,19,12,32,23,26,10,8,19,25,68,8,23,14,17,13,16,15,20,14,2,18,4061
34076,2010,7,1326,100,1108,60,H,0,39,73,14,33,8,12,13,34,29,6,8,6,19,21,61,7,17,11,20,11,24,7,13,4,5,16,4080
34077,2010,7,1393,75,1107,43,H,0,29,60,2,17,15,31,14,32,21,21,21,6,16,15,55,5,28,8,14,12,27,10,32,12,4,27,4340
34078,2010,9,1143,95,1178,61,H,0,29,61,7,17,30,35,15,30,11,10,5,3,20,20,57,4,18,17,27,14,18,14,15,4,2,21,4027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108877,2022,146,1242,76,1274,50,N,0,29,58,5,14,13,26,6,33,18,10,7,3,16,19,55,3,21,9,13,5,21,7,14,7,4,20,4071
108878,2022,146,1314,69,1389,49,N,0,25,61,6,22,13,21,12,33,15,6,6,5,12,18,60,4,16,9,10,4,25,11,7,4,7,18,4266
108879,2022,152,1242,81,1437,65,N,0,29,54,13,24,10,13,10,25,18,7,4,4,8,22,57,13,31,8,10,12,17,12,9,3,0,11,4237
108880,2022,152,1314,81,1181,77,N,0,27,64,10,26,17,24,15,33,7,10,2,5,19,30,72,5,22,12,20,13,25,12,4,7,4,18,4237


In [10]:
# Each column is being swapped to represent Team 1 and Team 2 instead of Winning team and losing team
column_name_mapping = {
    'WTeamID': 'Team1',
    'WScore' : 'Team1Score',
    'LTeamID': 'Team2',
    'LScore': 'Team2Score',
    'WLoc': 'Team1Loc',
    'WFGM': 'Team1FGM',
    'WFGA': 'Team1FGA',
    'WFGM3': 'Team1FGM3',
    'WFGA3': 'Team1FGA3',
    'WFTM': 'Team1FTM',
    'WOR': 'Team1OR',
    'WDR': 'Team1DR',
    'WAst': 'Team1Ast',
    'WTO': 'Team1TO',
    'WStl': 'Team1Stl',
    'WBlk': 'Team1Blk',
    'WPF': 'Team1PF',
    'LFGM': 'Team2FGM',
    'LFGA': 'Team2FGA',
    'LFGM3': 'Team2FGM3',
    'LFGA3': 'Team2FGA3',
    'LFTM': 'Team2FTM',
    'LOR': 'Team2OR',
    'LDR': 'Team2DR',
    'LAst': 'Team2Ast',
    'LTO': 'Team2TO',
    'LStl': 'Team2Stl',
    'LBlk': 'Team2Blk',
    'LPF': 'Team2PF'
}
# Each row is being duplicated and swapped around so that the winning team and losing team are both team 1 and team 2

newDf = pd.DataFrame(np.repeat(maleGames.values, 2, axis=0))
for x in range(1,len(newDf),2):
    newDf.loc[x,2],newDf.loc[x,4] = newDf.loc[x,4],newDf.loc[x,2]
    newDf.loc[x,3],newDf.loc[x,5] = newDf.loc[x,5],newDf.loc[x,3]
    for y in range(8,21):
       newDf.loc[x,y],newDf.loc[x,y+13] = newDf.loc[x,y+13],newDf.loc[x,y]    
newDf.columns = maleGames.columns
newDf.rename(columns=column_name_mapping, inplace=True)
# The winner and Point difference are added as new rows
for x in range(len(newDf)):
    if x % 2 == 0:
        newDf.loc[x, "Win"] = 1
    newDf.loc[x, "PointDif"] = newDf.loc[x, "Team1Score"] - newDf.loc[x, "Team2Score"]
newDf

Unnamed: 0,Season,DayNum,Team1,Team1Score,Team2,Team2Score,Team1Loc,NumOT,Team1FGM,Team1FGA,Team1FGM3,Team1FGA3,Team1FTM,WFTA,Team1OR,Team1DR,Team1Ast,Team1TO,Team1Stl,Team1Blk,Team1PF,Team2FGM,Team2FGA,Team2FGM3,Team2FGA3,Team2FTM,LFTA,Team2OR,Team2DR,Team2Ast,Team2TO,Team2Stl,Team2Blk,Team2PF,CityID,Win,PointDif
0,2010,7,1143,75,1293,70,H,0,24,52,5,12,22,32,13,19,15,12,8,1,14,26,52,8,21,10,15,11,20,11,17,7,3,24,4027,1.0,5.0
1,2010,7,1293,70,1143,75,H,0,26,52,8,21,10,15,11,20,11,17,7,3,24,24,52,5,12,22,32,13,19,15,12,8,1,14,4027,,-5.0
2,2010,7,1314,88,1198,72,H,0,34,61,4,13,16,19,12,32,23,26,10,8,19,25,68,8,23,14,17,13,16,15,20,14,2,18,4061,1.0,16.0
3,2010,7,1198,72,1314,88,H,0,25,68,8,23,14,17,13,16,15,20,14,2,18,34,61,4,13,16,19,12,32,23,26,10,8,19,4061,,-16.0
4,2010,7,1326,100,1108,60,H,0,39,73,14,33,8,12,13,34,29,6,8,6,19,21,61,7,17,11,20,11,24,7,13,4,5,16,4080,1.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148715,2022,152,1437,65,1242,81,N,0,22,57,13,31,8,10,12,17,12,9,3,0,11,29,54,13,24,10,13,10,25,18,7,4,4,8,4237,,-16.0
148716,2022,152,1314,81,1181,77,N,0,27,64,10,26,17,24,15,33,7,10,2,5,19,30,72,5,22,12,20,13,25,12,4,7,4,18,4237,1.0,4.0
148717,2022,152,1181,77,1314,81,N,0,30,72,5,22,12,20,13,25,12,4,7,4,18,27,64,10,26,17,24,15,33,7,10,2,5,19,4237,,-4.0
148718,2022,154,1242,72,1314,69,N,0,29,66,6,17,8,14,7,27,11,9,6,5,16,23,73,5,23,18,22,20,29,9,13,2,6,13,4237,1.0,3.0


In [11]:
# Each team in March Madness has their seed added
newDf 
newDf["Team1Seed"] = 0
newDf["Team2Seed"] = 0 
for x in range(0,len(newDf)):
    if(newDf["DayNum"].iloc[x]>131):
        currTeam1 = newDf["Team1"].iloc[x]
        currTeam2 = newDf["Team2"].iloc[x]
        currSeason = newDf["Season"].iloc[x]
        for y in range(0,len(dataframes["MNCAATourneySeeds"])):
            if(currTeam1 == dataframes["MNCAATourneySeeds"]["TeamID"].iloc[y] and currSeason == dataframes["MNCAATourneySeeds"]["Season"].iloc[y]):
                newDf.loc[x,"Team1Seed"] = dataframes["MNCAATourneySeeds"]["Seed"].iloc[y]
            if(currTeam2 == dataframes["MNCAATourneySeeds"]["TeamID"].iloc[y] and currSeason == dataframes["MNCAATourneySeeds"]["Season"].iloc[y]):
                newDf.loc[x,"Team2Seed"] = dataframes["MNCAATourneySeeds"]["Seed"].iloc[y]
copyDf = newDf.copy()
# This section updates new columns to work out each teams win/loss ratio
copyDf['AWin'] = (copyDf['Team1Score'] > copyDf['Team2Score']).astype(int)
copyDf['ALoss'] = (copyDf['Team1Score'] < copyDf['Team2Score']).astype(int)
copyDf['BWin'] = (copyDf['Team1Score'] < copyDf['Team2Score']).astype(int)
copyDf['BLoss'] = (copyDf['Team1Score'] > copyDf['Team2Score']).astype(int)
copyDf['AWins'] = copyDf.groupby(['Season', 'Team1'])['AWin'].transform('sum')
copyDf['ALosses'] = copyDf.groupby(['Season', 'Team1'])['ALoss'].transform('sum')
copyDf['BWins'] = copyDf.groupby(['Season', 'Team2'])['BWin'].transform('sum')
copyDf['BLosses'] = copyDf.groupby(['Season', 'Team2'])['BLoss'].transform('sum')
copyDf["TeamAWRatio"] = copyDf["AWins"] / (copyDf["AWins"] + copyDf["ALosses"])
copyDf["TeamBWRatio"] = copyDf["BWins"] / (copyDf["BWins"] + copyDf["BLosses"])
copyDf = copyDf.drop(["ALoss", "BLoss", "BWin", "AWins", "BLosses", "ALosses", "BWins"], axis= 1)
copyDf

Unnamed: 0,Season,DayNum,Team1,Team1Score,Team2,Team2Score,Team1Loc,NumOT,Team1FGM,Team1FGA,Team1FGM3,Team1FGA3,Team1FTM,WFTA,Team1OR,Team1DR,Team1Ast,Team1TO,Team1Stl,Team1Blk,Team1PF,Team2FGM,Team2FGA,Team2FGM3,Team2FGA3,Team2FTM,LFTA,Team2OR,Team2DR,Team2Ast,Team2TO,Team2Stl,Team2Blk,Team2PF,CityID,Win,PointDif,Team1Seed,Team2Seed,AWin,TeamAWRatio,TeamBWRatio
0,2010,7,1143,75,1293,70,H,0,24,52,5,12,22,32,13,19,15,12,8,1,14,26,52,8,21,10,15,11,20,11,17,7,3,24,4027,1.0,5.0,0,0,1,0.685714,0.852941
1,2010,7,1293,70,1143,75,H,0,26,52,8,21,10,15,11,20,11,17,7,3,24,24,52,5,12,22,32,13,19,15,12,8,1,14,4027,,-5.0,0,0,0,0.852941,0.685714
2,2010,7,1314,88,1198,72,H,0,34,61,4,13,16,19,12,32,23,26,10,8,19,25,68,8,23,14,17,13,16,15,20,14,2,18,4061,1.0,16.0,0,0,1,0.500000,0.193548
3,2010,7,1198,72,1314,88,H,0,25,68,8,23,14,17,13,16,15,20,14,2,18,34,61,4,13,16,19,12,32,23,26,10,8,19,4061,,-16.0,0,0,0,0.193548,0.500000
4,2010,7,1326,100,1108,60,H,0,39,73,14,33,8,12,13,34,29,6,8,6,19,21,61,7,17,11,20,11,24,7,13,4,5,16,4080,1.0,40.0,0,0,1,0.783784,0.064516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148715,2022,152,1437,65,1242,81,N,0,22,57,13,31,8,10,12,17,12,9,3,0,11,29,54,13,24,10,13,10,25,18,7,4,4,8,4237,,-16.0,Z02,Y01,0,0.789474,0.850000
148716,2022,152,1314,81,1181,77,N,0,27,64,10,26,17,24,15,33,7,10,2,5,19,30,72,5,22,12,20,13,25,12,4,7,4,18,4237,1.0,4.0,W08,X02,1,0.743590,0.820513
148717,2022,152,1181,77,1314,81,N,0,30,72,5,22,12,20,13,25,12,4,7,4,18,27,64,10,26,17,24,15,33,7,10,2,5,19,4237,,-4.0,X02,W08,0,0.820513,0.743590
148718,2022,154,1242,72,1314,69,N,0,29,66,6,17,8,14,7,27,11,9,6,5,16,23,73,5,23,18,22,20,29,9,13,2,6,13,4237,1.0,3.0,Y01,W08,1,0.850000,0.743590


In [12]:

# Create two copies of the original DataFrame "copyDf" to work with separately.
T1 = copyDf.copy()
T2 = copyDf.copy()

# Rename columns in "T1" and "T2" to distinguish between Team1 and Team2 statistics.
# For columns starting with "Team1," replace "Team1" with "opponent_" to get the corresponding columns for Team2.
# For columns starting with "Team2," replace "Team2" with "opponent_" to get the corresponding columns for Team1.
T1.columns = ["Team1" + x.replace("Team1", "").replace("Team2", "opponent_") for x in list(T1.columns)]
T2.columns = ["Team2" + x.replace("Team1", "").replace("Team2", "opponent_") for x in list(T2.columns)]

# Define a list of features to be used in further analysis or modeling.
features = ['Team1FGM', 'Team1FGA', 'Team1FGM3', 'Team1FGA3', 'Team1FTM', 'WFTA', 'Team1OR', 'Team1DR', 'Team1Ast', 'Team1TO', 'Team1Stl', 'Team1Blk', 'Team1PF', 'Team2FGM', 'Team2FGA', 'Team2FGM3', 'Team2FGA3', 'Team2FTM', 'LFTA',
            'Team2OR', 'Team2DR', 'Team2Ast', 'Team2TO', 'Team2Stl', 'Team2Blk', 'Team2PF', 'PointDif']

# Group "copyDf" by "Season" and 'Team1' and calculate the mean for the selected features.
meanDf = copyDf.groupby(["Season", 'Team1'])[features].mean().reset_index()

# Rename the first two columns of "T1" and "T2" to "Season" and "DayNum".
T2.columns.values[0] = "Season"
T1.columns.values[0] = "Season"
T1.columns.values[1] = "DayNum"
T2.columns.values[1] = "DayNum"

# Drop unnecessary columns from "T2" to focus on Team2 statistics only.
T2 = T2.drop(['Team2Score', 'Team2opponent_', 'Team2opponent_Score', 'Team2Loc', 'Team2NumOT', 'Team2opponent_Seed', 'Team2CityID', 'Team2Seed', 'Team2Win', 'Team2TeamAWRatio', 'Team2TeamBWRatio', 'Team2AWin'], axis=1)

# Drop unnecessary columns from "T1" to focus on Team1 statistics only.
T1 = T1.drop(['Team1Score', 'Team1opponent_', 'Team1opponent_Score', 'Team1Loc', 'Team1NumOT', 'Team1opponent_Seed', 'Team1CityID', 'Team1Seed', 'Team1Win', 'Team1TeamAWRatio', 'Team1TeamBWRatio', 'Team1AWin'], axis=1)
copyDf

Unnamed: 0,Season,DayNum,Team1,Team1Score,Team2,Team2Score,Team1Loc,NumOT,Team1FGM,Team1FGA,Team1FGM3,Team1FGA3,Team1FTM,WFTA,Team1OR,Team1DR,Team1Ast,Team1TO,Team1Stl,Team1Blk,Team1PF,Team2FGM,Team2FGA,Team2FGM3,Team2FGA3,Team2FTM,LFTA,Team2OR,Team2DR,Team2Ast,Team2TO,Team2Stl,Team2Blk,Team2PF,CityID,Win,PointDif,Team1Seed,Team2Seed,AWin,TeamAWRatio,TeamBWRatio
0,2010,7,1143,75,1293,70,H,0,24,52,5,12,22,32,13,19,15,12,8,1,14,26,52,8,21,10,15,11,20,11,17,7,3,24,4027,1.0,5.0,0,0,1,0.685714,0.852941
1,2010,7,1293,70,1143,75,H,0,26,52,8,21,10,15,11,20,11,17,7,3,24,24,52,5,12,22,32,13,19,15,12,8,1,14,4027,,-5.0,0,0,0,0.852941,0.685714
2,2010,7,1314,88,1198,72,H,0,34,61,4,13,16,19,12,32,23,26,10,8,19,25,68,8,23,14,17,13,16,15,20,14,2,18,4061,1.0,16.0,0,0,1,0.500000,0.193548
3,2010,7,1198,72,1314,88,H,0,25,68,8,23,14,17,13,16,15,20,14,2,18,34,61,4,13,16,19,12,32,23,26,10,8,19,4061,,-16.0,0,0,0,0.193548,0.500000
4,2010,7,1326,100,1108,60,H,0,39,73,14,33,8,12,13,34,29,6,8,6,19,21,61,7,17,11,20,11,24,7,13,4,5,16,4080,1.0,40.0,0,0,1,0.783784,0.064516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148715,2022,152,1437,65,1242,81,N,0,22,57,13,31,8,10,12,17,12,9,3,0,11,29,54,13,24,10,13,10,25,18,7,4,4,8,4237,,-16.0,Z02,Y01,0,0.789474,0.850000
148716,2022,152,1314,81,1181,77,N,0,27,64,10,26,17,24,15,33,7,10,2,5,19,30,72,5,22,12,20,13,25,12,4,7,4,18,4237,1.0,4.0,W08,X02,1,0.743590,0.820513
148717,2022,152,1181,77,1314,81,N,0,30,72,5,22,12,20,13,25,12,4,7,4,18,27,64,10,26,17,24,15,33,7,10,2,5,19,4237,,-4.0,X02,W08,0,0.820513,0.743590
148718,2022,154,1242,72,1314,69,N,0,29,66,6,17,8,14,7,27,11,9,6,5,16,23,73,5,23,18,22,20,29,9,13,2,6,13,4237,1.0,3.0,Y01,W08,1,0.850000,0.743590


In [13]:
T2.head()

Unnamed: 0,Season,DayNum,Team2,Team2FGM,Team2FGA,Team2FGM3,Team2FGA3,Team2FTM,Team2WFTA,Team2OR,Team2DR,Team2Ast,Team2TO,Team2Stl,Team2Blk,Team2PF,Team2opponent_FGM,Team2opponent_FGA,Team2opponent_FGM3,Team2opponent_FGA3,Team2opponent_FTM,Team2LFTA,Team2opponent_OR,Team2opponent_DR,Team2opponent_Ast,Team2opponent_TO,Team2opponent_Stl,Team2opponent_Blk,Team2opponent_PF,Team2PointDif
0,2010,7,1143,24,52,5,12,22,32,13,19,15,12,8,1,14,26,52,8,21,10,15,11,20,11,17,7,3,24,5.0
1,2010,7,1293,26,52,8,21,10,15,11,20,11,17,7,3,24,24,52,5,12,22,32,13,19,15,12,8,1,14,-5.0
2,2010,7,1314,34,61,4,13,16,19,12,32,23,26,10,8,19,25,68,8,23,14,17,13,16,15,20,14,2,18,16.0
3,2010,7,1198,25,68,8,23,14,17,13,16,15,20,14,2,18,34,61,4,13,16,19,12,32,23,26,10,8,19,-16.0
4,2010,7,1326,39,73,14,33,8,12,13,34,29,6,8,6,19,21,61,7,17,11,20,11,24,7,13,4,5,16,40.0


In [14]:
T1.head()

Unnamed: 0,Season,DayNum,Team1,Team1FGM,Team1FGA,Team1FGM3,Team1FGA3,Team1FTM,Team1WFTA,Team1OR,Team1DR,Team1Ast,Team1TO,Team1Stl,Team1Blk,Team1PF,Team1opponent_FGM,Team1opponent_FGA,Team1opponent_FGM3,Team1opponent_FGA3,Team1opponent_FTM,Team1LFTA,Team1opponent_OR,Team1opponent_DR,Team1opponent_Ast,Team1opponent_TO,Team1opponent_Stl,Team1opponent_Blk,Team1opponent_PF,Team1PointDif
0,2010,7,1143,24,52,5,12,22,32,13,19,15,12,8,1,14,26,52,8,21,10,15,11,20,11,17,7,3,24,5.0
1,2010,7,1293,26,52,8,21,10,15,11,20,11,17,7,3,24,24,52,5,12,22,32,13,19,15,12,8,1,14,-5.0
2,2010,7,1314,34,61,4,13,16,19,12,32,23,26,10,8,19,25,68,8,23,14,17,13,16,15,20,14,2,18,16.0
3,2010,7,1198,25,68,8,23,14,17,13,16,15,20,14,2,18,34,61,4,13,16,19,12,32,23,26,10,8,19,-16.0
4,2010,7,1326,39,73,14,33,8,12,13,34,29,6,8,6,19,21,61,7,17,11,20,11,24,7,13,4,5,16,40.0


In [15]:
# Copying all of the important columns into a new dataframe for analysing

regSeason = copyDf[['Season', 'DayNum', 'Team1', 'Team1Score', 'Team2', 'Team2Score', 'TeamAWRatio', 'TeamBWRatio']]

reg = regSeason.merge(T1, on = ['Season', 'Team1', 'DayNum' ])

reg = reg.merge(T2, on = ['Season', 'Team2', 'DayNum'])
reg['AWin'] = (reg['Team1Score'] > reg['Team2Score']).astype(int)
reg = reg.drop(["Team1PointDif", 'Team2PointDif'], axis= 1)

reg

Unnamed: 0,Season,DayNum,Team1,Team1Score,Team2,Team2Score,TeamAWRatio,TeamBWRatio,Team1FGM,Team1FGA,Team1FGM3,Team1FGA3,Team1FTM,Team1WFTA,Team1OR,Team1DR,Team1Ast,Team1TO,Team1Stl,Team1Blk,Team1PF,Team1opponent_FGM,Team1opponent_FGA,Team1opponent_FGM3,Team1opponent_FGA3,Team1opponent_FTM,Team1LFTA,Team1opponent_OR,Team1opponent_DR,Team1opponent_Ast,Team1opponent_TO,Team1opponent_Stl,Team1opponent_Blk,Team1opponent_PF,Team2FGM,Team2FGA,Team2FGM3,Team2FGA3,Team2FTM,Team2WFTA,Team2OR,Team2DR,Team2Ast,Team2TO,Team2Stl,Team2Blk,Team2PF,Team2opponent_FGM,Team2opponent_FGA,Team2opponent_FGM3,Team2opponent_FGA3,Team2opponent_FTM,Team2LFTA,Team2opponent_OR,Team2opponent_DR,Team2opponent_Ast,Team2opponent_TO,Team2opponent_Stl,Team2opponent_Blk,Team2opponent_PF,AWin
0,2010,7,1143,75,1293,70,0.685714,0.852941,24,52,5,12,22,32,13,19,15,12,8,1,14,26,52,8,21,10,15,11,20,11,17,7,3,24,26,52,8,21,10,15,11,20,11,17,7,3,24,24,52,5,12,22,32,13,19,15,12,8,1,14,1
1,2010,7,1293,70,1143,75,0.852941,0.685714,26,52,8,21,10,15,11,20,11,17,7,3,24,24,52,5,12,22,32,13,19,15,12,8,1,14,24,52,5,12,22,32,13,19,15,12,8,1,14,26,52,8,21,10,15,11,20,11,17,7,3,24,0
2,2010,7,1314,88,1198,72,0.500000,0.193548,34,61,4,13,16,19,12,32,23,26,10,8,19,25,68,8,23,14,17,13,16,15,20,14,2,18,25,68,8,23,14,17,13,16,15,20,14,2,18,34,61,4,13,16,19,12,32,23,26,10,8,19,1
3,2010,7,1198,72,1314,88,0.193548,0.500000,25,68,8,23,14,17,13,16,15,20,14,2,18,34,61,4,13,16,19,12,32,23,26,10,8,19,34,61,4,13,16,19,12,32,23,26,10,8,19,25,68,8,23,14,17,13,16,15,20,14,2,18,0
4,2010,7,1326,100,1108,60,0.783784,0.064516,39,73,14,33,8,12,13,34,29,6,8,6,19,21,61,7,17,11,20,11,24,7,13,4,5,16,21,61,7,17,11,20,11,24,7,13,4,5,16,39,73,14,33,8,12,13,34,29,6,8,6,19,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148715,2022,152,1437,65,1242,81,0.789474,0.850000,22,57,13,31,8,10,12,17,12,9,3,0,11,29,54,13,24,10,13,10,25,18,7,4,4,8,29,54,13,24,10,13,10,25,18,7,4,4,8,22,57,13,31,8,10,12,17,12,9,3,0,11,0
148716,2022,152,1314,81,1181,77,0.743590,0.820513,27,64,10,26,17,24,15,33,7,10,2,5,19,30,72,5,22,12,20,13,25,12,4,7,4,18,30,72,5,22,12,20,13,25,12,4,7,4,18,27,64,10,26,17,24,15,33,7,10,2,5,19,1
148717,2022,152,1181,77,1314,81,0.820513,0.743590,30,72,5,22,12,20,13,25,12,4,7,4,18,27,64,10,26,17,24,15,33,7,10,2,5,19,27,64,10,26,17,24,15,33,7,10,2,5,19,30,72,5,22,12,20,13,25,12,4,7,4,18,0
148718,2022,154,1242,72,1314,69,0.850000,0.743590,29,66,6,17,8,14,7,27,11,9,6,5,16,23,73,5,23,18,22,20,29,9,13,2,6,13,23,73,5,23,18,22,20,29,9,13,2,6,13,29,66,6,17,8,14,7,27,11,9,6,5,16,1


In [16]:
# Preparing data for learning
y = reg["AWin"]
x = reg.iloc[:, np.r_[0,2,4, 6:60]]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state = 0)

pred_Y = {"randomforest" : {}, "LogRegression": {}, "XGBoost": {}, "Bagging": {}, "SVM" : {} }



In [17]:
# Finding the number of features which pass a necessary variance threshold
x_copy = x.copy()
threshold = [0.05, 0.01, 0.005, 0.001]
num_features = {}
for i in range (0, 4):
    x_copy = x.copy()
    feature_selection = VarianceThreshold(threshold = threshold[i] )
    feature_selection.fit_transform(x_copy)
    num_features[threshold[i]] = x.shape[1]
num_features

{0.05: 57, 0.01: 57, 0.005: 57, 0.001: 57}

In [18]:
#Fitting a RandomForestClassifier to the data with each of the three scalers 
scalers = [
    StandardScaler(),
    RobustScaler(),
    MinMaxScaler(),
]


k_fold = KFold(n_splits=5, shuffle=True, random_state=0)


for i in range(0, 3):
    X_traincopy = X_train.copy()
    scalers[i].fit_transform(X_traincopy)
    RF = RandomForestClassifier(n_estimators= 100, max_depth = None)
    RF.fit(X_train, y_train)
    pred_Y["randomforest"] = RF.predict_proba(X_test)
    print(brier_score_loss(y_test, pred_Y["randomforest"][:,1]
    ))  


 

0.0562534471938318
0.05574987000179308
0.05584834364353595


In [19]:
# Performs feature selection based on importance determined by RF2
RF2 = RandomForestClassifier(n_estimators= 100, max_depth= None)
scaler = RobustScaler()
scaler.fit(X_train)
RF2.fit(X_train, y_train)
sel = SelectFromModel(RF2)
sel.get_support()
X_filter_test = sel.transform(X_test)
X_filter_train = sel.transform(X_train)



In [20]:
# Storing feature importance in an array
arr = pd.DataFrame( RF2.feature_importances_,RF2.feature_names_in_, columns=['Importance'])
arr = arr.sort_values(by=['Importance'], ascending= False)
arr

Unnamed: 0,Importance
Team2FGM,0.054076
Team1FGM,0.05236
Team2opponent_FGM,0.050911
Team2DR,0.048283
Team1opponent_FGM,0.046339
Team1DR,0.044754
Team2opponent_DR,0.041305
Team1opponent_DR,0.037981
Team2Ast,0.034414
TeamAWRatio,0.033829


In [21]:
# initializing scores and results
scores = {}
detailed_results= {}

In [22]:
# Performing a grid search to find the best parameters for the random forest
params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 6],
}

grid_search = GridSearchCV(RandomForestClassifier(),
                           param_grid=params, scoring= 'neg_brier_score', cv = 5)
grid_result = grid_search.fit(X_test, y_test)
scores['randomforest_train'] = grid_search.best_score_

scores['randomforest_test']  = brier_score_loss(y_test,grid_search.best_estimator_.predict_proba(X_test)[:,1]
 )




In [23]:
# Plotting the results of the search with different parameters
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(by=['rank_test_score', 'mean_fit_time'])
detailed_results["RandomForest"] = cv_results.iloc[0]
plot_grid_search(grid_result)



In [24]:
# Calculating the Brier score of the random forest classifier
grid_result = grid_search.fit(X_filter_test, y_test)
print(grid_search.best_score_)
scores['randomforest_filter_train'] = grid_result.best_score_

grid_search.best_estimator_.predict_proba(X_filter_test)
scores['randomforest_filter_test']  = brier_score_loss(y_test,grid_search.best_estimator_.predict_proba(X_filter_test)[:,1]
 )
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(by=['rank_test_score', 'mean_fit_time'])

detailed_results["RandomForest_filter"] = cv_results.iloc[0]



-0.10527657055225088


In [25]:
# Performing a grid search to find the best parameters for the XGB Classifier
learning_rate = [ 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
max_depth = [3, 4, 5, 6]


param_grid = dict(learning_rate=learning_rate, max_depth = max_depth,  gamma = [0, 0.5, 1, 1.5])
XG = XGBClassifier()

X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)


grid_search = GridSearchCV(XG, param_grid, scoring="neg_brier_score", cv=k_fold)

grid_result = grid_search.fit(X_test, y_test)



In [26]:
# Storing the results of the search
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(by=['rank_test_score', 'mean_fit_time'])
detailed_results["XG"] = cv_results.iloc[0]
# Plotting the results of the search with different parameters
display(cv_results)
scores['xg_train'] = grid_result.best_score_
plot_grid_search(grid_result)
scores['xg_test']  = brier_score_loss(y_test,grid_search.best_estimator_.predict_proba(X_test)[:,1]
 )

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_learning_rate,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
93,1.707536,0.190089,0.006394,0.000710,1.5,0.8,4,"{'gamma': 1.5, 'learning_rate': 0.8, 'max_dept...",-0.011448,-0.011964,-0.011866,-0.012719,-0.012700,-0.012139,0.000497,1
21,1.481912,0.046918,0.004858,0.000593,0,0.8,4,"{'gamma': 0, 'learning_rate': 0.8, 'max_depth'...",-0.010522,-0.012007,-0.014144,-0.012574,-0.012493,-0.012348,0.001162,2
45,1.512246,0.018801,0.005929,0.000483,0.5,0.8,4,"{'gamma': 0.5, 'learning_rate': 0.8, 'max_dept...",-0.010147,-0.012149,-0.014066,-0.012914,-0.012975,-0.012450,0.001304,3
69,1.661792,0.069708,0.005847,0.000530,1,0.8,4,"{'gamma': 1, 'learning_rate': 0.8, 'max_depth'...",-0.011524,-0.013018,-0.012503,-0.012980,-0.013284,-0.012662,0.000622,4
42,2.014346,0.063930,0.006772,0.000713,0.5,0.7,5,"{'gamma': 0.5, 'learning_rate': 0.7, 'max_dept...",-0.012566,-0.014325,-0.013220,-0.012044,-0.012820,-0.012995,0.000767,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,1.170105,0.019230,0.005497,0.000463,1.5,0.4,3,"{'gamma': 1.5, 'learning_rate': 0.4, 'max_dept...",-0.018467,-0.018939,-0.019464,-0.018784,-0.019561,-0.019043,0.000414,92
72,1.186551,0.029740,0.005302,0.000687,1.5,0.3,3,"{'gamma': 1.5, 'learning_rate': 0.3, 'max_dept...",-0.023345,-0.023226,-0.023449,-0.022587,-0.023072,-0.023136,0.000302,93
48,1.150525,0.023053,0.004899,0.000241,1,0.3,3,"{'gamma': 1, 'learning_rate': 0.3, 'max_depth'...",-0.023345,-0.023066,-0.023627,-0.022821,-0.023076,-0.023187,0.000276,94
0,0.950647,0.009586,0.004404,0.000527,0,0.3,3,"{'gamma': 0, 'learning_rate': 0.3, 'max_depth'...",-0.023345,-0.023406,-0.023629,-0.022821,-0.022966,-0.023233,0.000297,95


In [27]:
# Plotting the XG filter by rank_test_score and mean_fit_time
grid_result = grid_search.fit(X_filter_test, y_test)
scores['xg_filter'] = grid_result.best_score_
plot_grid_search(grid_result)
scores['xg_filter_test']  = brier_score_loss(y_test,grid_search.best_estimator_.predict_proba(X_filter_test)[:,1]
 )
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(by=['rank_test_score', 'mean_fit_time'])

detailed_results["XG_filter"] = cv_results.iloc[0]



In [28]:
# Performing a grid search to find the best parameters for the SGD Classifier
sgd = SGDClassifier(loss = 'log_loss')
parameters = {
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2], # learning rate
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'penalty':  ['l1', 'l2', 'elasticnet'],
    'eta0': [1, 10, 100], 

}
grid_search = GridSearchCV(sgd, parameters, cv=k_fold,  scoring='neg_brier_score')
grid_result = grid_search.fit(X_test, y_test)


In [29]:
# Storing and plotting the results of the SGD Classifier
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(by=['rank_test_score', 'mean_fit_time'])
detailed_results["SGD"] = cv_results.iloc[0]
display(cv_results)
scores['sgd_train'] = grid_result.best_score_
plot_grid_search(grid_result)
scores['sgd_test']  = brier_score_loss(y_test,grid_search.best_estimator_.predict_proba(X_test)[:,1]
 )

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_eta0,param_learning_rate,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
15,1.268995,0.083334,0.001578,0.000285,0.0001,10,adaptive,l1,"{'alpha': 0.0001, 'eta0': 10, 'learning_rate':...",-0.000008,-8.704317e-06,-0.000009,-0.000020,-0.000013,-0.000012,0.000004,1
24,0.769921,0.040304,0.001777,0.000442,0.0001,100,adaptive,l1,"{'alpha': 0.0001, 'eta0': 100, 'learning_rate'...",-0.000091,-1.047745e-07,-0.000112,-0.000112,-0.000025,-0.000068,0.000047,2
21,0.231620,0.025568,0.002051,0.001124,0.0001,100,invscaling,l1,"{'alpha': 0.0001, 'eta0': 100, 'learning_rate'...",-0.000066,-2.490824e-05,-0.000193,-0.000265,-0.000055,-0.000121,0.000092,3
6,1.223985,0.016383,0.007610,0.005270,0.0001,1,adaptive,l1,"{'alpha': 0.0001, 'eta0': 1, 'learning_rate': ...",-0.000302,-3.094798e-04,-0.000313,-0.000261,-0.000311,-0.000299,0.000020,4
0,0.222783,0.012481,0.003394,0.001281,0.0001,1,constant,l1,"{'alpha': 0.0001, 'eta0': 1, 'learning_rate': ...",-0.000711,-3.559335e-04,-0.000315,-0.000403,-0.000314,-0.000420,0.000149,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,0.294712,0.076654,0.002448,0.001287,10.0,100,constant,elasticnet,"{'alpha': 10.0, 'eta0': 100, 'learning_rate': ...",-0.487113,-4.988233e-01,-0.498823,-0.506892,-0.499832,-0.498297,0.006354,185
173,0.346683,0.102934,0.003157,0.002152,100.0,10,constant,elasticnet,"{'alpha': 100.0, 'eta0': 10, 'learning_rate': ...",-0.507452,-4.941351e-01,-0.494777,-0.500151,-0.495189,-0.498341,0.005032,186
153,0.253817,0.068966,0.003581,0.003773,10.0,100,constant,l1,"{'alpha': 10.0, 'eta0': 100, 'learning_rate': ...",-0.512887,-4.988233e-01,-0.498823,-0.493108,-0.499832,-0.500695,0.006541,187
180,0.260438,0.040575,0.003068,0.003268,100.0,100,constant,l1,"{'alpha': 100.0, 'eta0': 100, 'learning_rate':...",-0.512887,-4.988233e-01,-0.498823,-0.493108,-0.499832,-0.500695,0.006541,187


In [30]:
# Plotting the SGD filter by rank_test_score and mean_fit_time
grid_result = grid_search.fit(X_filter_test, y_test)
scores['sgd_filter_test'] = grid_result.best_score_
plot_grid_search(grid_result)
scores['sgd_filter_train']  = brier_score_loss(y_test,grid_search.best_estimator_.predict_proba(X_filter_test)[:,1]
 )
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(by=['rank_test_score', 'mean_fit_time'])

detailed_results["SGD_filter"] = cv_results.iloc[0]


In [31]:
# Performing a grid search to find the best parameters for the Gaussian Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
parameters = {
    'var_smoothing' : [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
}
grid_search = GridSearchCV(gnb, parameters, cv=k_fold,  scoring='neg_brier_score')
grid_result = grid_search.fit(X_filter_test, y_test)
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(by=['rank_test_score', 'mean_fit_time'])
display(cv_results)
scores['gnb_train'] = grid_result.best_score_
scores['gnb_test']  = brier_score_loss(y_test,grid_search.best_estimator_.predict_proba(X_filter_test)[:,1]
)
plot_grid_search(grid_result)
# Plotting the Gaussian Naive Bayes filter by rank_test_score and mean_fit_time
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(by=['rank_test_score', 'mean_fit_time'])

detailed_results["gnb"] = cv_results.iloc[0]


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.042984,0.009179,0.00729,0.002369,0.0,{'var_smoothing': 1e-09},-0.073057,-0.074757,-0.073861,-0.076898,-0.076508,-0.075016,0.001484,1
1,0.023438,0.000631,0.007541,0.00185,0.0,{'var_smoothing': 1e-08},-0.073057,-0.074757,-0.073861,-0.076898,-0.076508,-0.075016,0.001484,2
2,0.026215,0.001553,0.007264,0.004505,0.0,{'var_smoothing': 1e-07},-0.073057,-0.074757,-0.073861,-0.076899,-0.076508,-0.075016,0.001484,3
3,0.024294,0.001553,0.005618,0.000882,1e-06,{'var_smoothing': 1e-06},-0.073062,-0.074762,-0.073865,-0.076903,-0.076514,-0.075021,0.001484,4
4,0.023901,0.001174,0.005676,0.000271,1e-05,{'var_smoothing': 1e-05},-0.073114,-0.074811,-0.073906,-0.076948,-0.07657,-0.07507,0.001485,5
5,0.023986,0.000845,0.006123,0.001267,0.0001,{'var_smoothing': 0.0001},-0.073682,-0.07534,-0.074343,-0.077423,-0.077151,-0.075588,0.001487,6
8,0.029192,0.003044,0.006915,0.000573,0.1,{'var_smoothing': 0.1},-0.077733,-0.078578,-0.077711,-0.079742,-0.080425,-0.078838,0.001086,7
6,0.023937,0.001653,0.005504,0.000371,0.001,{'var_smoothing': 0.001},-0.077339,-0.07871,-0.077241,-0.080502,-0.080541,-0.078867,0.001447,8
7,0.025164,0.00126,0.008936,0.002926,0.01,{'var_smoothing': 0.01},-0.080115,-0.081267,-0.079606,-0.082833,-0.083021,-0.081368,0.001383,9
9,0.025939,0.002932,0.008186,0.00172,1.0,{'var_smoothing': 1},-0.093008,-0.092703,-0.093113,-0.093309,-0.093778,-0.093182,0.000357,10


In [32]:

grid_result = grid_search.fit(X_filter_test, y_test)
scores['gnb_filter_train'] = grid_result.best_score_
scores['gnb_filter_test']  = brier_score_loss(y_test,grid_search.best_estimator_.predict_proba(X_filter_test)[:,1]
)
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(by=['rank_test_score', 'mean_fit_time'])

detailed_results["gnb_filter"] = cv_results.iloc[0]
plot_grid_search(grid_result)


In [33]:

scores


{'randomforest_train': -0.10588688107332389,
 'randomforest_test': 0.10361089553641564,
 'randomforest_filter_train': -0.10527657055225088,
 'randomforest_filter_test': 0.10375253508292755,
 'xg_train': -0.01213944911926525,
 'xg_test': 0.0022932741272337276,
 'xg_filter': -0.04236346260013586,
 'xg_filter_test': 0.0356410556330695,
 'sgd_train': -1.1829234868766572e-05,
 'sgd_test': 6.088656268093158e-06,
 'sgd_filter_test': -0.044379397527881684,
 'sgd_filter_train': 0.04751334216886106,
 'gnb_train': -0.0750159663611173,
 'gnb_test': 0.07497361939904136,
 'gnb_filter_train': -0.0750159663611173,
 'gnb_filter_test': 0.07497361939904136}

In [34]:
detailed_results

{'RandomForest': mean_fit_time                                      2.714082
 std_fit_time                                       0.020989
 mean_score_time                                    0.065375
 std_score_time                                     0.000515
 param_max_depth                                           6
 param_n_estimators                                      100
 params                {'max_depth': 6, 'n_estimators': 100}
 split0_test_score                                 -0.107101
 split1_test_score                                 -0.103377
 split2_test_score                                 -0.107054
 split3_test_score                                 -0.104116
 split4_test_score                                 -0.107786
 mean_test_score                                   -0.105887
 std_test_score                                     0.001782
 rank_test_score                                           1
 Name: 4, dtype: object,
 'RandomForest_filter': mean_fit_time       