Mens Tourney Prediction Analysis

I feel the following are important in determining a team's success in the tourney

1) Seeding
2) Strength of Conference
3) Individual team statistics
4) Experience
5) Ability of team to win on the road


In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
from math import pi
import seaborn as sns
import time

from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import preprocessing, metrics,ensemble, model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, classification_report, confusion_matrix

pd.set_option('display.max_columns', 999)
pd.options.display.float_format = '{:.6f}'.format

start_time = time.time()

In [None]:
# Load every input table used by this notebook.
#
# Local-run alternative: the same competition files read from the working
# directory (kept commented out; enable these instead of the Kaggle paths).

#df_tourney = pd.read_csv('NCAATourneyCompactResults.csv')
#df_season = pd.read_csv('RegularSeasonDetailedResults.csv')
#df_teams = pd.read_csv('Teams.csv')
#df_seeds = pd.read_csv('NCAATourneySeeds.csv')
#df_conferences = pd.read_csv('Conferences.csv')
#df_rankings = pd.read_csv('MasseyOrdinals.csv')
#df_sample_sub = pd.read_csv('SampleSubmissionStage1.csv')

# Local-run alternative for the custom tourney-experience file.


#df_tourney_experience = pd.read_csv('tourney_experience_senior_class.csv')

# Kaggle kernel locations of the competition files.

df_tourney = pd.read_csv('../input/mens-machine-learning-competition-2018/NCAATourneyCompactResults.csv')
df_season = pd.read_csv('../input/mens-machine-learning-competition-2018/RegularSeasonDetailedResults.csv')
df_teams = pd.read_csv('../input/mens-machine-learning-competition-2018/Teams.csv')
df_seeds = pd.read_csv('../input/mens-machine-learning-competition-2018/NCAATourneySeeds.csv')
df_conferences = pd.read_csv('../input/mens-machine-learning-competition-2018/Conferences.csv')
df_rankings = pd.read_csv('../input/mens-machine-learning-competition-2018/MasseyOrdinals.csv')
df_sample_sub = pd.read_csv('../input/mens-machine-learning-competition-2018/SampleSubmissionStage1.csv')

# Private data file: one row per team, one column per year (read by get_wins).

df_tourney_experience = pd.read_csv('../input/ncaa-tourney-experience/Tourney_Experience_Senior_Class.csv')


In [None]:
df_season.head(5)

In [None]:
# Estimate the number of possessions in every regular-season game.
#
# Formula (https://www.nbastuffer.com/analytics101/possession/):
#     possessions = 0.96 * (FGA + TO + 0.44 * FTA - OR)
#
# The formula is applied to whole columns at once instead of calling a lambda
# for every game with DataFrame.apply(axis=1); the arithmetic (and its order)
# is unchanged, only the per-row Python overhead is gone.

wPos = 0.96 * (df_season['WFGA'] + df_season['WTO'] + 0.44 * df_season['WFTA'] - df_season['WOR'])
lPos = 0.96 * (df_season['LFGA'] + df_season['LTO'] + 0.44 * df_season['LFTA'] - df_season['LOR'])

# The two teams use almost the same number of possessions in a game
# (plus/minus one or two, depending on how periods end), so store the average.
# The column name keeps its original spelling because later cells read it.

df_season['Possesions'] = (wPos + lPos) / 2

In [None]:
df_season.head(5)

In [None]:
# Player Impact Estimate (PIE), used here at team level as a measure of team skill.
#
# PIE measures a side's overall statistical contribution against the total
# statistics of the game it played in:
#
#   (PTS + FGM + FTM - FGA - FTA + DREB + (.5 * OREB) + AST + STL + (.5 * BLK) - PF - TO)
#   / (GmPTS + GmFGM + GmFTM - GmFGA - GmFTA + GmDREB + (.5 * GmOREB) + GmAST + GmSTL + (.5 * GmBLK) - GmPF - GmTO)
#
# The game total in the denominator is the winner's plus the loser's numerator.


def _pie_numerator(games, side):
    """Return the PIE numerator for one side ('W' or 'L') of every game.

    Works on whole columns, so one call replaces a row-wise
    ``DataFrame.apply`` over the frame; the terms are summed in the same
    order as the formula above.
    """
    return (games[side + 'Score'] + games[side + 'FGM'] + games[side + 'FTM']
            - games[side + 'FGA'] - games[side + 'FTA']
            + games[side + 'DR'] + 0.5 * games[side + 'OR']
            + games[side + 'Ast'] + games[side + 'Stl'] + 0.5 * games[side + 'Blk']
            - games[side + 'PF'] - games[side + 'TO'])


wtmp = _pie_numerator(df_season, 'W')
ltmp = _pie_numerator(df_season, 'L')

# Each side's share of the game total; WPIE + LPIE == 1 for every game.
df_season['WPIE'] = wtmp/(wtmp + ltmp)
df_season['LPIE'] = ltmp/(wtmp + ltmp)



In [None]:
# "Four factors" statistics borrowed from NBA analytics
# (https://www.nbastuffer.com/analytics101/four-factors/).
#
# Every statistic is computed on whole columns rather than with a row-wise
# DataFrame.apply; the formulas are unchanged. With pandas column division a
# zero denominator produces inf/NaN for that game.

# Effective Field Goal Percentage = (FGM + 0.5 * 3-point FGM) / FGA
# You have to put the ball in the bucket eventually.

df_season['WeFGP'] = (df_season['WFGM'] + 0.5 * df_season['WFGM3']) / df_season['WFGA']
df_season['LeFGP'] = (df_season['LFGM'] + 0.5 * df_season['LFGM3']) / df_season['LFGA']

# Turnover Rate = TO / (FGA + 0.44 * FTA + TO)
# He who doesn't turn the ball over wins games.

df_season['WTOR'] = df_season['WTO'] / (df_season['WFGA'] + 0.44 * df_season['WFTA'] + df_season['WTO'])
df_season['LTOR'] = df_season['LTO'] / (df_season['LFGA'] + 0.44 * df_season['LFTA'] + df_season['LTO'])

# Offensive Rebounding Percentage = OR / (OR + opponent's DR)
# You can win games by controlling the offensive glass.

df_season['WORP'] = df_season['WOR'] / (df_season['WOR'] + df_season['LDR'])
df_season['LORP'] = df_season['LOR'] / (df_season['LOR'] + df_season['WDR'])

# Free Throw Rate: the attempts-based variant, FTA / FGA, is the one used here.
# You have to get to the line to win close games.

df_season['WFTAR'] = df_season['WFTA'] / df_season['WFGA']
df_season['LFTAR'] = df_season['LFTA'] / df_season['LFGA']

# The four factors are combined with these weights:
#   1. Shooting    (40%)
#   2. Turnovers   (25%)
#   3. Rebounding  (20%)
#   4. Free throws (15%)
#
# NOTE(review): turnover rate is added with a positive weight, so a team that
# turns the ball over more gets a HIGHER composite score - confirm whether the
# term should be subtracted (or inverted) before relying on this feature.

df_season['W4Factor'] = .40 * df_season['WeFGP'] + .25 * df_season['WTOR'] + .20 * df_season['WORP'] + .15 * df_season['WFTAR']
df_season['L4Factor'] = .40 * df_season['LeFGP'] + .25 * df_season['LTOR'] + .20 * df_season['LORP'] + .15 * df_season['LFTAR']
                                       


In [None]:
# Per-game efficiency statistics, computed on whole columns instead of with a
# row-wise DataFrame.apply (same formulas, no per-row Python call).

# Offensive efficiency (OffRtg) = points / possessions
# Every possession counts.

df_season['WOffRtg'] = df_season['WScore'] / df_season['Possesions']
df_season['LOffRtg'] = df_season['LScore'] / df_season['Possesions']

# Defensive efficiency (DefRtg) = opponent points / opponent possessions,
# i.e. simply the other side's offensive rating in the same game.
# Defense wins championships.

df_season['WDefRtg'] = df_season.LOffRtg
df_season['LDefRtg'] = df_season.WOffRtg

# Assist ratio = Ast / (FGA + 0.44 * FTA + Ast + TO)
# Distribute the rock - don't go isolation all the time.

df_season['WAstR'] = df_season['WAst'] / (df_season['WFGA'] + 0.44 * df_season['WFTA'] + df_season['WAst'] + df_season['WTO'])
df_season['LAstR'] = df_season['LAst'] / (df_season['LFGA'] + 0.44 * df_season['LFTA'] + df_season['LAst'] + df_season['LTO'])

# Defensive rebounding percentage = DR / (DR + opponent's OR)
# Control your own glass.

df_season['WDRP'] = df_season['WDR'] / (df_season['WDR'] + df_season['LOR'])
df_season['LDRP'] = df_season['LDR'] / (df_season['LDR'] + df_season['WOR'])

# Free throw percentage = FTM / FTA, defined as 0 for a game with no attempts.
# mask() overwrites exactly the rows where FTA < 1, which also discards the
# NaN/inf the division produces for those rows.

df_season['WFTPCT'] = (df_season['WFTM'] / df_season['WFTA']).mask(df_season['WFTA'] < 1, 0)
df_season['LFTPCT'] = (df_season['LFTM'] / df_season['LFTA']).mask(df_season['LFTA'] < 1, 0)



In [None]:
# The raw box-score columns are no longer needed once the derived statistics
# exist; remove them for the winning ('W') and then the losing ('L') side.
box_score_stats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']

for side in ('W', 'L'):
    df_season.drop([side + stat for stat in box_score_stats], axis=1, inplace=True)


In [None]:
df_season.head()

In [None]:
# Aggregate the per-game rows into one row per (Season, team).
#
# Each statistic is averaged separately over the games a team won (W columns)
# and the games it lost (L columns), then blended with the winning percentage
# as the weight. The frame's index comes from the WINS group-by, so it is keyed
# by (Season, WTeamID) until reset_index() turns the keys back into columns.

df_season_composite = pd.DataFrame()

win_keys = [df_season['Season'], df_season['WTeamID']]
loss_keys = [df_season['Season'], df_season['LTeamID']]

# Wins, losses and winning percentage.
df_season_composite['WINS'] = df_season['WTeamID'].groupby(win_keys).count()
df_season_composite['LOSSES'] = df_season['LTeamID'].groupby(loss_keys).count()
df_season_composite['WINPCT'] = df_season_composite['WINS'] / (df_season_composite['WINS'] + df_season_composite['LOSSES'])

# Per-game statistic suffixes, paired with the name of the blended season
# statistic built from them. The order fixes the column order of the result.
stat_to_composite = [
    ('PIE', 'PIE'),
    ('eFGP', 'FG_PCT'),
    ('TOR', 'TURNOVER_RATE'),
    ('ORP', 'OFF_REB_PCT'),
    ('FTAR', 'FT_RATE'),
    ('4Factor', '4FACTOR'),
    ('OffRtg', 'OFF_EFF'),
    ('DefRtg', 'DEF_EFF'),
    ('AstR', 'ASSIST_RATIO'),
    ('DRP', 'DEF_REB_PCT'),
    ('FTPCT', 'FT_PCT'),
]

# Averages over the games the team won.
for stat, _ in stat_to_composite:
    df_season_composite['W' + stat] = df_season['W' + stat].groupby(win_keys).mean()

# Averages over the games the team lost.
for stat, _ in stat_to_composite:
    df_season_composite['L' + stat] = df_season['L' + stat].groupby(loss_keys).mean()

# Weighted average: wins-average * WINPCT + losses-average * (1 - WINPCT).
for stat, composite in stat_to_composite:
    win_part = df_season_composite['W' + stat] * df_season_composite['WINPCT']
    loss_part = df_season_composite['L' + stat] * (1 - df_season_composite['WINPCT'])
    df_season_composite[composite] = win_part + loss_part

df_season_composite.reset_index(inplace = True)


In [None]:
#Kentucky and Wichita State went undefeated, which breaks the data: with no losses, LOSSES (and therefore WINPCT and every weighted average) is NaN

df_season_composite[df_season_composite['LOSSES'].isnull()]

In [None]:
# Repair the rows of teams that finished the regular season undefeated.
#
# Such a team never appears as a losing team, so its LOSSES count - and with it
# WINPCT and every WINPCT-weighted statistic - is NaN. For those rows the
# season statistic is simply the average over the games won.
#
# The rows are found with a mask on LOSSES instead of by hard-coded row labels:
# fixed labels only match one particular version of the data, and assigning
# through .loc with a label that does not exist would silently append a new row.

undefeated = df_season_composite['LOSSES'].isnull()

df_season_composite.loc[undefeated, 'WINPCT'] = 1
df_season_composite.loc[undefeated, 'LOSSES'] = 0

# Blended season statistic -> the wins-only average it falls back to.
wins_only_source = {
    'PIE': 'WPIE',
    'FG_PCT': 'WeFGP',
    'TURNOVER_RATE': 'WTOR',
    'OFF_REB_PCT': 'WORP',
    'FT_RATE': 'WFTAR',
    '4FACTOR': 'W4Factor',
    'OFF_EFF': 'WOffRtg',
    'DEF_EFF': 'WDefRtg',
    'ASSIST_RATIO': 'WAstR',
    'DEF_REB_PCT': 'WDRP',
    'FT_PCT': 'WFTPCT',
}

for composite, win_column in wins_only_source.items():
    df_season_composite.loc[undefeated, composite] = df_season_composite.loc[undefeated, win_column]


In [None]:
# Keep only the final summary statistics: remove the win/loss counts and the
# separate wins-only / losses-only averages they were blended from.

per_game_stats = ['PIE', 'eFGP', 'TOR', 'ORP', 'FTAR', '4Factor', 'OffRtg', 'DefRtg', 'AstR', 'DRP', 'FTPCT']

intermediate_columns = ['WINS', 'LOSSES']
for side in ('W', 'L'):
    intermediate_columns.extend(side + stat for stat in per_game_stats)

df_season_composite.drop(intermediate_columns, axis=1, inplace=True)


In [None]:
df_season_composite.head()

In [None]:
# Housekeeping for the correlation matrix: move WINPCT to the last column and
# give the team key its generic name.

columns = [name for name in df_season_composite.columns if name != 'WINPCT'] + ['WINPCT']
df_season_composite = df_season_composite[columns].rename(columns={'WTeamID':'TeamID'})
df_season_composite.head()

In [None]:
# Correlation heatmap of the season statistics against winning percentage.
#
# Author's reading of the plot: there are some good predictors of winning
# percentage - PIE is very strongly correlated with it, and turnovers and a
# bad defense hurt.

# iloc[:, 2:] skips the first two columns (the Season and TeamID keys) so that
# only the statistics and WINPCT are correlated.
corrmatrix = df_season_composite.iloc[:, 2:].corr()

f, ax = plt.subplots(figsize=(11, 7))
# The trailing semicolon suppresses the notebook's echo of the returned object.
sns.heatmap(corrmatrix, vmax=.8, cbar=True, annot=True, square=True);


In [None]:
#Strength of Schedule

#We will use the RPI ranking of the teams before entering the tourney to get a measure of strength of schedule.

#Rating Percentage Index (RPI) Formula=.25*(Team’s Winning Percentage)+
#.50*(Opponents’  Average Winning Percentage)+0.25*(Opponents’ Opponents’  Average Winning Percentage)

#The rating percentage index, commonly known as the RPI, is a quantity used to rank sports teams based upon
#a team's wins and losses and its strength of schedule. It is one of the sports rating systems by which NCAA basketball,
#baseball, softball, hockey, soccer, lacrosse, and volleyball teams are ranked.

#The final pre-tournament rankings each year have a RankingDayNum of 133.
#and can thus be used to make predictions of the games from the NCAA® tournament

In [None]:
# Keep only the RPI system's final pre-tournament ranking (RankingDayNum 133)
# for every team and season.
df_RPI = df_rankings[df_rankings['SystemName'] == 'RPI']

# drop() returns a new frame here: dropping in place on a filtered slice of
# df_RPI triggers pandas' SettingWithCopyWarning.
df_RPI_final = df_RPI[df_RPI['RankingDayNum'] == 133].drop(labels=['RankingDayNum', 'SystemName'], axis=1)
df_RPI_final.head()

In [None]:
#Get seeds of teams for all tourney games

df_seeds.head()

In [None]:
# Replace the seed code string with its numeric part: characters 1-2 of the
# code, converted to an integer.
#
# The string column is dropped before the numeric one is added so that 'Seed'
# ends up as the last column, which later positional slicing depends on.

seed_numbers = df_seeds['Seed'].map(lambda code: int(code[1:3]))
df_seeds.drop(labels=['Seed'], axis=1, inplace=True)
df_seeds['Seed'] = seed_numbers
df_seeds.head()

In [None]:
# Build one row of features per tournament team and season.

# Rankings only start in 2003, so earlier seasons are left out.
df_seeds_final = df_seeds[df_seeds['Season'] > 2002]

# Two left joins on (Season, TeamID): seeds + RPI rank, then + season statistics.
team_key = ['Season', 'TeamID']
df_tourney_stage = df_seeds_final.merge(df_RPI_final, how='left', on=team_key)
df_tourney_final = df_tourney_stage.merge(df_season_composite, how='left', on=team_key)
df_tourney_final.head()


In [None]:
#I couldnt figure out how to manipulate/calculate the way I wanted so I exported to Excel and am reimporting it back in here.

#df_tourney_experience = pd.read_csv('tourney_experience_senior_class.csv')

#This indicates the number of tourney games that the senior class would have played in going in to this
#years tourney (basically games played in the prior 3 tourneys) Using it as a gage of tourney experience of the team. 
#All things being equal between two #teams the team with more experience in the tourney I feel would win the game.

df_tourney_experience.tail()

In [None]:
#this function looks up the number of games for a year/team combination

def get_wins(year, teamid):
    """Return the tourney-experience value of ``teamid`` for ``year``.

    Reads the module-level ``df_tourney_experience`` frame: the row whose
    ``TeamID`` equals ``teamid`` and the column named ``str(year)``. If several
    rows match, the first one is used.

    Args:
        year: season; converted with ``str()`` to find the column.
        teamid: value to match against the ``TeamID`` column.

    Returns:
        The cell value (number of games) for that team and year.

    Raises:
        IndexError: if no row has ``TeamID == teamid``.
        KeyError: if there is no column named ``str(year)``.
    """
    is_team = df_tourney_experience['TeamID'] == teamid

    # Select by mask and column label so the lookup does not depend on the
    # frame having a default 0..n-1 index (an index *label* is not a valid
    # position for iloc in general).
    games = df_tourney_experience.loc[is_team, str(year)].iloc[0]

    return games


In [None]:
# Look up the tourney-experience value for every (Season, TeamID) row of
# df_tourney_final, in row order, and collect the values in a one-column frame.

years = df_tourney_final['Season']
teams = df_tourney_final['TeamID']

# One get_wins() call per row. zip() walks both columns in step, so nothing
# here depends on the frame's index labels.
result = [get_wins(year, team) for year, team in zip(years, teams)]

team_experience = pd.DataFrame(result, columns=['experience'])

team_experience.head()
  


In [None]:
# Attach the experience column to the team features.
#
# concat(axis=1) aligns the two frames on their index. team_experience was
# built row by row from df_tourney_final, so both are expected to carry the
# same 0..n-1 index - NOTE(review): confirm if df_tourney_final is ever
# re-indexed or filtered before this cell.

df_tourney_final = pd.concat((df_tourney_final, team_experience), axis=1)

df_tourney_final.head()

In [None]:
# Reduce the tournament results to (Season, WTeamID, LTeamID) and attach the
# seed of the winning and of the losing team.

df_tourney.drop(labels=['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], axis=1, inplace=True)

# First pass joins the winner's seed, second pass the loser's. Because both
# passes bring in 'TeamID' and 'Seed', pandas suffixes them _x (winner) and
# _y (loser).
for team_column in ('WTeamID', 'LTeamID'):
    df_tourney = df_tourney.merge(df_seeds, how='left', left_on=['Season', team_column], right_on=['Season', 'TeamID'])

df_tourney = (
    df_tourney
    .drop(labels=['TeamID_x', 'TeamID_y'], axis=1)
    .rename(columns={'Seed_x':'WSeed', 'Seed_y':'LSeed'})
)
df_tourney.head()

In [None]:
# Distribution of (winner's seed - loser's seed) over all tournament games.
# It shows how strongly seeding decides games:
#
#  - a 16 seed beating a 1 seed would show up as +15; the author notes that
#    value is absent from this data
#  - a 15 seed beating a 2 seed (+13) is very rare
#
# Seed difference therefore needs to be in the model.

df_tourney['SeedDiff'] = df_tourney['WSeed'] - df_tourney['LSeed']

# Pass the column by keyword: the meaning of the first positional argument of
# countplot differs between seaborn versions, x= is understood by all of them.
sns.countplot(x=df_tourney['SeedDiff'])

In [None]:
# Quick check of how well seed difference alone predicts the result.
#
# Every game is used twice: once from the winner's point of view
# (SeedDiff, Result=1) and once from the loser's (-SeedDiff, Result=0).

df_wins = df_tourney[['SeedDiff']].copy()
df_wins['Result'] = 1

df_losses = -df_tourney[['SeedDiff']]
df_losses['Result'] = 0

df_predictions = pd.concat([df_wins, df_losses])
df_predictions.head()

In [None]:
# Training arrays for the seed-only model: a single-column feature matrix of
# seed differences and the matching 0/1 results, shuffled together.

seed_diff_column = df_predictions[['SeedDiff']].values   # shape (n_rows, 1)
result_values = df_predictions['Result'].values
X_train, y_train = shuffle(seed_diff_column, result_values)

In [None]:
# Seed-only model: logistic regression tuned with a grid search.
#
# The regularisation strength C is searched over nine powers of ten from 1e-5
# to 1e3 with 10-fold cross validation. Scoring is 'neg_log_loss', so
# best_score_ is the NEGATIVE log loss (closer to 0 is better); refit=True
# leaves clf fitted on all of X_train with the best C.

logreg = LogisticRegression(random_state=0)
params = {'C': np.logspace(start=-5, stop=3, num=9)}
clf = GridSearchCV(logreg, params, scoring='neg_log_loss', refit=True, cv=10, )
clf.fit(X_train, y_train)
# NOTE: the value printed as "log_loss" is clf.best_score_, i.e. negated.
print('Best log_loss: {:.4}, with best C: {}'.format(clf.best_score_, clf.best_params_['C']))

In [None]:
# Predicted win probability as a function of seed difference: the model
# reflects the low probability of major upsets for large seed differentials.

# Seeds run from 1 to 16, so the differential ranges from -15 to +15
# inclusive; the stop value of arange is exclusive, hence 16.
X = np.arange(-15, 16).reshape(-1, 1)
preds = clf.predict_proba(X)[:,1]  # column 1 is the probability of a win

plt.plot(X, preds)
plt.xlabel('Team1 seed - Team2 seed')
plt.ylabel('P(Team1 will win)')

In [None]:
# Accuracy of the seed-only model on the data it was trained on (the author
# reports roughly 70%). This is a training-set figure, not a held-out estimate.

train_acc = accuracy_score(y_true=y_train, y_pred=clf.predict(X_train))
        
print('Training Accuracy: %.2f%%' % (100 * train_acc))


In [None]:
df_tourney.head()

In [None]:
#sample submission file

df_sample_sub.head()

In [None]:
#This generates a submission file for 2014-2017 using the simple Seeds model

n_test_games = len(df_sample_sub)

def get_year_t1_t2(ID):
    """Return a tuple with ints `year`, `team1` and `team2`.

    ``ID`` is an underscore-separated string; every part is converted with
    ``int()``. A real tuple is returned (not a one-shot generator) so the
    result can be unpacked, indexed, compared and reused.

    Raises:
        ValueError: if a part of ``ID`` is not an integer literal.
    """
    return tuple(int(part) for part in ID.split('_'))

# Build the single-feature test matrix (team1 seed - team2 seed) for every
# matchup in the sample submission, predict with the seed-only model and write
# the submission file.

# (Season, TeamID) -> Seed, built once. This replaces two boolean filters over
# df_tourney_final for every submission row with two dictionary lookups.
seed_by_season_team = dict(zip(
    zip(df_tourney_final['Season'], df_tourney_final['TeamID']),
    df_tourney_final['Seed'],
))

X_test = np.zeros(shape=(n_test_games, 1))

# enumerate() supplies the row position, so filling X_test does not rely on
# df_sample_sub carrying a default 0..n-1 index.
for ii, game_id in enumerate(df_sample_sub['ID']):
    year, t1, t2 = get_year_t1_t2(game_id)
    team1 = seed_by_season_team[(year, t1)]   # KeyError if the team has no seed that season
    team2 = seed_by_season_team[(year, t2)]
    diff_seed = team1 - team2
    X_test[ii, 0] = diff_seed


preds = clf.predict_proba(X_test)[:,1]   # column 1 is the probability that team1 wins

df_sample_sub['Pred'] = preds

df_sample_sub.to_csv('SeedModel.csv', index=False)

df_sample_sub.head()



In [None]:
df_tourney_final.head()

In [None]:
# List of all tournament matchups since 2003 as (Season, WTeamID, LTeamID),
# re-read from the results file and re-indexed from 0.

# Local-run alternative:
#df_tourney_list = pd.read_csv('NCAATourneyCompactResults.csv')
df_tourney_list = pd.read_csv('../input/mens-machine-learning-competition-2018/NCAATourneyCompactResults.csv')
df_tourney_list = (
    df_tourney_list
    .drop(labels=['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], axis=1)
    .loc[lambda games: games['Season'] > 2002]
    .reset_index(drop=True)
)
df_tourney_list.head()

In [None]:
# Features of the winning team of every matchup (left join keeps one row per
# game; the redundant TeamID key from the feature table is removed).

df_model_winners = (
    df_tourney_list
    .merge(df_tourney_final, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
    .drop(labels=['TeamID'], axis=1)
)
df_model_winners.head()

In [None]:
# Features of the losing team of every matchup (left join keeps one row per
# game; the redundant TeamID key from the feature table is removed).

df_model_losers = (
    df_tourney_list
    .merge(df_tourney_final, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
    .drop(labels=['TeamID'], axis=1)
)
df_model_losers.head()

In [None]:
# Feature differences per matchup, from both points of view. The first three
# columns (Season, WTeamID, LTeamID) are keys, so only columns 3+ are
# subtracted; the keys are joined back on the row index afterwards.

winner_features = df_model_winners.iloc[:, 3:]
loser_features = df_model_losers.iloc[:, 3:]

# Winner minus loser, labelled 1 (win).
df_model_winner_diff = winner_features - loser_features
df_model_winner_diff['result'] = 1
df_model_winner_diff = df_model_winner_diff.join(df_tourney_list, how='inner')

# Loser minus winner, labelled 0 (loss).
df_model_loser_diff = loser_features - winner_features
df_model_loser_diff['result'] = 0
df_model_loser_diff = df_model_loser_diff.join(df_tourney_list, how='inner')



In [None]:
df_model_winner_diff.head()

In [None]:
df_model_loser_diff.head()

In [None]:

# Stack the winner-view and loser-view rows, order them by season and number
# the rows from 0 again.
df_predictions_tourney = (
    pd.concat([df_model_winner_diff, df_model_loser_diff], axis=0)
    .sort_values('Season')
    .reset_index(drop=True)
)

In [None]:
# NOTE: this head() is not the last expression of the cell, so the notebook
# does not display it.
df_predictions_tourney.head()
# Save the modelling table to the working directory, without the row index.
df_predictions_tourney.to_csv("df_predictions_tourney.csv", index=None)

In [None]:
# The plan is to test out 6 different models using Grid Search Cross Validation

#1 -  Ranks  -   This will be the RPI plus Seed

#2 -  Experience  - This will be the experience feature only

#3 -  Stats -  This will be the seasons teams statistics features

#4 -  Full -  encompassing all features of models 1, 2, and 3

#5 - An ensemble model with features being the actual predictions of models 1, 2, and 3

#6 - An ensemble model with features being the actual predictions of models 1, 2, and 3 plus model 4

#Time to split the dataframe into its various components for modeling and testing



In [None]:
# Test the predictive power of the team statistic features on the training
# seasons before full modelling: univariate ANOVA F-test per feature.

from sklearn.feature_selection import SelectPercentile, f_classif

# Features and labels are both selected with the same Season mask, so they
# cannot drift apart the way a hard-coded row count and a mask could.
is_training_season = df_predictions_tourney['Season'] < 2014

# Columns 2-13 are the season statistic differences.
X_features_stats = df_predictions_tourney.loc[is_training_season].iloc[:, 2:14]
y = df_predictions_tourney['result'][is_training_season]

# percentile=100 keeps every feature; the selector is used only for its scores.
selector = SelectPercentile(f_classif, percentile=100)
selector.fit(X_features_stats, y)
p_scores = (selector.pvalues_)
F_scores = (selector.scores_)

df_significance = pd.DataFrame({"Feature": X_features_stats.columns, "p_value":p_scores , "F_score":F_scores})

df_significance

In [None]:
# Based on the table above, DEF_REB_PCT and FT_PCT are dropped: their p-values
# are > 0.05, so we cannot reject the null hypothesis that they are unrelated
# to the result. FT_RATE is dropped as well, because its lower F_score suggests
# it does not have much predictive power.

# Drop from the matchup table used for training analysis.

df_predictions_tourney.drop(labels=['DEF_REB_PCT', 'FT_PCT', 'FT_RATE' ], inplace=True, axis=1)

# Drop from the team statistics table used to build the tourney test features.

df_tourney_final.drop(labels=['DEF_REB_PCT', 'FT_PCT', 'FT_RATE' ], inplace=True, axis=1)


In [None]:
# Split the dataset into feature groups and into training / submission seasons.
#
# Column positions at this point: 0-1 Seed and RPI rank, 2-10 season
# statistics, 11 experience, 12 result, 13+ the Season / WTeamID / LTeamID keys.

labels = df_predictions_tourney['result']
IDs = df_predictions_tourney.iloc[:, 13:]
features = df_predictions_tourney.iloc[:, 0:12]                  # model 4
features_rank = df_predictions_tourney.iloc[:, 0:2]              # model 1
features_experience = df_predictions_tourney.iloc[:, 11:12]      # model 2
features_stats = df_predictions_tourney.iloc[:, 2:11]            # model 3

# Rows are selected by season instead of by a hard-coded row count, so labels
# and features always come from exactly the same rows.
is_training_season = df_predictions_tourney['Season'] < 2014
is_submission_season = df_predictions_tourney['Season'] > 2013

training_rows = df_predictions_tourney.loc[is_training_season]
submission_rows = df_predictions_tourney.loc[is_submission_season]

# Test data set (seasons 2014-2017)

labels_submission = df_predictions_tourney['result'][is_submission_season]
IDs_submission = submission_rows.iloc[:, 13:]
features_submission = submission_rows.iloc[:,  0:12]

# Training data set (seasons 2003 through 2013)

y = df_predictions_tourney['result'][is_training_season]
IDs_training = training_rows.iloc[:, 13:]
X_features = training_rows.iloc[:, 0:12]
X_features_rank = training_rows.iloc[:, 0:2]
X_features_experience = training_rows.iloc[:, 11:12]
X_features_stats = training_rows.iloc[:, 2:11]


In [None]:
# Univariate F-test of the Seed and RPI features on the training seasons,
# run before full modelling to check that both carry signal.

from sklearn.feature_selection import SelectPercentile, f_classif

# percentile=100 keeps every feature; only the fitted scores are of interest.
selector = SelectPercentile(f_classif, percentile=100).fit(X_features_rank, y)
p_scores, F_scores = selector.pvalues_, selector.scores_

df_significance = pd.DataFrame({
    "Feature": X_features_rank.columns,
    "p_value": p_scores,
    "F_score": F_scores,
})

df_significance

In [None]:
# Both ranking stats are powerful predictors so we will keep them!


In [None]:
# Univariate F-test of the experience feature on the training seasons,
# run before full modelling to check that it carries signal.

from sklearn.feature_selection import SelectPercentile, f_classif

# percentile=100 keeps every feature; only the fitted scores are of interest.
selector = SelectPercentile(f_classif, percentile=100).fit(X_features_experience, y)
p_scores, F_scores = selector.pvalues_, selector.scores_

df_significance = pd.DataFrame({
    "Feature": X_features_experience.columns,
    "p_value": p_scores,
    "F_score": F_scores,
})

df_significance

In [None]:
# Experience is a factor as well. Keep it!


In [None]:
# Training for Model #1 (Experience)
# NOTE(review): the modelling plan and the feature-split comments earlier in
# this notebook number the Experience model as #2 and the Ranks model as #1 -
# confirm which numbering is intended.

# Hold out 20% of the training seasons (stratified on the result) for a final
# accuracy check; the remaining 80% is used for the cross-validated search.

X_train, X_test, y_train, y_test = train_test_split(X_features_experience, y, train_size=0.8, test_size=0.2, random_state=1, stratify=y)

# Initiating classifiers

clf1 = LogisticRegression()

clf3 = XGBClassifier()

clf4 = DecisionTreeClassifier()

clf5 = RandomForestClassifier()

# Setting up the parameter grids. Only the logistic regression is searched
# inside a pipeline, hence the 'clf1__' step prefix on its parameter name.

param_grid1 = [{'clf1__C': list(np.logspace(start=-5, stop=3, num=9))}]

param_grid3 = [{'learning_rate' : [0.1, 0.3],
                'max_depth': [3, 6],
                'min_child_weight': list(range(1, 3))}]

param_grid4 = [{'max_depth': list(range(3, 6)),
                'criterion': ['gini', 'entropy'],
                'min_samples_leaf': [20, 50]}]

param_grid5 = [{'max_depth': list(range(1, 5)),
                'criterion': ['gini', 'entropy'],
                'min_samples_split' : [2, 3]}]

# Building the pipelines. pipe3-pipe5 are created but the search below uses
# the bare estimators clf3-clf5, matching the un-prefixed parameter grids.

pipe1 = Pipeline([('std', StandardScaler()),('clf1', clf1)])

pipe3 = Pipeline([('std', StandardScaler()),('clf3', clf3)])

pipe4 = Pipeline([('std', StandardScaler()),('clf4', clf4)])

pipe5 = Pipeline([('std', StandardScaler()),('clf5', clf5)])


# Setting up multiple GridSearchCV objects, 1 for each algorithm

gridcvs = {}

# No random_state: with shuffle=False the folds are already deterministic, and
# current scikit-learn raises a ValueError when a random_state is supplied
# without shuffling.
inner_cv = StratifiedKFold(n_splits=10, shuffle=False)
outer_cv = StratifiedKFold(n_splits=10, shuffle=False)

for pgrid, est, name in zip((param_grid1, param_grid3, param_grid4, param_grid5),
                            (pipe1, clf3, clf4, clf5,),
                            ('Logistic', 'XGBoost', 'DTree', 'Random Forest')):

    # Step 1: grid search with (inner) cross validation to find the best
    # parameters; refit=True leaves gcv fitted on all of X_train.

    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='neg_log_loss',
                       cv=inner_cv,
                       verbose=0,
                       refit=True,
                       return_train_score=False)

    gcv.fit(X_train, y_train)

    gridcvs[name] = gcv

    print(name)
    print()
    print(gcv.best_estimator_)
    print()
    # best_score_ is a negative log loss, not a percentage.
    print('Best score on Grid Search Cross Validation is %.5f (negative log loss)' % (gcv.best_score_))
    print()
    results = pd.DataFrame(gcv.cv_results_)

    # Step 2: nested cross validation - the whole search is re-run inside each
    # outer fold to estimate how well the tuned model generalises.
    # cross_val_score works on clones, so gcv itself stays fitted.

    nested_score = cross_val_score(gcv,
                                  X=X_train,
                                  y=y_train,
                                  cv=outer_cv,
                                  scoring='neg_log_loss')

    print('Name, Log Loss, Std Dev, based on Best Parameter Model using Cross Validation Scoring')
    # Mean and standard deviation are reported on the same scale.
    print('%s | %.4f %.4f' % (name,  nested_score.mean(),  nested_score.std(),))
    print()

    # Step 3: predictions and probabilities from the model refitted in step 1
    # (fitting gcv again here would only repeat the whole grid search).

    best_algo = gcv

    predictions = best_algo.predict(X_test)
    probability = best_algo.predict_proba(X_test)

    train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
    test_acc = accuracy_score(y_true=y_test, y_pred=predictions)

    print('Training Accuracy: %.2f%%' % (100 * train_acc))
    print('Test Accuracy: %.2f%%' % (100 * test_acc))
    print()

    # Classification report and confusion matrix on the held-out 20%.

    print(classification_report(y_test,predictions))
    print()
    print(confusion_matrix(y_test,predictions))
    print()
        

In [None]:
# Model 1 analysis

# Best score (negative log loss, closer to 0 is better) on Grid Search Cross Validation:
# Logistic - -0.64
# XGBoost  - -0.66
# DTree    - -0.65
# RForest  - -0.64


# Models do better than 50/50 so there is value here

# We will choose Random Forest here


In [None]:
# Training for Model #2 (Ranks)

# Hold out a stratified 20% test set; the remaining 80% is used for tuning
# and cross validation.

X_train, X_test, y_train, y_test = train_test_split(X_features_rank, y, train_size=0.8, test_size=0.2, random_state=1, stratify=y)

# Initiating classifiers

clf1 = LogisticRegression()

clf3 = XGBClassifier()

clf4 = DecisionTreeClassifier()

clf5 = RandomForestClassifier()

# Setting up the parameter grids.  Only the logistic grid uses the 'clf1__'
# step prefix because it is the only estimator searched inside a pipeline.

param_grid1 = [{'clf1__C': list(np.logspace(start=-5, stop=3, num=9))}]

param_grid3 = [{'learning_rate' : [0.1, 0.3],
                'max_depth': [3, 6],
                'min_child_weight': list(range(1, 3))}]

param_grid4 = [{'max_depth': list(range(3, 6)),
                'criterion': ['gini', 'entropy'],
                'min_samples_leaf': [20, 50]}]

param_grid5 = [{'max_depth': list(range(1, 5)),
                'criterion': ['gini', 'entropy'],
                'min_samples_split' : [2, 3]}]

# Building the pipelines (only pipe1 is passed to the grid search below)

pipe1 = Pipeline([('std', StandardScaler()),('clf1', clf1)])

pipe3 = Pipeline([('std', StandardScaler()),('clf3', clf3)])

pipe4 = Pipeline([('std', StandardScaler()),('clf4', clf4)])

pipe5 = Pipeline([('std', StandardScaler()),('clf5', clf5)])


# Setting up multiple GridSearchCV objects, 1 for each algorithm

gridcvs = {}

# random_state only has an effect when shuffle=True; passing it together with
# shuffle=False raises a ValueError in current scikit-learn, so it is omitted.
# The folds are the deterministic, unshuffled stratified splits.
inner_cv = StratifiedKFold(n_splits=10, shuffle=False)
outer_cv = StratifiedKFold(n_splits=10, shuffle=False)

for pgrid, est, name in zip((param_grid1, param_grid3, param_grid4, param_grid5),
                            (pipe1, clf3, clf4, clf5,),
                            ('Logistic', 'XGBoost', 'DTree', 'Random Forest')):

    # Grid search with cross validation to find the best parameters.  With
    # refit=True the best configuration is refitted on all of X_train.

    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='neg_log_loss',
                       cv=inner_cv,
                       verbose=0,
                       refit=True,
                       return_train_score=False)

    gcv.fit(X_train, y_train)

    gridcvs[name] = gcv

    print(name)
    print()
    print(gcv.best_estimator_)
    print()
    # The score is a negative log loss, not a percentage
    print('Best score on Grid Search Cross Validation is %.5f' % (gcv.best_score_))
    print()
    results = pd.DataFrame(gcv.cv_results_)

    # Nested cross validation: cross_val_score clones the grid search and
    # re-runs it on every outer training fold, scoring the tuning procedure.

    nested_score = cross_val_score(gcv,
                                   X=X_train,
                                   y=y_train,
                                   cv=outer_cv,
                                   scoring='neg_log_loss')

    print('Name, Log Loss, Std Dev, based on Best Parameter Model using Cross Validation Scoring')
    # Mean and standard deviation are reported on the same scale
    print('%s | %.4f %.4f' % (name,  nested_score.mean(),  nested_score.std(),))
    print()

    # Generate predictions and probabilities.  gcv is still fitted from the
    # search above (cross_val_score only fits clones), so it is used as is.

    best_algo = gcv

    predictions = best_algo.predict(X_test)
    probability = best_algo.predict_proba(X_test)

    train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
    test_acc = accuracy_score(y_true=y_test, y_pred=predictions)

    print('Training Accuracy: %.2f%%' % (100 * train_acc))
    print('Test Accuracy: %.2f%%' % (100 * test_acc))
    print()

    # prints classification report and confusion matrix

    print(classification_report(y_test,predictions))
    print()
    print(confusion_matrix(y_test,predictions))
    print()
        

In [None]:
# Model 2 Analysis

# Best score (negative log loss, closer to 0 is better) on Grid Search Cross Validation:
# Logistic - -0.54
# XGBoost  - -0.56
# DTree    - -0.58
# RForest  - -0.54

# Models do better than 50/50 and much better than Experience model

# Logistic and RForest are very close again but RForest has a little higher accuracy

# We will choose RForest

In [None]:
# Training for Model #3 (Stats)

# Hold out a stratified 20% test set; the remaining 80% is used for tuning
# and cross validation.

X_train, X_test, y_train, y_test = train_test_split(X_features_stats, y, train_size=0.8, test_size=0.2, random_state=1, stratify=y)

# Initiating classifiers

clf1 = LogisticRegression()

clf3 = XGBClassifier(n_estimators=30)

clf4 = DecisionTreeClassifier()

clf5 = RandomForestClassifier()

# Setting up the parameter grids.  Only the logistic grid uses the 'clf1__'
# step prefix because it is the only estimator searched inside a pipeline.

param_grid1 = [{'clf1__C': list(np.logspace(start=-5, stop=3, num=9))}]

param_grid3 = [{'learning_rate' : [0.1, 0.3],
                'max_depth': [3, 6],
                'min_child_weight': list(range(1, 3))}]

param_grid4 = [{'max_depth': list(range(3, 6)),
                'criterion': ['gini', 'entropy'],
                'min_samples_leaf': [20, 50]}]

param_grid5 = [{'max_depth': list(range(1, 5)),
                'criterion': ['gini', 'entropy'],
                'min_samples_split' : [2, 3]}]

# Building the pipelines (only pipe1 is passed to the grid search below)

pipe1 = Pipeline([('std', StandardScaler()),('clf1', clf1)])

pipe3 = Pipeline([('std', StandardScaler()),('clf3', clf3)])

pipe4 = Pipeline([('std', StandardScaler()),('clf4', clf4)])

pipe5 = Pipeline([('std', StandardScaler()),('clf5', clf5)])


# Setting up multiple GridSearchCV objects, 1 for each algorithm

gridcvs = {}

# random_state only has an effect when shuffle=True; passing it together with
# shuffle=False raises a ValueError in current scikit-learn, so it is omitted.
# The folds are the deterministic, unshuffled stratified splits.
inner_cv = StratifiedKFold(n_splits=10, shuffle=False)
outer_cv = StratifiedKFold(n_splits=10, shuffle=False)

for pgrid, est, name in zip((param_grid1, param_grid3, param_grid4, param_grid5),
                            (pipe1, clf3, clf4, clf5,),
                            ('Logistic', 'XGBoost', 'DTree', 'Random Forest')):

    # Grid search with cross validation to find the best parameters.  With
    # refit=True the best configuration is refitted on all of X_train.

    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='neg_log_loss',
                       cv=inner_cv,
                       verbose=0,
                       refit=True,
                       return_train_score=False)

    gcv.fit(X_train, y_train)

    gridcvs[name] = gcv

    print(name)
    print()
    print(gcv.best_estimator_)
    print()
    # The score is a negative log loss, not a percentage
    print('Best score on Grid Search Cross Validation is %.5f' % (gcv.best_score_))
    print()
    results = pd.DataFrame(gcv.cv_results_)

    # Nested cross validation: cross_val_score clones the grid search and
    # re-runs it on every outer training fold, scoring the tuning procedure.

    nested_score = cross_val_score(gcv,
                                   X=X_train,
                                   y=y_train,
                                   cv=outer_cv,
                                   scoring='neg_log_loss')

    print('Name, Log Loss, Std Dev, based on Best Parameter Model using Cross Validation Scoring')
    # Mean and standard deviation are reported on the same scale
    print('%s | %.4f %.4f' % (name,  nested_score.mean(),  nested_score.std(),))
    print()

    # Generate predictions and probabilities.  gcv is still fitted from the
    # search above (cross_val_score only fits clones), so it is used as is.

    best_algo = gcv

    predictions = best_algo.predict(X_test)
    probability = best_algo.predict_proba(X_test)

    train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
    test_acc = accuracy_score(y_true=y_test, y_pred=predictions)

    print('Training Accuracy: %.2f%%' % (100 * train_acc))
    print('Test Accuracy: %.2f%%' % (100 * test_acc))
    print()

    # prints classification report and confusion matrix

    print(classification_report(y_test,predictions))
    print()
    print(confusion_matrix(y_test,predictions))
    print()
        

In [None]:
#Model 3 Analysis

# Best score (negative log loss, closer to 0 is better) on Grid Search Cross Validation:
# Logistic - -0.58
# XGBoost  - -0.62
# DTree    - -0.64
# RForest  - -0.61

# Models do better than 50/50 and much better than Experience model but not quite as good as Ranks model

# Logistic is the winner here.  XGBoost has high accuracy in training but way lower in test due to overfitting.

# We will choose Logistic

In [None]:
# Training for Model #4 (Full)

# Hold out a stratified 20% test set; the remaining 80% is used for tuning
# and cross validation.

X_train, X_test, y_train, y_test = train_test_split(X_features, y, train_size=0.8, test_size=0.2, random_state=1, stratify=y)

# Initiating classifiers

clf1 = LogisticRegression()

clf3 = XGBClassifier()

clf4 = DecisionTreeClassifier()

clf5 = RandomForestClassifier()

# Setting up the parameter grids.  Only the logistic grid uses the 'clf1__'
# step prefix because it is the only estimator searched inside a pipeline.

param_grid1 = [{'clf1__C': list(np.logspace(start=-5, stop=3, num=9))}]

param_grid3 = [{'learning_rate' : [0.1, 0.3],
                'max_depth': [3, 6],
                'min_child_weight': list(range(1, 3))}]

param_grid4 = [{'max_depth': list(range(3, 6)),
                'criterion': ['gini', 'entropy'],
                'min_samples_leaf': [20, 50]}]

param_grid5 = [{'max_depth': list(range(1, 5)),
                'criterion': ['gini', 'entropy'],
                'min_samples_split' : [2, 3]}]

# Building the pipelines (only pipe1 is passed to the grid search below)

pipe1 = Pipeline([('std', StandardScaler()),('clf1', clf1)])

pipe3 = Pipeline([('std', StandardScaler()),('clf3', clf3)])

pipe4 = Pipeline([('std', StandardScaler()),('clf4', clf4)])

pipe5 = Pipeline([('std', StandardScaler()),('clf5', clf5)])


# Setting up multiple GridSearchCV objects, 1 for each algorithm

gridcvs = {}

# random_state only has an effect when shuffle=True; passing it together with
# shuffle=False raises a ValueError in current scikit-learn, so it is omitted.
# The folds are the deterministic, unshuffled stratified splits.
inner_cv = StratifiedKFold(n_splits=10, shuffle=False)
outer_cv = StratifiedKFold(n_splits=10, shuffle=False)

for pgrid, est, name in zip((param_grid1, param_grid3, param_grid4, param_grid5),
                            (pipe1, clf3, clf4, clf5,),
                            ('Logistic', 'XGBoost', 'DTree', 'Random Forest')):

    # Grid search with cross validation to find the best parameters.  With
    # refit=True the best configuration is refitted on all of X_train.

    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='neg_log_loss',
                       cv=inner_cv,
                       verbose=0,
                       refit=True,
                       return_train_score=False)

    gcv.fit(X_train, y_train)

    gridcvs[name] = gcv

    print(name)
    print()
    print(gcv.best_estimator_)
    print()
    # The score is a negative log loss, not a percentage
    print('Best score on Grid Search Cross Validation is %.5f' % (gcv.best_score_))
    print()
    results = pd.DataFrame(gcv.cv_results_)

    # Nested cross validation: cross_val_score clones the grid search and
    # re-runs it on every outer training fold, scoring the tuning procedure.

    nested_score = cross_val_score(gcv,
                                   X=X_train,
                                   y=y_train,
                                   cv=outer_cv,
                                   scoring='neg_log_loss')

    print('Name, Log Loss, Std Dev, based on Best Parameter Model using Cross Validation Scoring')
    # Mean and standard deviation are reported on the same scale
    print('%s | %.4f %.4f' % (name,  nested_score.mean(),  nested_score.std(),))
    print()

    # Generate predictions and probabilities.  gcv is still fitted from the
    # search above (cross_val_score only fits clones), so it is used as is.

    best_algo = gcv

    predictions = best_algo.predict(X_test)
    probability = best_algo.predict_proba(X_test)

    train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
    test_acc = accuracy_score(y_true=y_test, y_pred=predictions)

    print('Training Accuracy: %.2f%%' % (100 * train_acc))
    print('Test Accuracy: %.2f%%' % (100 * test_acc))
    print()

    # prints classification report and confusion matrix

    print(classification_report(y_test,predictions))
    print()
    print(confusion_matrix(y_test,predictions))
    print()
        

In [None]:
# Model 4 analysis

# Best score (negative log loss, closer to 0 is better) on Grid Search Cross Validation:
# Logistic - -0.53
# XGBoost  - -0.57
# DTree    - -0.57
# RForest  - -0.56

# Models do better than 50/50 and much better than Experience model

# Logistic is the winner here.  XGBoost has high accuracy in training but way lower in test due to overfitting.

# We will choose Logistic for this model

# This model has the best performance of the 4 models so we will submit this as one of our models


In [None]:
# Steps for grabbing each team's season info and creating the input for the Full model.
# Uses the sample submission file, which is expected to list every possible tournament
# game that needs to be predicted.
# The input for the model is the difference between the two teams' statistics, which is calculated here.


# Number of games (rows) in the sample submission file
n_test_games = len(df_sample_sub)

def get_year_t1_t2(ID):
    """Return a tuple with ints `year`, `team1` and `team2`.

    `ID` is an underscore separated string with exactly three integer fields,
    e.g. '2014_1107_1110'.

    Raises ValueError if `ID` does not have exactly three fields or if a field
    is not an integer.
    """
    # Unpack first so the function really returns a tuple (not a one-shot
    # generator) and malformed IDs fail here rather than at the call site.
    year, team1, team2 = (int(x) for x in ID.split('_'))
    return year, team1, team2

# Column labels of df_tourney_final, reused for the per-game difference frame
columns = df_tourney_final.columns.values
data = []

for ii, row in df_sample_sub.iterrows():
    year, t1, t2 = get_year_t1_t2(row.ID)

    team1 = df_tourney_final[(df_tourney_final.TeamID == t1) & (df_tourney_final.Season == year)].values
    team2 = df_tourney_final[(df_tourney_final.TeamID == t2) & (df_tourney_final.Season == year)].values

    # Each team must contribute exactly one season row; otherwise the
    # subtraction would broadcast to the wrong number of rows.
    if len(team1) != 1 or len(team2) != 1:
        raise ValueError('Expected exactly one season row per team for game %s' % row.ID)

    # Model input is team1's row minus team2's row
    model = team1 - team2

    data.append(model)

# One row per game, one column per df_tourney_final column (sizes are derived
# from the data instead of being hard coded).
Predictions = pd.DataFrame(np.array(data).reshape(len(data), len(columns)), columns=columns)

# Season and TeamID are identifiers, not model features
Predictions.drop(labels=['Season', 'TeamID'], inplace=True, axis=1)

Predictions.head()

In [None]:
# Refit the Full model on the whole training set: standardize every feature,
# then train a logistic regression (C=1) on the standardized values.

scaler = StandardScaler()
X_scaled = scaler.fit(X_features).transform(X_features)

clf = LogisticRegression(C=1)
clf.fit(X_scaled, y)

In [None]:
# Standardize the submission features with the scaler fitted on X_features.
# NOTE(review): assumes Predictions has the same columns, in the same order, as X_features - confirm.
Predictions_scaled = scaler.transform(Predictions)

In [None]:
# Display the standardized submission features for a visual check
Predictions_scaled

In [None]:
# Predict with the Full model: keep the probability of the second class
# (column index 1 of predict_proba) and store it in the submission frame.
# NOTE(review): presumably class 1 means the first team in the ID wins - confirm.

class_probabilities = clf.predict_proba(Predictions_scaled)
preds = class_probabilities[:, 1]

df_sample_sub['Pred'] = preds
df_sample_sub.head()

In [None]:
# Write the Full model submission file (ID and Pred columns, no index column)

df_sample_sub.to_csv('FullModel_2014_2017_predictions.csv', index=False)

In [None]:
# Ensemble modeling

# We will try some ensemble modeling by combining the 3 lesser models. They model different
# features that shouldn't be too correlated with each other, so we combine them to see if we
# can beat the Full model's performance.

# From our results we will go with these 3 classifiers, each with its best tuning parameters from training.
# Only the tuned (non-default) arguments are spelled out; arguments such as max_features='auto'
# and min_impurity_split are no longer accepted by current scikit-learn. n_estimators=10 is kept
# explicit because the library default changed after these models were tuned.

#1 -  Ranks  -   Random Forest

Clf_ranks = RandomForestClassifier(criterion='gini',
            max_depth=2,
            min_samples_split=3,
            n_estimators=10)

#2 -  Experience  -   Random Forest

Clf_experience = RandomForestClassifier(criterion='entropy',
            max_depth=3,
            min_samples_split=3,
            n_estimators=10)

#3 -  Stats -  Logistic
# Bound to its own name only, so the fitted Full model held in `clf` is not overwritten.

Clf_stats = LogisticRegression(C=1)

In [None]:
# Feature subsets for the three individual models, taken from the first
# 1426 rows of df_predictions_tourney and selected by column position.
# NOTE(review): presumably 1426 is the number of training games - confirm.

training_rows = df_predictions_tourney.iloc[:1426]

X_ranks = training_rows.iloc[:, 0:2]
X_stats = training_rows.iloc[:, 2:11]
X_experience = training_rows.iloc[:, 11:12]


In [None]:
# One scaler per sub-model. Each scaler is fitted on the entire training set,
# which is known not to be optimal.

scaler_ranks = StandardScaler()
X_scaled_ranks = scaler_ranks.fit_transform(X_ranks)

scaler_experience = StandardScaler()
X_scaled_experience = scaler_experience.fit_transform(X_experience)

scaler_stats = StandardScaler()
X_scaled_stats = scaler_stats.fit_transform(X_stats)

# Fit each classifier on its own standardized feature subset
Clf_ranks.fit(X_scaled_ranks, y)
Clf_experience.fit(X_scaled_experience, y)
Clf_stats.fit(X_scaled_stats, y)

In [None]:
# Class predictions of the three fitted models on the same standardized
# rows they were trained on.

pred_ranks, pred_experience, pred_stats = [
    fitted_model.predict(scaled_features)
    for fitted_model, scaled_features in (
        (Clf_ranks, X_scaled_ranks),
        (Clf_experience, X_scaled_experience),
        (Clf_stats, X_scaled_stats),
    )
]

In [None]:
# Combine the 3 models' predictions into the dataset of features used for model 5.
# Each column holds the class predictions of one sub-model; np.ravel guarantees
# one flat value per row.

model_predictions = pd.DataFrame(
    {
        'pred_ranks': np.ravel(pred_ranks),
        'pred_experience': np.ravel(pred_experience),
        # the stats column comes from the stats model's own predictions
        'pred_stats': np.ravel(pred_stats),
    },
    columns=['pred_ranks', 'pred_experience', 'pred_stats'],
)

model_predictions.head(10)



In [None]:
# Training for Model #5 (Ensemble model of weaker 3 models predictions only)

# Hold out a stratified 20% test set; the remaining 80% is used for tuning
# and cross validation.

X_train, X_test, y_train, y_test = train_test_split(model_predictions, y, train_size=0.8, test_size=0.2, random_state=1, stratify=y)

# Initiating classifiers

clf1 = LogisticRegression()

clf3 = XGBClassifier()

clf4 = DecisionTreeClassifier()

clf5 = RandomForestClassifier()

# Setting up the parameter grids.  Only the logistic grid uses the 'clf1__'
# step prefix because it is the only estimator searched inside a pipeline.

param_grid1 = [{'clf1__C': list(np.logspace(start=-5, stop=3, num=9))}]

param_grid3 = [{'learning_rate' : [0.1, 0.3],
                'max_depth': [3, 6],
                'min_child_weight': list(range(1, 3))}]

param_grid4 = [{'max_depth': list(range(3, 6)),
                'criterion': ['gini', 'entropy'],
                'min_samples_leaf': [20, 50]}]

param_grid5 = [{'max_depth': list(range(1, 5)),
                'criterion': ['gini', 'entropy'],
                'min_samples_split' : [2, 3]}]

# Building the pipelines (only pipe1 is passed to the grid search below)

pipe1 = Pipeline([('std', StandardScaler()),('clf1', clf1)])

pipe3 = Pipeline([('std', StandardScaler()),('clf3', clf3)])

pipe4 = Pipeline([('std', StandardScaler()),('clf4', clf4)])

pipe5 = Pipeline([('std', StandardScaler()),('clf5', clf5)])


# Setting up multiple GridSearchCV objects, 1 for each algorithm

gridcvs = {}

# random_state only has an effect when shuffle=True; passing it together with
# shuffle=False raises a ValueError in current scikit-learn, so it is omitted.
# The folds are the deterministic, unshuffled stratified splits.
inner_cv = StratifiedKFold(n_splits=10, shuffle=False)
outer_cv = StratifiedKFold(n_splits=10, shuffle=False)

for pgrid, est, name in zip((param_grid1, param_grid3, param_grid4, param_grid5),
                            (pipe1, clf3, clf4, clf5,),
                            ('Logistic', 'XGBoost', 'DTree', 'Random Forest')):

    # Grid search with cross validation to find the best parameters.  With
    # refit=True the best configuration is refitted on all of X_train.

    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='neg_log_loss',
                       cv=inner_cv,
                       verbose=0,
                       refit=True,
                       return_train_score=False)

    gcv.fit(X_train, y_train)

    gridcvs[name] = gcv

    print(name)
    print()
    print(gcv.best_estimator_)
    print()
    # The score is a negative log loss, not a percentage
    print('Best score on Grid Search Cross Validation is %.5f' % (gcv.best_score_))
    print()
    results = pd.DataFrame(gcv.cv_results_)

    # Nested cross validation: cross_val_score clones the grid search and
    # re-runs it on every outer training fold, scoring the tuning procedure.

    nested_score = cross_val_score(gcv,
                                   X=X_train,
                                   y=y_train,
                                   cv=outer_cv,
                                   scoring='neg_log_loss')

    print('Name, Log Loss, Std Dev, based on Best Parameter Model using Cross Validation Scoring')
    # Mean and standard deviation are reported on the same scale
    print('%s | %.4f %.4f' % (name,  nested_score.mean(),  nested_score.std(),))
    print()

    # Generate predictions and probabilities.  gcv is still fitted from the
    # search above (cross_val_score only fits clones), so it is used as is.

    best_algo = gcv

    predictions = best_algo.predict(X_test)
    probability = best_algo.predict_proba(X_test)

    train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
    test_acc = accuracy_score(y_true=y_test, y_pred=predictions)

    print('Training Accuracy: %.2f%%' % (100 * train_acc))
    print('Test Accuracy: %.2f%%' % (100 * test_acc))
    print()

    # prints classification report and confusion matrix

    print(classification_report(y_test,predictions))
    print()
    print(confusion_matrix(y_test,predictions))
    print()
        

In [None]:
# Model #5 Analysis

# Best score (negative log loss, closer to 0 is better) on Grid Search Cross Validation:
# Logistic - -0.59
# XGBoost  - -0.59
# DTree    - -0.59
# RForest  - -0.59

# Models do better than 50/50 but not as good as Full Model

# They all had the same log loss and accuracy as well. Very simple model?

# We will not use this model



In [None]:
# Let's add the sub-model predictions to the Full model's features to create the dataset for Model #6.
# The two frames are joined side by side (axis=1), so rows are matched on their index labels.
# NOTE(review): assumes X_features and model_predictions share the same index - confirm.

Full_Ensemble = pd.concat((X_features, model_predictions), axis=1)


In [None]:
# Display the first rows of the combined feature set for a visual check
Full_Ensemble.head()

In [None]:
# Training for Model #6 (Ensemble model of model #5 predictions plus all known features)

# Hold out a stratified 20% test set; the remaining 80% is used for tuning
# and cross validation.

X_train, X_test, y_train, y_test = train_test_split(Full_Ensemble, y, train_size=0.8, test_size=0.2, random_state=1, stratify=y)

# Initiating classifiers

clf1 = LogisticRegression()

clf3 = XGBClassifier()

clf4 = DecisionTreeClassifier()

clf5 = RandomForestClassifier()

# Setting up the parameter grids.  Only the logistic grid uses the 'clf1__'
# step prefix because it is the only estimator searched inside a pipeline.

param_grid1 = [{'clf1__C': list(np.logspace(start=-5, stop=3, num=9))}]

param_grid3 = [{'learning_rate' : [0.1, 0.3],
                'max_depth': [3, 6],
                'min_child_weight': list(range(1, 3))}]

param_grid4 = [{'max_depth': list(range(3, 6)),
                'criterion': ['gini', 'entropy'],
                'min_samples_leaf': [20, 50]}]

param_grid5 = [{'max_depth': list(range(1, 5)),
                'criterion': ['gini', 'entropy'],
                'min_samples_split' : [2, 3]}]

# Building the pipelines (only pipe1 is passed to the grid search below)

pipe1 = Pipeline([('std', StandardScaler()),('clf1', clf1)])

pipe3 = Pipeline([('std', StandardScaler()),('clf3', clf3)])

pipe4 = Pipeline([('std', StandardScaler()),('clf4', clf4)])

pipe5 = Pipeline([('std', StandardScaler()),('clf5', clf5)])


# Setting up multiple GridSearchCV objects, 1 for each algorithm

gridcvs = {}

# random_state only has an effect when shuffle=True; passing it together with
# shuffle=False raises a ValueError in current scikit-learn, so it is omitted.
# The folds are the deterministic, unshuffled stratified splits.
inner_cv = StratifiedKFold(n_splits=10, shuffle=False)
outer_cv = StratifiedKFold(n_splits=10, shuffle=False)

for pgrid, est, name in zip((param_grid1, param_grid3, param_grid4, param_grid5),
                            (pipe1, clf3, clf4, clf5,),
                            ('Logistic', 'XGBoost', 'DTree', 'Random Forest')):

    # Grid search with cross validation to find the best parameters.  With
    # refit=True the best configuration is refitted on all of X_train.

    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='neg_log_loss',
                       cv=inner_cv,
                       verbose=0,
                       refit=True,
                       return_train_score=False)

    gcv.fit(X_train, y_train)

    gridcvs[name] = gcv

    print(name)
    print()
    print(gcv.best_estimator_)
    print()
    # The score is a negative log loss, not a percentage
    print('Best score on Grid Search Cross Validation is %.5f' % (gcv.best_score_))
    print()
    results = pd.DataFrame(gcv.cv_results_)

    # Nested cross validation: cross_val_score clones the grid search and
    # re-runs it on every outer training fold, scoring the tuning procedure.

    nested_score = cross_val_score(gcv,
                                   X=X_train,
                                   y=y_train,
                                   cv=outer_cv,
                                   scoring='neg_log_loss')

    print('Name, Log Loss, Std Dev, based on Best Parameter Model using Cross Validation Scoring')
    # Mean and standard deviation are reported on the same scale
    print('%s | %.4f %.4f' % (name,  nested_score.mean(),  nested_score.std(),))
    print()

    # Generate predictions and probabilities.  gcv is still fitted from the
    # search above (cross_val_score only fits clones), so it is used as is.

    best_algo = gcv

    predictions = best_algo.predict(X_test)
    probability = best_algo.predict_proba(X_test)

    train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
    test_acc = accuracy_score(y_true=y_test, y_pred=predictions)

    print('Training Accuracy: %.2f%%' % (100 * train_acc))
    print('Test Accuracy: %.2f%%' % (100 * test_acc))
    print()

    # prints classification report and confusion matrix

    print(classification_report(y_test,predictions))
    print()
    print(confusion_matrix(y_test,predictions))
    print()
        

In [None]:
# Model 6 Analysis

# Best score (negative log loss, closer to 0 is better) on Grid Search Cross Validation:
# Logistic - -0.53
# XGBoost  - -0.57
# DTree    - -0.56
# RForest  - -0.55

# The model is on par with the Full Model on log loss and actually a little better on accuracy

# Full Model Test Accuracy =    69.93%
# Full Ensemble Test Accuracy =  70.28%

# So lets build another submission file using this Ensemble model with Logistic Classifier


In [None]:
# The new Logistic model DID improve slightly vs. the Logistic model without the 3 models' predictions. Let's go with this.

# Set up the submission feature sets for the 3 models, selected by column position.
# NOTE(review): assumes Predictions has the same column order as the frame the training
# slices (0:2, 11:12, 2:11) were taken from - confirm.

features_rank_submission = Predictions.iloc[:, 0:2]
features_experience_submission = Predictions.iloc[:, 11:12]
features_stats_submission = Predictions.iloc[:,  2:11]

In [None]:
# Make predictions with models 1, 2 and 3 for the submission games.
# The three classifiers were fitted on standardized features, so the submission
# features go through the same fitted scalers before predicting.

pred_ranks_submission = Clf_ranks.predict(scaler_ranks.transform(features_rank_submission))
pred_experience_submission = Clf_experience.predict(scaler_experience.transform(features_experience_submission))
pred_stats_submission = Clf_stats.predict(scaler_stats.transform(features_stats_submission))


In [None]:
# Combine the 3 models' submission predictions into a dataframe.
# Column names match the training-time prediction frame (pred_ranks,
# pred_experience, pred_stats) so both ensemble frames have identical columns.

model_predictions_submission = pd.DataFrame(
    {
        'pred_ranks': np.ravel(pred_ranks_submission),
        'pred_experience': np.ravel(pred_experience_submission),
        # the stats column comes from the stats model's own predictions
        'pred_stats': np.ravel(pred_stats_submission),
    },
    columns=['pred_ranks', 'pred_experience', 'pred_stats'],
)

model_predictions_submission.head(5)

In [None]:
# Merge the sub-model predictions with the submission features, side by side (axis=1).
# NOTE(review): rows are matched on index labels; assumes both frames share the same index - confirm.

Full_Ensemble_submission = pd.concat((Predictions, model_predictions_submission), axis=1)

In [None]:
# Display the first rows of the combined submission features for a visual check
Full_Ensemble_submission.head()

In [None]:
# Fit the final model to the full training set and make predictions on the scaled submission set

clf_Full_Ensemble = LogisticRegression(C=1)

scaler = StandardScaler()

X_scaled = scaler.fit_transform(Full_Ensemble)

# Train on the standardized features so the model sees the same scale at
# training time as it does for the standardized submission features below.
clf_Full_Ensemble.fit(X_scaled, y)

Full_Ensemble_submission_scaled = scaler.transform(Full_Ensemble_submission)

# Keep the probability of the second class (column index 1 of predict_proba)
preds_submission = clf_Full_Ensemble.predict_proba(Full_Ensemble_submission_scaled)[:,1]

df_sample_sub['Pred'] = preds_submission

df_sample_sub.head()


In [None]:
# Write the Full Ensemble submission file (ID and Pred columns, no index column)

df_sample_sub.to_csv('Full_Ensemble_Model_2014_2017_predictions.csv', index=False)

In [None]:

# Timestamp taken after the last modelling cell has run
end_time = time.time()

In [None]:
# Elapsed time in minutes between start_time and end_time
(end_time - start_time) / 60