In [22]:
import pandas as pd
import numpy as np

import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import log_loss

from IPython.display import display

%matplotlib inline

# Limitations
- In the interest of time and simplicity we will ignore the players and events as part of this year's model. It could be expanded on at a later time to enchance the model with this additional data.
- Seeds will not be used as they are a biased ranking

In [2]:
data_dir = 'data_2018'

In [3]:
df_teams = pd.read_csv(os.path.join(data_dir, 'Teams.csv'))
df_teams.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2018
1,1102,Air Force,1985,2018
2,1103,Akron,1985,2018
3,1104,Alabama,1985,2018
4,1105,Alabama A&M,2000,2018


In [4]:
df_teams['D1Seasons'] = df_teams['LastD1Season'] - df_teams['FirstD1Season']
df_teams.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season,D1Seasons
0,1101,Abilene Chr,2014,2018,4
1,1102,Air Force,1985,2018,33
2,1103,Akron,1985,2018,33
3,1104,Alabama,1985,2018,33
4,1105,Alabama A&M,2000,2018,18


In [6]:
df_detailed = pd.read_csv(os.path.join(data_dir, 'RegularSeasonDetailedResults.csv'))
df_detailed.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [7]:
df_coaches = pd.read_csv(os.path.join(data_dir, 'TeamCoaches.csv'))
df_coaches.head()

Unnamed: 0,Season,TeamID,FirstDayNum,LastDayNum,CoachName
0,1985,1102,0,154,reggie_minton
1,1985,1103,0,154,bob_huggins
2,1985,1104,0,154,wimp_sanderson
3,1985,1106,0,154,james_oliver
4,1985,1108,0,154,davey_whitney


In [8]:
df_coaches = df_coaches.sort_values(by=['CoachName', 'Season'])
df_coaches['NumDaysCoaching'] = df_coaches['LastDayNum'] - df_coaches['FirstDayNum']
df_coaches['TotalDaysCoaching'] = df_coaches.groupby('CoachName')['NumDaysCoaching'].cumsum()

# Which iteration the coach as been with the team
# Team A -> Team B -> Team A = 1 -> 2 -> 3
df_coaches['TeamNumber'] = df_coaches['TeamID'].ne(df_coaches['TeamID'].shift().bfill()).astype(int)
df_coaches['TeamNumber'] = df_coaches.groupby(['CoachName'])['TeamNumber'].cumsum()

df_coaches['TeamTenure'] = df_coaches.groupby(['CoachName', 'TeamNumber'])['NumDaysCoaching'].cumsum()
df_coaches.sample(20)

Unnamed: 0,Season,TeamID,FirstDayNum,LastDayNum,CoachName,NumDaysCoaching,TotalDaysCoaching,TeamNumber,TeamTenure
8934,2013,1170,0,154,brian_katz,154,770,1,770
1745,1990,1461,0,154,benny_dees,154,770,2,462
2438,1993,1212,0,154,aaron_james,154,616,1,616
1725,1990,1437,0,154,rollie_massimino,154,924,1,924
9182,2013,1422,0,154,wes_miller,154,265,1,265
6736,2006,1412,0,154,mike_anderson,154,616,1,616
4104,1998,1382,0,154,jim_baron,154,1694,2,924
10365,2017,1189,0,154,matt_matheny,154,1232,1,1232
7396,2008,1387,0,154,rick_majerus,154,2764,4,154
6366,2005,1378,0,154,robert_mccullum,154,770,2,308


In [9]:
def calc_gameid(row):
    min_id = min(row['WTeamID'], row['LTeamID'])
    max_id = max(row['WTeamID'], row['LTeamID'])
    season = row['Season']
    return f'{season}_{min_id}_{max_id}'

In [10]:
df_detailed['GameID'] = df_detailed.apply(calc_gameid, axis='columns')

In [30]:
df_detailed_team_W = pd.merge(left = df_detailed,
        right = df_teams,
        left_on = ['WTeamID'],
        right_on = ['TeamID'])

df_detailed_team_L = pd.merge(left = df_detailed_team_W,
        right = df_teams,
        left_on = ['LTeamID'],
        right_on = ['TeamID'],
        suffixes = ('_W', '_L'))

df_master = pd.merge(left = df_detailed_team_W,
         right = df_detailed_team_L,
         on = list(df_detailed.columns),
         suffixes = ('_W', '_L')
        )

df_master = pd.merge(
    left = df_master,
    right = df_coaches,
    left_on = ['Season', 'WTeamID'],
    right_on = ['Season', 'TeamID']#,
    #suffixes = ('_W', '_L')
)

df_master = pd.merge(
    left = df_master,
    right = df_coaches,
    left_on = ['Season', 'LTeamID'],
    right_on = ['Season', 'TeamID'],
     suffixes = ('_W', '_L')
)
df_master

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,TeamNumber_W,TeamTenure_W,TeamID,FirstDayNum_L,LastDayNum_L,CoachName_L,NumDaysCoaching_L,TotalDaysCoaching_L,TeamNumber_L,TeamTenure_L
0,2003,10,1104,68,1328,62,N,0,27,58,...,2,770,1328,0,154,kelvin_sampson,154,2464,2,1386
1,2003,98,1400,67,1328,61,H,0,25,64,...,4,770,1328,0,154,kelvin_sampson,154,2464,2,1386
2,2003,124,1400,76,1328,71,A,0,27,50,...,4,770,1328,0,154,kelvin_sampson,154,2464,2,1386
3,2003,114,1281,67,1328,52,H,0,24,55,...,1,616,1328,0,154,kelvin_sampson,154,2464,2,1386
4,2003,70,1329,48,1328,46,H,0,19,49,...,3,2002,1328,0,154,kelvin_sampson,154,2464,2,1386
5,2003,54,1280,54,1328,45,N,0,18,50,...,1,770,1328,0,154,kelvin_sampson,154,2464,2,1386
6,2003,18,1104,82,1106,56,H,0,24,49,...,2,770,1106,0,154,rob_spivery,154,1078,1,1078
7,2003,13,1202,74,1106,73,N,0,29,51,...,1,924,1106,0,154,rob_spivery,154,1078,1,1078
8,2003,14,1426,59,1106,47,N,0,25,53,...,1,1694,1106,0,154,rob_spivery,154,1078,1,1078
9,2003,89,1341,63,1106,57,H,0,20,41,...,1,154,1106,0,154,rob_spivery,154,1078,1,1078


In [31]:
df_master['GameID'].value_counts()

2004_1424_1428    12
2010_1171_1335     8
2012_1226_1319     8
2008_1337_1362     8
2011_1395_1461     6
2017_1228_1276     6
2012_1422_1441     6
2017_1163_1378     6
2007_1276_1278     6
2004_1161_1424     6
2012_1102_1307     6
2008_1337_1360     6
2018_1175_1299     6
2009_1293_1398     6
2017_1133_1179     6
2008_1178_1460     6
2016_1102_1424     6
2010_1406_1423     6
2012_1198_1443     6
2018_1413_1415     6
2008_1258_1362     6
2008_1112_1333     6
2013_1375_1443     6
2004_1140_1428     6
2016_1201_1424     6
2003_1125_1209     6
2008_1362_1365     6
2012_1111_1422     6
2003_1152_1434     6
2009_1258_1360     6
                  ..
2016_1230_1322     1
2018_1204_1418     1
2016_1205_1397     1
2015_1326_1460     1
2018_1186_1449     1
2012_1210_1321     1
2004_1307_1336     1
2010_1103_1115     1
2006_1108_1378     1
2017_1299_1351     1
2016_1339_1356     1
2014_1154_1397     1
2008_1357_1384     1
2016_1354_1382     1
2015_1104_1406     1
2017_1398_1413     1
2012_1341_137

In [34]:
with pd.option_context('display.max_columns', None):
    display(df_master[df_master['GameID'] == '2004_1424_1428'])

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,GameID,TeamID_x,TeamName,FirstD1Season,LastD1Season,D1Seasons,TeamID_W,TeamName_W,FirstD1Season_W,LastD1Season_W,D1Seasons_W,TeamID_L,TeamName_L,FirstD1Season_L,LastD1Season_L,D1Seasons_L,TeamID_y,FirstDayNum_W,LastDayNum_W,CoachName_W,NumDaysCoaching_W,TotalDaysCoaching_W,TeamNumber_W,TeamTenure_W,TeamID,FirstDayNum_L,LastDayNum_L,CoachName_L,NumDaysCoaching_L,TotalDaysCoaching_L,TeamNumber_L,TeamTenure_L
21996,2004,70,1428,72,1424,67,A,0,25,46,7,13,15,17,10,22,14,20,4,0,29,18,45,4,16,27,37,11,11,7,12,7,6,22,2004_1424_1428,1428,Utah,1985,2018,33,1428,Utah,1985,2018,33,1424,UNLV,1985,2018,33,1428,86,154,kerry_rupp,68,68,1,68,1424,0,105,charles_spoonhour,105,2723,3,413
21997,2004,70,1428,72,1424,67,A,0,25,46,7,13,15,17,10,22,14,20,4,0,29,18,45,4,16,27,37,11,11,7,12,7,6,22,2004_1424_1428,1428,Utah,1985,2018,33,1428,Utah,1985,2018,33,1424,UNLV,1985,2018,33,1428,86,154,kerry_rupp,68,68,1,68,1424,106,154,jay_spoonhour,48,48,1,48
21998,2004,70,1428,72,1424,67,A,0,25,46,7,13,15,17,10,22,14,20,4,0,29,18,45,4,16,27,37,11,11,7,12,7,6,22,2004_1424_1428,1428,Utah,1985,2018,33,1428,Utah,1985,2018,33,1424,UNLV,1985,2018,33,1428,0,85,rick_majerus,85,2610,3,1994,1424,0,105,charles_spoonhour,105,2723,3,413
21999,2004,70,1428,72,1424,67,A,0,25,46,7,13,15,17,10,22,14,20,4,0,29,18,45,4,16,27,37,11,11,7,12,7,6,22,2004_1424_1428,1428,Utah,1985,2018,33,1428,Utah,1985,2018,33,1424,UNLV,1985,2018,33,1428,0,85,rick_majerus,85,2610,3,1994,1424,106,154,jay_spoonhour,48,48,1,48
22000,2004,98,1428,70,1424,56,H,0,23,50,5,13,19,21,11,35,12,16,5,7,15,19,54,5,17,13,18,4,17,9,7,10,3,18,2004_1424_1428,1428,Utah,1985,2018,33,1428,Utah,1985,2018,33,1424,UNLV,1985,2018,33,1428,86,154,kerry_rupp,68,68,1,68,1424,0,105,charles_spoonhour,105,2723,3,413
22001,2004,98,1428,70,1424,56,H,0,23,50,5,13,19,21,11,35,12,16,5,7,15,19,54,5,17,13,18,4,17,9,7,10,3,18,2004_1424_1428,1428,Utah,1985,2018,33,1428,Utah,1985,2018,33,1424,UNLV,1985,2018,33,1428,86,154,kerry_rupp,68,68,1,68,1424,106,154,jay_spoonhour,48,48,1,48
22002,2004,98,1428,70,1424,56,H,0,23,50,5,13,19,21,11,35,12,16,5,7,15,19,54,5,17,13,18,4,17,9,7,10,3,18,2004_1424_1428,1428,Utah,1985,2018,33,1428,Utah,1985,2018,33,1424,UNLV,1985,2018,33,1428,0,85,rick_majerus,85,2610,3,1994,1424,0,105,charles_spoonhour,105,2723,3,413
22003,2004,98,1428,70,1424,56,H,0,23,50,5,13,19,21,11,35,12,16,5,7,15,19,54,5,17,13,18,4,17,9,7,10,3,18,2004_1424_1428,1428,Utah,1985,2018,33,1428,Utah,1985,2018,33,1424,UNLV,1985,2018,33,1428,0,85,rick_majerus,85,2610,3,1994,1424,106,154,jay_spoonhour,48,48,1,48
22004,2004,131,1428,73,1424,70,N,0,23,62,10,21,17,21,17,20,14,11,5,2,15,26,52,7,16,11,14,8,25,10,13,7,4,18,2004_1424_1428,1428,Utah,1985,2018,33,1428,Utah,1985,2018,33,1424,UNLV,1985,2018,33,1428,86,154,kerry_rupp,68,68,1,68,1424,0,105,charles_spoonhour,105,2723,3,413
22005,2004,131,1428,73,1424,70,N,0,23,62,10,21,17,21,17,20,14,11,5,2,15,26,52,7,16,11,14,8,25,10,13,7,4,18,2004_1424_1428,1428,Utah,1985,2018,33,1428,Utah,1985,2018,33,1424,UNLV,1985,2018,33,1428,86,154,kerry_rupp,68,68,1,68,1424,106,154,jay_spoonhour,48,48,1,48


# TODO:
- Resolve merge issues with coaches who change midseason