In [4]:
import pandas as pd
import numpy as np

import os

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Limitations
- In the interest of time and simplicity we will ignore the players and events as part of this year's model. It could be expanded on at a later time to enchance the model with this additional data.
- Seeds will not be used as they are a biased ranking

In [5]:
data_dir = 'data_2018'

In [8]:
df_teams = pd.read_csv(os.path.join(data_dir, 'Teams.csv'))
df_teams.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2018
1,1102,Air Force,1985,2018
2,1103,Akron,1985,2018
3,1104,Alabama,1985,2018
4,1105,Alabama A&M,2000,2018


In [40]:
df_teams['D1Seasons'] = df_teams['LastD1Season'] - df_teams['FirstD1Season']
df_teams.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season,D1Seasons
0,1101,Abilene Chr,2014,2018,4
1,1102,Air Force,1985,2018,33
2,1103,Akron,1985,2018,33
3,1104,Alabama,1985,2018,33
4,1105,Alabama A&M,2000,2018,18


In [16]:
df_detailed = pd.read_csv(os.path.join(data_dir, 'NCAATourneyDetailedResults.csv'))
df_detailed.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19


In [18]:
df_coaches = pd.read_csv(os.path.join(data_dir, 'TeamCoaches.csv'))
df_coaches.head()

Unnamed: 0,Season,TeamID,FirstDayNum,LastDayNum,CoachName
0,1985,1102,0,154,reggie_minton
1,1985,1103,0,154,bob_huggins
2,1985,1104,0,154,wimp_sanderson
3,1985,1106,0,154,james_oliver
4,1985,1108,0,154,davey_whitney


In [38]:
df_coaches = df_coaches.sort_values(by=['CoachName', 'Season'])
df_coaches['NumDaysCoaching'] = df_coaches['LastDayNum'] - df_coaches['FirstDayNum']
df_coaches['TotalDaysCoaching'] = df_coaches.groupby('CoachName')['NumDaysCoaching'].cumsum()

# Which iteration the coach as been with the team
# Team A -> Team B -> Team A = 1 -> 2 -> 3
df_coaches['TeamNumber'] = df_coaches['TeamID'].ne(df_coaches['TeamID'].shift().bfill()).astype(int)
df_coaches['TeamNumber'] = df_coaches.groupby(['CoachName'])['TeamNumber'].cumsum()

df_coaches['TeamTenure'] = df_coaches.groupby(['CoachName', 'TeamNumber'])['NumDaysCoaching'].cumsum()
df_coaches.sample(20)

Unnamed: 0,Season,TeamID,FirstDayNum,LastDayNum,CoachName,NumDaysCoaching,TotalDaysCoaching,TeamNumber,TeamTenure,Tenure
2943,1994,1461,0,154,joby_wright,154,1078,2,616,154
3974,1998,1226,0,154,herb_williams,154,1232,1,1232,1232
5175,2002,1145,0,154,mike_macdonald,154,1386,1,1386,770
5463,2003,1105,0,154,vann_pettaway,154,1848,1,1848,616
6947,2007,1277,0,154,tom_izzo,154,3542,1,3542,1848
1239,1989,1202,0,154,butch_estes,154,1386,1,1386,616
9453,2014,1339,0,154,eric_reveno,154,1540,1,1540,1232
9911,2015,1449,0,154,lorenzo_romar,154,3234,3,2310,2002
5332,2002,1324,0,154,greg_kampe,154,2926,1,2926,462
4542,2000,1160,0,154,ricardo_patton,154,2386,1,1770,692


In [39]:
df_coaches[df_coaches['CoachName'] == 'bob_knight']

Unnamed: 0,Season,TeamID,FirstDayNum,LastDayNum,CoachName,NumDaysCoaching,TotalDaysCoaching,TeamNumber,TeamTenure,Tenure
101,1985,1231,0,154,bob_knight,154,3478,1,2464,154
387,1986,1231,0,154,bob_knight,154,3478,1,2464,308
675,1987,1231,0,154,bob_knight,154,3478,1,2464,462
969,1988,1231,0,154,bob_knight,154,3478,1,2464,616
1263,1989,1231,0,154,bob_knight,154,3478,1,2464,770
1559,1990,1231,0,154,bob_knight,154,3478,1,2464,924
1852,1991,1231,0,154,bob_knight,154,3478,1,2464,1078
2150,1992,1231,0,154,bob_knight,154,3478,1,2464,1232
2451,1993,1231,0,154,bob_knight,154,3478,1,2464,1386
2750,1994,1231,0,154,bob_knight,154,3478,1,2464,1540


In [46]:
df_master = pd.merge(
    left = df_coaches,
    right = df_teams,
    on = 'TeamID'
)

df_master =  pd.merge(
    left = df_master,
    right = df_detailed,
    left_on = 'TeamID',
    right_on = 'WTeamID'
)
df_master =  pd.merge(
    left = df_master,
    right = df_detailed,
    left_on = 'TeamID',
    right_on = 'LTeamID',
    suffixes = ('W','L')
)
df_master.head()

Unnamed: 0,Season_x,TeamID,FirstDayNum,LastDayNum,CoachName,NumDaysCoaching,TotalDaysCoaching,TeamNumber,TeamTenure,Tenure,...,LFGA3L,LFTML,LFTAL,LORL,LDRL,LAstL,LTOL,LStlL,LBlkL,LPFL
0,1998,1443,110,154,al_seibert,44,44,1,44,44,...,24,8,13,17,18,10,14,6,5,16
1,1998,1443,110,154,al_seibert,44,44,1,44,44,...,24,19,26,18,18,12,11,11,4,24
2,1998,1443,110,154,al_seibert,44,44,1,44,44,...,25,5,14,10,21,14,11,4,5,20
3,1998,1443,110,154,al_seibert,44,44,1,44,44,...,15,10,16,10,20,10,10,6,2,19
4,1998,1443,110,154,al_seibert,44,44,1,44,44,...,20,12,18,18,23,8,10,9,0,24


# TODO:
- Tie in detailed for winning and losing
- pull in df_compact