In [1]:
import pandas as pd
import numpy as np

import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

from IPython.display import display

%matplotlib inline

# Limitations
- In the interest of time and simplicity we will ignore the players and events as part of this year's model. It could be expanded on at a later time to enchance the model with this additional data.
- Seeds will not be used as they are a biased ranking

In [2]:
data_dir = 'data_2019'

In [3]:
df_teams = pd.read_csv(os.path.join(data_dir, 'Teams.csv'))
df_teams.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2019
1,1102,Air Force,1985,2019
2,1103,Akron,1985,2019
3,1104,Alabama,1985,2019
4,1105,Alabama A&M,2000,2019


In [4]:
df_teams['D1Seasons'] = df_teams['LastD1Season'] - df_teams['FirstD1Season']
teams_dict = df_teams[['TeamID', 'TeamName']].to_dict()
del df_teams['TeamName']
df_teams.head()

Unnamed: 0,TeamID,FirstD1Season,LastD1Season,D1Seasons
0,1101,2014,2019,5
1,1102,1985,2019,34
2,1103,1985,2019,34
3,1104,1985,2019,34
4,1105,2000,2019,19


In [5]:
df_detailed = pd.read_csv(os.path.join(data_dir, 'RegularSeasonDetailedResults.csv'))
df_detailed.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [6]:
df_detailed['ScoreDiff'] = df_detailed['WScore'] - df_detailed['LScore']

In [7]:
df_coaches = pd.read_csv(os.path.join(data_dir, 'TeamCoaches.csv'))
df_coaches.head()

Unnamed: 0,Season,TeamID,FirstDayNum,LastDayNum,CoachName
0,1985,1102,0,154,reggie_minton
1,1985,1103,0,154,bob_huggins
2,1985,1104,0,154,wimp_sanderson
3,1985,1106,0,154,james_oliver
4,1985,1108,0,154,davey_whitney


In [8]:
df_coaches = df_coaches.sort_values(by=['CoachName', 'Season'])
df_coaches['NumDaysCoaching'] = df_coaches['LastDayNum'] - df_coaches['FirstDayNum']
df_coaches['TotalDaysCoaching'] = df_coaches.groupby('CoachName')['NumDaysCoaching'].cumsum()

# Which iteration the coach as been with the team
# Team A -> Team B -> Team A = 1 -> 2 -> 3
df_coaches['TeamNumber'] = df_coaches['TeamID'].ne(df_coaches['TeamID'].shift().bfill()).astype(int)
df_coaches['TeamNumber'] = df_coaches.groupby(['CoachName'])['TeamNumber'].cumsum()

df_coaches['TeamTenure'] = df_coaches.groupby(['CoachName', 'TeamNumber'])['NumDaysCoaching'].cumsum()
df_coaches.sample(20)

Unnamed: 0,Season,TeamID,FirstDayNum,LastDayNum,CoachName,NumDaysCoaching,TotalDaysCoaching,TeamNumber,TeamTenure
3545,1996,1447,0,154,tim_capstraw,154,1078,1,1078
2469,1993,1254,0,154,paul_lizzo,154,1386,1,1386
4738,2000,1385,0,154,mike_jarvis,154,2310,3,308
5438,2002,1441,0,154,steve_shurina,154,308,1,308
9218,2013,1460,0,154,billy_donlon,154,462,1,462
5319,2002,1308,0,154,lou_henson,154,2618,2,770
228,1985,1396,0,154,john_chaney,154,154,1,154
545,1986,1434,0,154,tom_smith,154,308,1,308
6696,2006,1371,0,154,louis_orr,154,924,2,770
4109,1998,1387,0,154,charles_spoonhour,154,2156,2,924


In [9]:
le_coach = LabelEncoder()
df_coaches['CoachName'] = le_coach.fit_transform(df_coaches['CoachName'].values)

In [10]:
def calc_gameid(row):
    min_id = min(row['WTeamID'], row['LTeamID'])
    max_id = max(row['WTeamID'], row['LTeamID'])
    season = row['Season']
    return f'{season}_{min_id}_{max_id}'

In [11]:
# df_detailed['GameID'] = df_detailed.apply(calc_gameid, axis='columns')
df_detailed['WLoc'] = df_detailed['WLoc'].map({'H': 1, 'A': 2, 'N': 3 })

In [12]:
df_detailed_team_W = pd.merge(left = df_detailed,
        right = df_teams,
        left_on = ['WTeamID'],
        right_on = ['TeamID'])

df_detailed_team_L = pd.merge(left = df_detailed_team_W,
        right = df_teams,
        left_on = ['LTeamID'],
        right_on = ['TeamID'],
        suffixes = ('_W', '_L'))

df_master = pd.merge(left = df_detailed_team_W,
         right = df_detailed_team_L,
         on = list(df_detailed.columns),
         suffixes = ('_W', '_L')
        )

df_master = pd.merge(
    left = df_master,
    right = df_coaches,
    left_on = ['Season', 'WTeamID'],
    right_on = ['Season', 'TeamID']#,
    #suffixes = ('_W', '_L')
)

df_master = pd.merge(
    left = df_master,
    right = df_coaches,
    left_on = ['Season', 'LTeamID'],
    right_on = ['Season', 'TeamID'],
     suffixes = ('_W', '_L')
)
df_master

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,TeamNumber_W,TeamTenure_W,TeamID,FirstDayNum_L,LastDayNum_L,CoachName_L,NumDaysCoaching_L,TotalDaysCoaching_L,TeamNumber_L,TeamTenure_L
0,2003,10,1104,68,1328,62,3,0,27,58,...,2,770,1328,0,154,736,154,2464,2,1386
1,2003,98,1400,67,1328,61,1,0,25,64,...,4,770,1328,0,154,736,154,2464,2,1386
2,2003,124,1400,76,1328,71,2,0,27,50,...,4,770,1328,0,154,736,154,2464,2,1386
3,2003,114,1281,67,1328,52,1,0,24,55,...,1,616,1328,0,154,736,154,2464,2,1386
4,2003,70,1329,48,1328,46,1,0,19,49,...,3,2002,1328,0,154,736,154,2464,2,1386
5,2003,54,1280,54,1328,45,3,0,18,50,...,1,770,1328,0,154,736,154,2464,2,1386
6,2003,18,1104,82,1106,56,1,0,24,49,...,2,770,1106,0,154,1111,154,1078,1,1078
7,2003,13,1202,74,1106,73,3,0,29,51,...,1,924,1106,0,154,1111,154,1078,1,1078
8,2003,14,1426,59,1106,47,3,0,25,53,...,1,1694,1106,0,154,1111,154,1078,1,1078
9,2003,89,1341,63,1106,57,1,0,20,41,...,1,154,1106,0,154,1111,154,1078,1,1078


In [21]:
df_master['Team1'] = df_master.apply(lambda r: sorted([r['WTeamID'], r['LTeamID']])[0], axis=1)
df_master['Team2'] = df_master.apply(lambda r: sorted([r['WTeamID'], r['LTeamID']])[1], axis=1)
df_master['Pred'] = df_master['Team1'] == df_master['TeamID']
df_master['Pred'] = df_master['Pred'].astype(float)
df_master['ScoreDiffNorm'] = df_master.apply(lambda r: r['ScoreDiff'] * -1 if r['Pred'] == 0 else r['ScoreDiff'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

Filter out coaches based on the game day to handle coaches who are replaced mid-season

In [13]:
mask = (df_master['FirstDayNum_L'] <= df_master['DayNum'])
mask = mask & (df_master['LastDayNum_L'] >= df_master['DayNum'])
mask = mask & (df_master['FirstDayNum_W'] <= df_master['DayNum'])
mask = mask & (df_master['LastDayNum_W'] >= df_master['DayNum'])

In [14]:
df_master = df_master.loc[mask, :]

In [15]:
df_master.sample(20)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,TeamNumber_W,TeamTenure_W,TeamID,FirstDayNum_L,LastDayNum_L,CoachName_L,NumDaysCoaching_L,TotalDaysCoaching_L,TeamNumber_L,TeamTenure_L
20175,2013,27,1458,81,1143,56,1,0,30,58,...,2,1848,1143,0,154,939,154,3850,3,770
41343,2012,61,1460,74,1227,70,2,1,23,58,...,1,308,1227,0,154,505,154,308,1,308
18017,2003,79,1131,94,1306,64,2,0,33,62,...,1,1386,1306,0,154,1036,154,616,1,616
68159,2014,96,1214,63,1224,47,1,0,18,52,...,1,770,1224,0,154,761,154,1078,2,616
66708,2010,103,1300,81,1255,78,1,0,30,55,...,1,154,1255,0,154,922,154,924,1,924
20924,2016,96,1203,72,1433,69,2,0,27,59,...,2,770,1433,0,154,1371,154,462,2,154
40554,2006,61,1376,65,1178,35,1,0,25,53,...,2,770,1178,0,154,1022,154,2002,1,2002
43654,2007,50,1314,105,1194,52,1,0,37,56,...,2,616,1194,0,154,1069,154,154,1,154
80207,2016,117,1460,55,1156,51,2,0,17,46,...,1,924,1156,0,154,439,154,3080,3,1540
6271,2011,98,1366,63,1394,57,2,1,21,60,...,2,924,1394,0,154,1021,154,2926,3,616


In [16]:
df_master.dtypes[df_master.dtypes==object]

Series([], dtype: object)

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
lr = LogisticRegression()

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
columns = [c for c in df_master if c not in ['WTeamID', 'LTeamID', 'ScoreDiff', 'TeamID']]

In [28]:
X = df_master[columns]
y = df_master.pop('Pred')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8675309)

In [29]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
p = lr.predict(X_test).clip(0.000001, 0.999998)

In [31]:
print('Log Loss:', log_loss(y_test, p))

Log Loss: 1.5155719967274715e-06


# TODO:
* remove detailed dataframe
* calculate team wins and losses rolling for the season
* calculate rolling detailed stats FG %, FT %, 3PT %
* DO NOT USE SCORE