In [None]:
import pandas as pd
import numpy as np

import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

from IPython.display import display

%matplotlib inline

# Limitations
- In the interest of time and simplicity, this year's model ignores the player and event data. The model could be enhanced with that additional data at a later time.
- Seeds will not be used as they are a biased ranking

In [None]:
# Directory holding the input CSV files; the read_csv calls in this notebook join their file names onto it.
data_dir = 'data_2019'

In [None]:
# Load the team table (Teams.csv) and preview its first rows.
df_teams = pd.read_csv(os.path.join(data_dir, 'Teams.csv'))
df_teams.head()

In [None]:
# Span between a team's first and last Division-I season.
df_teams['D1Seasons'] = df_teams['LastD1Season'].sub(df_teams['FirstD1Season'])

# Snapshot the id/name columns before the name is removed from the frame.
# NOTE(review): to_dict() defaults to orient='dict', so this yields
# {'TeamID': {row: id}, 'TeamName': {row: name}} rather than an id -> name
# lookup; confirm that is the shape the consumers of teams_dict expect.
teams_dict = df_teams.loc[:, ['TeamID', 'TeamName']].to_dict()

# The free-text name is removed so only numeric columns remain on df_teams.
df_teams.drop(columns='TeamName', inplace=True)
df_teams.head()

In [None]:
# Load the per-game box scores (RegularSeasonDetailedResults.csv) and preview the first rows.
df_detailed = pd.read_csv(os.path.join(data_dir, 'RegularSeasonDetailedResults.csv'))
df_detailed.head()

In [None]:
# Per-game margin: WScore minus LScore.
df_detailed['ScoreDiff'] = df_detailed['WScore'] - df_detailed['LScore']

In [None]:
ignore_cols = ['WLoc']
include_cols = ['DayNum', 'Season']
w_cols = [c for c in df_detailed 
          if (c.startswith('W') and c not in ignore_cols) or 
              c in include_cols]
df_stats_W = df_detailed.loc[:, w_cols]
c_names = {c:(c[1:] if c not in include_cols else c) for c in df_stats_W}
df_stats_W.rename(columns=c_names, inplace=True)
df_stats_W['W'] = 1
df_stats_W['L'] = 0
df_stats_W.head()

In [None]:
ignore_cols = ['WLoc']
include_cols = ['DayNum', 'Season']
cols = [c for c in df_detailed if (c.startswith('L') and c not in ignore_cols) or c in include_cols]
df_stats_L = df_detailed.loc[:, cols]
c_names = {c:(c[1:] if c not in include_cols else c) for c in df_stats_L}
df_stats_L.rename(columns=c_names, inplace=True)
df_stats_L['W'] = 0
df_stats_L['L'] = 1
df_stats_L.head()

In [None]:
# Stack the W-side and L-side rows into one team-game table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_stats = pd.concat([df_stats_W, df_stats_L])

# Sum per (team, season, day), then accumulate within each (team, season)
# so every row holds the season-to-date totals through that day.
df_c_stats = (
    df_stats
    .groupby(['TeamID', 'Season', 'DayNum']).sum()
    .groupby(level=[0, 1]).cumsum()
    .reset_index()
)

# 1-based ordinal of each row within its (team, season).
df_c_stats['GameNum'] = df_c_stats.groupby(['TeamID', 'Season']).cumcount() + 1
df_c_stats = df_c_stats.set_index(['TeamID', 'Season'])
df_c_stats.head()

In [None]:
# Season-to-date shooting percentages: made / attempted for each shot type.
for pct_name, made_col, att_col in (
    ('FG_PCT', 'FGM', 'FGA'),
    ('FG3_PCT', 'FGM3', 'FGA3'),
    ('FT_PCT', 'FTM', 'FTA'),
):
    df_c_stats[pct_name] = df_c_stats[made_col].div(df_c_stats[att_col])

# Per-game averages: cumulative total divided by the number of games so far.
PGL = ['Score', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
games_played = df_c_stats['GameNum']
for stat in PGL:
    df_c_stats[f'{stat}_PG'] = df_c_stats[stat].div(games_played)
df_c_stats.head()

In [None]:
# Build an all-zero "game 0" row for every (team, season) so each team has a
# baseline record before its first game.
mask = (df_c_stats['GameNum'] == 1)
df0_i = df_c_stats[mask].index
# Construct the zeros directly; this avoids an object-dtype NaN frame and the
# fillna downcast the previous approach relied on.
df0 = pd.DataFrame(0, index=df0_i, columns=df_c_stats.columns)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_c = pd.concat([df_c_stats, df0]).reset_index().set_index(['TeamID', 'Season'])
df_c

In [None]:
# Load the coach assignment table (TeamCoaches.csv) and preview the first rows.
df_coaches = pd.read_csv(os.path.join(data_dir, 'TeamCoaches.csv'))
df_coaches.head()

In [None]:
# Order each coach's rows chronologically. FirstDayNum breaks ties when a coach
# has more than one row in the same season, keeping the cumulative sums
# below deterministic.
df_coaches = df_coaches.sort_values(by=['CoachName', 'Season', 'FirstDayNum'])
df_coaches['NumDaysCoaching'] = df_coaches['LastDayNum'] - df_coaches['FirstDayNum']
df_coaches['TotalDaysCoaching'] = df_coaches.groupby('CoachName')['NumDaysCoaching'].cumsum()

# Which iteration the coach has been with the team
# Team A -> Team B -> Team A = 1 -> 2 -> 3
# The previous team is looked up *within* each coach: a plain shift() would
# compare a coach's first row against the previous coach's last team, making
# the numbering start at 0 or 1 depending on an unrelated neighbour.
prev_team = df_coaches.groupby('CoachName')['TeamID'].shift()
# A coach's first row has no previous team (NaN), so it counts as a change
# and the numbering starts at 1.
df_coaches['TeamNumber'] = df_coaches['TeamID'].ne(prev_team).astype(int)
df_coaches['TeamNumber'] = df_coaches.groupby(['CoachName'])['TeamNumber'].cumsum()

# Days coached so far within the current stint at the current team.
df_coaches['TeamTenure'] = df_coaches.groupby(['CoachName', 'TeamNumber'])['NumDaysCoaching'].cumsum()
df_coaches.sample(20)

In [None]:
# Replace coach names with integer codes; le_coach keeps the fitted mapping.
le_coach = LabelEncoder()
df_coaches['CoachName'] = le_coach.fit_transform(df_coaches['CoachName'].values)

In [None]:
def calc_gameid(row):
    """Return the matchup key ``'<Season>_<lower id>_<higher id>'`` for a game.

    ``row`` is any mapping-like object exposing ``'WTeamID'``, ``'LTeamID'``
    and ``'Season'``. The two team ids are ordered so the key does not depend
    on which team won.
    """
    w_id, l_id = row['WTeamID'], row['LTeamID']
    low_id = l_id if l_id < w_id else w_id
    high_id = l_id if l_id > w_id else w_id
    return f"{row['Season']}_{low_id}_{high_id}"

In [None]:
# Attach team attributes to each game by matching WTeamID against TeamID.
df_detailed_team_W = df_detailed.merge(
    df_teams,
    left_on=['WTeamID'],
    right_on=['TeamID'],
)

# Attach team attributes a second time by matching LTeamID; columns that
# now exist on both sides are disambiguated with the _W / _L suffixes.
df_detailed_team_L = df_detailed_team_W.merge(
    df_teams,
    left_on=['LTeamID'],
    right_on=['TeamID'],
    suffixes=('_W', '_L'),
)


In [None]:
# Join the WTeamID-merged frame to the LTeamID-merged frame on every original game column.
# NOTE(review): df_detailed_team_L was itself built from df_detailed_team_W, so
# this join looks redundant - confirm it is needed.
df_master = pd.merge(left = df_detailed_team_W,
         right = df_detailed_team_L,
         on = list(df_detailed.columns),
         suffixes = ('_W', '_L')
        )

# Attach coach records by matching (Season, WTeamID) against (Season, TeamID).
# No suffixes are passed here, so pandas' defaults apply to overlapping names.
df_master = pd.merge(
    left = df_master,
    right = df_coaches,
    left_on = ['Season', 'WTeamID'],
    right_on = ['Season', 'TeamID']#,
    #suffixes = ('_W', '_L')
)

# Attach coach records again by matching (Season, LTeamID); coach columns that
# now exist on both sides get the _W / _L suffixes.
# NOTE(review): several TeamID-like columns are in play after these merges;
# verify which one ends up named plain 'TeamID' before relying on it.
df_master = pd.merge(
    left = df_master,
    right = df_coaches,
    left_on = ['Season', 'LTeamID'],
    right_on = ['Season', 'TeamID'],
     suffixes = ('_W', '_L')
)
df_master.sample(20)

In [None]:
# Orient every game as (Team1, Team2) = (lower team id, higher team id) so the
# label does not depend on which side happened to win.
matchup_ids = df_master[['WTeamID', 'LTeamID']]
df_master['Team1'] = matchup_ids.min(axis=1)
df_master['Team2'] = matchup_ids.max(axis=1)

# Target: 1.0 when Team1 (the lower id) won the game.
# This is derived from WTeamID directly. The plain 'TeamID' column is a
# by-product of the chained merges and does not reliably identify the winner.
team1_won = df_master['Team1'] == df_master['WTeamID']
df_master['Pred'] = team1_won.astype(float)

# Score margin from Team1's point of view: positive when Team1 won.
df_master['ScoreDiffNorm'] = np.where(team1_won, df_master['ScoreDiff'], -df_master['ScoreDiff'])

# Per-game win/loss indicators for each oriented side.
df_master['Team1W'] = team1_won
df_master['Team1L'] = df_master['Team1'] == df_master['LTeamID']
df_master['Team2W'] = df_master['Team2'] == df_master['WTeamID']
df_master['Team2L'] = df_master['Team2'] == df_master['LTeamID']
# TODO: turn these indicators into season-to-date win/loss counts per team.

In [None]:
# Tally the win/loss indicators per (TeamID, Season, DayNum).
# 'GameNum' only exists on the cumulative-stats frame and was never merged into
# df_master, so grouping by it raised a KeyError; DayNum is the per-game key
# available here. The fourth column is Team2L (it was a repeated Team1W).
df_master.groupby(['TeamID', 'Season', 'DayNum'])[['Team1W', 'Team1L', 'Team2W', 'Team2L']].sum()

In [None]:
# Preview the merged game table.
df_master.head()

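Keep only the games whose day falls within both coaches' tenure windows (`FirstDayNum` to `LastDayNum`), so that teams which replaced a coach mid-season are matched to the coach in charge on game day.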

In [None]:
# True where the game day lies inside both the _W and the _L coach windows
# (bounds inclusive on both ends).
game_day = df_master['DayNum']
mask = (
    game_day.between(df_master['FirstDayNum_L'], df_master['LastDayNum_L'])
    & game_day.between(df_master['FirstDayNum_W'], df_master['LastDayNum_W'])
)

In [None]:
# Keep only the rows selected by the boolean mask.
df_master = df_master[mask]

In [None]:
# Spot-check 20 random rows of the filtered table.
df_master.sample(20)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier using the L-BFGS solver.
lr = LogisticRegression(solver='lbfgs')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Columns that must not be used as model features.
excluded_cols = [
    'WTeamID', 'LTeamID', 'ScoreDiff', 'TeamID',  # identifiers / raw margin
    'Pred',                                       # the target itself
    'ScoreDiffNorm', 'WScore', 'LScore',          # final score (see TODO: do not use score)
    'Team1W', 'Team1L', 'Team2W', 'Team2L',       # direct encodings of the result
    'WLoc',                                       # presumably string-coded location - TODO confirm / encode
]
# NOTE(review): the remaining W*/L* box-score columns are still oriented by
# game outcome; they need re-orienting to Team1/Team2 before the model is trusted.
columns = [c for c in df_master if c not in excluded_cols]

In [None]:
# Target vector. Read it without pop() so df_master is left intact and the
# cell can be re-run.
y = df_master['Pred']
# Feature matrix. The target is dropped defensively so it can never leak into
# X, even if it is present in `columns`.
X = df_master[columns].drop(columns=['Pred'], errors='ignore')
# Fixed random_state keeps the 80/20 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8675309)

In [None]:
# Fit the classifier on the training split.
lr.fit(X_train, y_train)

In [None]:
# Log loss is defined on probabilities, so take P(Pred == 1) from predict_proba
# (column 1 follows the sorted class order [0.0, 1.0]); predict() would
# return hard 0/1 labels. Clipping keeps the log terms finite.
# NOTE(review): the clip bounds are asymmetric (1e-6 vs 1 - 2e-6) - confirm intent.
p = lr.predict_proba(X_test)[:, 1].clip(0.000001, 0.999998)

In [None]:
# Report the log loss of the predictions against the held-out labels.
print('Log Loss:', log_loss(y_test, p))

# TODO:
* [ ] calculate team wins and losses rolling for the season
  * Added game-0 rows; still need to shift the W/L columns down one game so each row shows the record coming into that game
* [X] calculate rolling detailed stats FG %, FT %, 3PT %
  * [ ] shift to be stats prior to game?
  * How should the first game of a season be handled, since the shift makes every stat NaN? Currently all stats are filled with 0.
* [X] DO NOT USE SCORE