In [None]:
import pandas as pd
import numpy as np

import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

from IPython.display import display

%matplotlib inline

# Limitations
- In the interest of time and simplicity, we will ignore the player and event data in this year's model. The model could be expanded at a later time to enhance it with this additional data.
- Seeds will not be used as they are a biased ranking

In [None]:
# Directory containing the input CSV files; the read_csv calls in this notebook join
# their file names onto it.
data_dir = 'data_2019'

In [None]:
# Load the team table from Teams.csv and preview the first rows.
df_teams = pd.read_csv(os.path.join(data_dir, 'Teams.csv'))
df_teams.head()

In [None]:
# Difference between each team's last and first Division I season.
df_teams['D1Seasons'] = df_teams['LastD1Season'] - df_teams['FirstD1Season']
# NOTE(review): to_dict() with its default orient returns a dict keyed by column name
# ({'TeamID': {row: id}, 'TeamName': {row: name}}), not a TeamID -> TeamName mapping.
# Confirm this is the shape wanted for name lookups.
teams_dict = df_teams[['TeamID', 'TeamName']].to_dict()
# Remove the name column from the working frame. Because it is deleted in place,
# re-running this cell without reloading df_teams fails on the TeamName lookup above.
del df_teams['TeamName']
df_teams.head()

In [None]:
# Load the per-game box scores from RegularSeasonDetailedResults.csv and preview them.
df_detailed = pd.read_csv(os.path.join(data_dir, 'RegularSeasonDetailedResults.csv'))
df_detailed.head()

In [None]:
# Score margin of each game: WScore minus LScore.
df_detailed['ScoreDiff'] = df_detailed['WScore'] - df_detailed['LScore']

In [None]:
# Per-team view of every game built from the W-prefixed columns of the results.
ignore_cols = ['WLoc']                # W-prefixed column that is not a team stat
include_cols = ['DayNum', 'Season']   # context columns carried over unchanged

# Keep the context columns plus every W-prefixed column that is not ignored.
w_cols = [c for c in df_detailed.columns
          if c in include_cols or (c.startswith('W') and c not in ignore_cols)]

# Strip the leading 'W' from the stat columns; context columns keep their names.
c_names = {c: (c if c in include_cols else c[1:]) for c in w_cols}

# Each of these rows counts as one win and zero losses for the team it describes.
df_stats_W = (
    df_detailed.loc[:, w_cols]
    .rename(columns=c_names)
    .assign(W=1, L=0)
)
df_stats_W.head()

In [None]:
# Per-team view of every game built from the L-prefixed columns of the results.
ignore_cols = ['WLoc']
include_cols = ['DayNum', 'Season']   # context columns carried over unchanged

# Keep the context columns plus every L-prefixed column that is not ignored.
cols = [c for c in df_detailed.columns
        if c in include_cols or (c.startswith('L') and c not in ignore_cols)]

# Strip the leading 'L' from the stat columns; context columns keep their names.
c_names = {c: (c if c in include_cols else c[1:]) for c in cols}

# Each of these rows counts as zero wins and one loss for the team it describes.
df_stats_L = (
    df_detailed.loc[:, cols]
    .rename(columns=c_names)
    .assign(W=0, L=1)
)
df_stats_L.head()

In [None]:
# Stack the W-side and L-side rows into one long frame (one row per team per game),
# ordered by team, season and game day.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed
# in pandas 2.0.
df_stats = pd.concat([df_stats_W, df_stats_L])
df_stats = df_stats.sort_values(by=['TeamID', 'Season', 'DayNum'])
df_stats.head()

In [None]:
# Sum the stats for each (TeamID, Season, DayNum), then accumulate them within each
# (TeamID, Season) so every row holds the running totals up to and including that day.
df_c_stats = (
    df_stats
    .groupby(['TeamID', 'Season', 'DayNum']).sum()
    .groupby(level=[0, 1]).cumsum()
    .reset_index()
)
# 1-based running game counter within each team-season.
df_c_stats['GameNum'] = df_c_stats.groupby(['TeamID', 'Season']).cumcount() + 1
df_c_stats = df_c_stats.set_index(['TeamID', 'Season'])
df_c_stats.head()

In [None]:
# Shooting percentages computed from the running made / attempted totals.
df_c_stats['FG_PCT'] = df_c_stats['FGM'].div(df_c_stats['FGA'])
df_c_stats['FG3_PCT'] = df_c_stats['FGM3'].div(df_c_stats['FGA3'])
df_c_stats['FT_PCT'] = df_c_stats['FTM'].div(df_c_stats['FTA'])

# Running totals turned into per-game averages by dividing by the game counter.
PGL = ['Score', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
for pg in PGL:
    df_c_stats[f'{pg}_PG'] = df_c_stats[pg].div(df_c_stats['GameNum'])
df_c_stats.head()

In [None]:
# One row per game keyed by (Season, DayNum, TeamID1, TeamID2), where TeamID1 is the
# smaller of the two team ids and TeamID2 the larger. Pred is 1.0 when TeamID1 is the
# WTeamID of the game and 0.0 otherwise.
df_result = df_detailed.loc[:, ['Season', 'DayNum', 'WTeamID', 'LTeamID']]
# Element-wise min/max gives the same ids as sorting each pair, without running a
# Python-level function once per row (the previous version did that twice).
df_result['TeamID1'] = np.minimum(df_result['WTeamID'], df_result['LTeamID'])
df_result['TeamID2'] = np.maximum(df_result['WTeamID'], df_result['LTeamID'])
df_result['Pred'] = (df_result['TeamID1'] == df_result['WTeamID']).astype(float)
df_result = df_result.drop(columns=['WTeamID', 'LTeamID'])
df_result.head()

In [None]:
# Load the coach records from TeamCoaches.csv and preview the first rows.
df_coaches = pd.read_csv(os.path.join(data_dir, 'TeamCoaches.csv'))
df_coaches.head()

In [None]:
# Which stint (iteration) the coach is on with a team.
# Team A -> Team B -> Team A = 1 -> 2 -> 3

df_coaches = df_coaches.sort_values(by=['CoachName', 'Season'])
# Flag rows whose TeamID differs from the row directly above in the sorted frame; the
# first row is compared against itself through the back-fill, so it is never flagged.
team_changed = (df_coaches['TeamID'] != df_coaches['TeamID'].shift().bfill()).astype(int)
# Running count of those flags within each coach.
df_coaches['TeamNumber'] = team_changed.groupby(df_coaches['CoachName']).cumsum()
df_coaches.head(30)

In [None]:
# Some coaches end up numbered from 0 instead of 1; bump every row of those coaches
# by one so that all coaches are numbered from 1.

team_number_0_coaches = df_coaches.loc[df_coaches['TeamNumber'] == 0, 'CoachName'].unique()
row_filter = df_coaches['CoachName'].isin(team_number_0_coaches)
df_coaches.loc[row_filter, 'TeamNumber'] += 1

In [None]:
# Preview the coach table after the TeamNumber adjustment.
df_coaches.head()

In [None]:
def calc_gameid(row):
    """Build a game identifier of the form '<Season>_<lower id>_<higher id>'.

    `row` is any mapping-like object exposing 'WTeamID', 'LTeamID' and 'Season'.
    The two team ids are ordered smallest first, so the identifier does not depend
    on which of the two teams is listed as the winner.
    """
    team_ids = (row['WTeamID'], row['LTeamID'])
    return '{}_{}_{}'.format(row['Season'], min(team_ids), max(team_ids))

In [None]:
# Columns carried into the model table: running record, game counters, shooting
# percentages and per-game averages.
df_c_pg = df_c_stats.loc[:, ['W', 'L', 'GameNum', 'DayNum', 'FG_PCT',
       'FG3_PCT', 'FT_PCT', 'Score_PG', 'OR_PG', 'DR_PG', 'Ast_PG', 'TO_PG',
       'Stl_PG', 'Blk_PG', 'PF_PG']]

# One all-zero "game 0" row per team-season, indexed like each team's first game.
mask = (df_c_pg['GameNum'] == 1)
df0_i = df_c_pg[mask].index
# Build the zero rows directly rather than filling an empty object-dtype frame.
df0 = pd.DataFrame(0, index=df0_i, columns=df_c_pg.columns)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_c = pd.concat([df_c_pg, df0]).reset_index()
# display() is needed here: only the last expression of a cell is rendered.
display(df_c.head())

df_c_pg.head()

In [None]:
# Replace each running stat with its value from the previous row of the same
# (TeamID, Season) group, so a row shows the values coming into the game.
# GameNum and DayNum are left untouched.
shift_cols = ['W', 'L', 'FG_PCT',
              'FG3_PCT', 'FT_PCT', 'Score_PG', 'OR_PG', 'DR_PG', 'Ast_PG', 'TO_PG',
              'Stl_PG', 'Blk_PG', 'PF_PG']
shifted = df_c_pg.groupby(level=[0, 1])[shift_cols].shift(1)
for shift_col in shift_cols:
    df_c_pg[shift_col] = shifted[shift_col]

# The first row of every group has nothing before it, so it is NaN and dropped here.
df_c_pg = df_c_pg.dropna()
df_c_pg.head()

In [None]:
# Sanity check: list any remaining first-game rows. Their shifted stats were NaN, so
# after the dropna in the previous cell this is expected to be empty.
df_c_pg[df_c_pg['GameNum']==1]

In [None]:
# Pair every game row with every coach record of the same team and season.
df_c_pg_coaches = df_c_pg.reset_index().merge(df_coaches, on=['TeamID', 'Season'])

# Keep only the coach record whose FirstDayNum..LastDayNum window contains the game
# day; this handles coaches who are replaced mid-season.
mask = (
    df_c_pg_coaches['DayNum'].ge(df_c_pg_coaches['FirstDayNum'])
    & df_c_pg_coaches['DayNum'].le(df_c_pg_coaches['LastDayNum'])
)
df_c_pg_coaches = df_c_pg_coaches[mask]
df_c_pg_coaches.head()

In [None]:
# Seasons elapsed since the earliest season in which the coach appears in this frame.
df_c_pg_coaches['SeasonsCoaching'] = df_c_pg_coaches['Season'].sub(
    df_c_pg_coaches.groupby('CoachName')['Season'].transform('min')
)
# The tenure window columns are not used after this point.
df_c_pg_coaches = df_c_pg_coaches.drop(columns=['FirstDayNum', 'LastDayNum'])
df_c_pg_coaches.head(30)

In [None]:
# One indicator column per coach name, swapped in for the CoachName column
# (joined back on the row index).
one_hot = pd.get_dummies(df_c_pg_coaches['CoachName'])
df_c_pg_coaches = df_c_pg_coaches.drop(columns='CoachName').join(one_hot)
df_c_pg_coaches.head()

In [None]:
# Step 1: attach the stats of the lower-id team (TeamID1) to every game result.
df_result_team1_details = pd.merge(
    left=df_c_pg_coaches,
    right=df_result,
    left_on=['Season', 'DayNum', 'TeamID'],
    right_on=['Season', 'DayNum', 'TeamID1'],
)
# display() is needed here: only the last expression of a cell is rendered.
display(df_result_team1_details.head())

# Step 2: attach the stats of the higher-id team (TeamID2).
# The left frame is matched on TeamID2, so it carries team 2's stats, while the right
# frame already carries team 1's stats from step 1. The suffixes are therefore
# ('_Team2', '_Team1'); the previous ('_Team1', '_Team2') labelled every suffixed
# column with the wrong team.
df_result_team_details = pd.merge(
    left=df_c_pg_coaches,
    right=df_result_team1_details,
    left_on=['Season', 'DayNum', 'TeamID'],
    right_on=['Season', 'DayNum', 'TeamID2'],
    suffixes=('_Team2', '_Team1'),
)

df_result_team_details.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Feature columns: everything except the two team-id keys and the target. 'Pred' has
# to be excluded here, otherwise the label itself is handed to the model as a feature.
columns = [c for c in df_result_team_details if c not in ['TeamID1', 'TeamID2', 'Pred']]

In [None]:
df_master = df_result_team_details.copy()
# Read the target without pop(): popping removed 'Pred' from df_result_team_details in
# place, so re-running this cell raised a KeyError.
y = df_result_team_details['Pred']
# Keep the target out of the feature matrix even if `columns` still lists it;
# training on a frame that contains 'Pred' leaks the label into the model.
X = df_result_team_details[[c for c in columns if c != 'Pred']]
# Hold out 20% of the games for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8675309)

In [None]:
# Fit a logistic regression on the training split; fit() returns the estimator itself.
lr = LogisticRegression(max_iter=10000, solver='lbfgs').fit(X_train, y_train)
lr

In [None]:
# Predicted class probabilities for the held-out rows, clipped away from exact 0 and 1.
# NOTE(review): the bounds are not symmetric (1e-7 below, 1 - 1e-8 above) -- confirm
# that this is intended.
p = lr.predict_proba(X_test).clip(0.0000001, 0.99999999)

In [None]:
# Report the log loss of the clipped probabilities against the held-out labels.
print('Log Loss:', log_loss(y_test, p))

# TODO:
* [X] calculate team wins and losses rolling for the season
  * Added Game 0 values; still need to determine how to shift the W/L columns down so they show the values coming into each game
* [X] calculate rolling detailed stats FG %, FT %, 3PT %
  * [X] shift to be stats prior to game?
  * How to handle the first game of the season, since the shift makes all of its stats NaN? Ignore the first game of the season.
* [X] DO NOT USE SCORE (We won't know it when predicting future games. We can use the season average score though)
* [X] Fine tune coach tenure (first season games are all currently 154 but it should be 0, 7, 14, 21 etc)
* [X] One hot encode coaches?
* [X] Shift per group not over all dataframe