In [1]:
!pip install flask-sqlalchemy



In [2]:
# Import dependencies

# For data processing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# For ML
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# For Database connections
import sqlalchemy 
from sqlalchemy import create_engine, func
from sqlalchemy import inspect
# from sqlalchemy import session
import psycopg2
from sqlalchemy.ext.automap import automap_base

In [3]:
# Connect to Database - this will be done via config.py in the future
host = "group2022.cem6bfyajguw.us-east-2.rds.amazonaws.com"
database = "postgres"
user = 'postgres'
password = 'postgrespassword'

db_string = f"postgresql://{user}:{password}@{host}/{database}"

In [4]:
# Set up SQLAlchemy
engine = create_engine(db_string)
insp = inspect(engine)
insp.get_table_names()

['teams',
 'regular_season_detailed_results',
 'all_game_results',
 'conferences',
 'rankings_with_team_names',
 'conferences_with_team_names',
 'sag_system',
 'rankings']

In [5]:
# Teams raw DF
teams_df=pd.read_sql_table('teams', engine)
print(teams_df.shape)
teams_df.head()

(372, 4)


Unnamed: 0,teamid,teamname,firstd1season,lastd1season
0,1101,Abilene Chr,2014,2022
1,1102,Air Force,1985,2022
2,1103,Akron,1985,2022
3,1104,Alabama,1985,2022
4,1105,Alabama A&M,2000,2022


In [6]:
# Rankings raw DF
rankings_df=pd.read_sql_table('rankings', engine)
print(rankings_df.shape)
rankings_df.head()

(4521720, 5)


Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank
0,2010,113,BPI,1315,332
1,2010,113,BPI,1316,238
2,2010,113,BPI,1317,145
3,2010,113,BPI,1318,73
4,2010,113,BPI,1319,206


In [7]:
# Checking ranking systems
rankings_df['systemname'].value_counts()

SAG    122047
MOR    121057
POM    118566
DOK    105270
WLK     97047
        ...  
HRN       351
CRW       351
PMC       351
BP5       345
PH        326
Name: systemname, Length: 187, dtype: int64

In [8]:
# We need to get just SAG ranks for simplicity
SAG_ranks = rankings_df.loc[rankings_df['systemname']=='SAG']
SAG_ranks.sample(5)

Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank
898500,2014,58,SAG,1132,182
176139,2011,86,SAG,1412,67
3975721,2007,121,SAG,1407,215
3595683,2005,29,SAG,1453,238
3535834,2004,99,SAG,1393,46


In [9]:
# Results raw DF

raw_results_df=pd.read_sql_table('regular_season_detailed_results', engine)
print(raw_results_df.shape)
raw_results_df.head()

(100423, 34)


Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lfga3,lftm,lfta,lor,ldr,last,lto,lstl,lblk,lpf
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [10]:
raw_results_df.columns

Index(['season', 'daynum', 'wteamid', 'wscore', 'lteamid', 'lscore', 'wloc',
       'numot', 'wfgm', 'wfga', 'wfgm3', 'wfga3', 'wftm', 'wfta', 'wor', 'wdr',
       'wast', 'wto', 'wstl', 'wblk', 'wpf', 'lfgm', 'lfga', 'lfgm3', 'lfga3',
       'lftm', 'lfta', 'lor', 'ldr', 'last', 'lto', 'lstl', 'lblk', 'lpf'],
      dtype='object')

In [11]:
# We want to drop years before 2003, because rankings go to 2003. Also drop extra 'ordinalrank'
results_df = raw_results_df[raw_results_df['season']>=2003]
results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lfga3,lftm,lfta,lor,ldr,last,lto,lstl,lblk,lpf
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


# Preprocessing & Feature Engineering


In [12]:
# Make a new column in SAG_Ranks called 'Season_Day_Team' that we will repeat in Results 

SAG_ranks['season-day-team'] = SAG_ranks['season'].map(str) + '-' + SAG_ranks['rankingdaynum'].map(str) + '-' + SAG_ranks['teamid'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank,season-day-team
6393,2010,113,SAG,1102,230,2010-113-1102
6394,2010,113,SAG,1103,101,2010-113-1103
6395,2010,113,SAG,1104,83,2010-113-1104
6396,2010,113,SAG,1105,334,2010-113-1105
6397,2010,113,SAG,1106,314,2010-113-1106


In [13]:
# Make new columns in SAG_Ranks called 'RankingWeek' and 'Season_Week_Team' because rankings only come out weekly

SAG_ranks['rankingweek'] = (SAG_ranks['rankingdaynum']/7).apply(np.ceil)

SAG_ranks['season-week-team'] = SAG_ranks['season'].map(str) + '-' + SAG_ranks['rankingweek'].map(str) + '-' + SAG_ranks['teamid'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank,season-day-team,rankingweek,season-week-team
6393,2010,113,SAG,1102,230,2010-113-1102,17.0,2010-17.0-1102
6394,2010,113,SAG,1103,101,2010-113-1103,17.0,2010-17.0-1103
6395,2010,113,SAG,1104,83,2010-113-1104,17.0,2010-17.0-1104
6396,2010,113,SAG,1105,334,2010-113-1105,17.0,2010-17.0-1105
6397,2010,113,SAG,1106,314,2010-113-1106,17.0,2010-17.0-1106


In [14]:
SAG_ranks.shape

(122047, 8)

In [15]:
# Make a new column in Results called 'WSeason_Day_Team' & 'LSeason_Day_Team' that we'll use for potential indices

results_df['wseason-Day-Team'] = results_df['season'].map(str) + '-' + results_df['daynum'].map(str) + '-' + results_df['wteamid'].map(str)
results_df['lseason-Day-Team'] = results_df['season'].map(str) + '-' + results_df['daynum'].map(str) + '-' + results_df['lteamid'].map(str)
results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lfta,lor,ldr,last,lto,lstl,lblk,lpf,wseason-Day-Team,lseason-Day-Team
0,2003,10,1104,68,1328,62,N,0,27,58,...,22,10,22,8,18,9,2,20,2003-10-1104,2003-10-1328
1,2003,10,1272,70,1393,63,N,0,26,62,...,20,20,25,7,12,8,6,16,2003-10-1272,2003-10-1393
2,2003,11,1266,73,1437,61,N,0,24,58,...,23,31,22,9,12,2,5,23,2003-11-1266,2003-11-1437
3,2003,11,1296,56,1457,50,N,0,18,38,...,15,17,20,9,19,4,3,23,2003-11-1296,2003-11-1457
4,2003,11,1400,77,1208,71,N,0,30,61,...,27,21,15,12,10,7,1,14,2003-11-1400,2003-11-1208


In [16]:
# Make a new column in Results called 'WSeason_Week_Team' & 'LSeason_Week_Team' that we'll use for loc to pull in OrdinalRank for winning and losing team

results_df['weeknum'] = (results_df['daynum']/7).apply(np.ceil)

results_df['wseason-week-team'] = results_df['season'].map(str) + '-' + results_df['weeknum'].map(str) + '-' + results_df['wteamid'].map(str)
results_df['lseason-week-team'] = results_df['season'].map(str) + '-' + results_df['weeknum'].map(str) + '-' + results_df['lteamid'].map(str)
results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,last,lto,lstl,lblk,lpf,wseason-Day-Team,lseason-Day-Team,weeknum,wseason-week-team,lseason-week-team
0,2003,10,1104,68,1328,62,N,0,27,58,...,8,18,9,2,20,2003-10-1104,2003-10-1328,2.0,2003-2.0-1104,2003-2.0-1328
1,2003,10,1272,70,1393,63,N,0,26,62,...,7,12,8,6,16,2003-10-1272,2003-10-1393,2.0,2003-2.0-1272,2003-2.0-1393
2,2003,11,1266,73,1437,61,N,0,24,58,...,9,12,2,5,23,2003-11-1266,2003-11-1437,2.0,2003-2.0-1266,2003-2.0-1437
3,2003,11,1296,56,1457,50,N,0,18,38,...,9,19,4,3,23,2003-11-1296,2003-11-1457,2.0,2003-2.0-1296,2003-2.0-1457
4,2003,11,1400,77,1208,71,N,0,30,61,...,12,10,7,1,14,2003-11-1400,2003-11-1208,2.0,2003-2.0-1400,2003-2.0-1208


In [17]:
# Merging winning team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['season-week-team','ordinalrank']], how='left', left_on='wseason-week-team', right_on='season-week-team').drop(columns=['season-week-team'])

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lto,lstl,lblk,lpf,wseason-Day-Team,lseason-Day-Team,weeknum,wseason-week-team,lseason-week-team,ordinalrank
26513,2008,18,1102,58,1290,40,N,0,22,41,...,12,6,2,15,2008-18-1102,2008-18-1290,3.0,2008-3.0-1102,2008-3.0-1290,90.0
32450,2009,15,1421,84,1251,56,H,0,29,59,...,11,6,2,20,2009-15-1421,2009-15-1251,3.0,2009-3.0-1421,2009-3.0-1251,222.0
105077,2022,61,1123,81,1132,80,H,0,27,60,...,14,10,3,23,2022-61-1123,2022-61-1132,9.0,2022-9.0-1123,2022-9.0-1132,250.0
47221,2011,86,1403,92,1235,83,A,0,35,64,...,10,7,4,18,2011-86-1403,2011-86-1235,13.0,2011-13.0-1403,2011-13.0-1235,175.0
96718,2020,75,1430,72,1410,70,H,0,21,50,...,5,7,2,22,2020-75-1430,2020-75-1410,11.0,2020-11.0-1430,2020-11.0-1410,282.0


In [18]:
# Rename OrdinalRank to WRank
results_df.rename(columns={'ordinalrank': 'wrank'},inplace=True)
results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lto,lstl,lblk,lpf,wseason-Day-Team,lseason-Day-Team,weeknum,wseason-week-team,lseason-week-team,wrank
43274,2010,117,1237,86,1355,76,A,0,32,56,...,11,3,1,21,2010-117-1237,2010-117-1355,17.0,2010-17.0-1237,2010-17.0-1355,113.0
71309,2015,130,1433,70,1350,67,N,0,22,53,...,13,6,4,18,2015-130-1433,2015-130-1350,19.0,2015-19.0-1433,2015-19.0-1350,31.0
49977,2012,22,1110,74,1346,73,H,0,17,46,...,13,6,3,33,2012-22-1110,2012-22-1346,4.0,2012-4.0-1110,2012-4.0-1346,190.0
20959,2007,18,1378,82,1354,69,H,0,27,47,...,11,5,3,24,2007-18-1378,2007-18-1354,3.0,2007-3.0-1378,2007-3.0-1354,
86316,2018,96,1280,72,1208,57,H,0,27,53,...,12,1,4,16,2018-96-1280,2018-96-1208,14.0,2018-14.0-1280,2018-14.0-1208,71.0


In [19]:
# Merging losing team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['season-week-team','ordinalrank']], how='left', left_on='lseason-week-team', right_on='season-week-team').drop(columns=['season-week-team'])

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lstl,lblk,lpf,wseason-Day-Team,lseason-Day-Team,weeknum,wseason-week-team,lseason-week-team,wrank,ordinalrank
5892,2003,131,1181,75,1314,63,N,0,27,50,...,6,5,19,2003-131-1181,2003-131-1314,19.0,2003-19.0-1181,2003-19.0-1314,13.0,55.0
13852,2004,129,1106,63,1380,62,N,0,21,38,...,11,2,20,2004-129-1106,2004-129-1380,19.0,2004-19.0-1106,2004-19.0-1380,270.0,291.0
102507,2019,61,1362,72,1337,69,A,0,25,52,...,9,2,19,2019-61-1362,2019-61-1337,9.0,2019-9.0-1362,2019-9.0-1337,44.0,214.0
80129,2015,93,1423,77,1241,65,A,0,25,59,...,4,2,16,2015-93-1423,2015-93-1241,14.0,2015-14.0-1423,2015-14.0-1241,187.0,178.0
119550,2022,96,1385,75,1139,72,A,0,26,54,...,5,1,19,2022-96-1385,2022-96-1139,14.0,2022-14.0-1385,2022-14.0-1139,65.0,109.0


In [20]:
# Rename OrdinalRank to LRank
results_df.rename(columns={'ordinalrank': 'lrank'},inplace=True)
results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lstl,lblk,lpf,wseason-Day-Team,lseason-Day-Team,weeknum,wseason-week-team,lseason-week-team,wrank,lrank
112546,2021,55,1324,83,1178,80,A,0,30,59,...,8,2,0,2021-55-1324,2021-55-1178,8.0,2021-8.0-1324,2021-8.0-1178,280.0,234.0
75862,2014,128,1177,60,1207,56,N,0,19,46,...,6,1,18,2014-128-1177,2014-128-1207,19.0,2014-19.0-1177,2014-19.0-1207,156.0,51.0
97281,2018,80,1304,72,1276,52,H,0,26,47,...,4,2,19,2018-80-1304,2018-80-1276,12.0,2018-12.0-1304,2018-12.0-1276,80.0,16.0
36214,2008,80,1287,82,1369,78,A,0,20,50,...,10,7,31,2008-80-1287,2008-80-1369,12.0,2008-12.0-1287,2008-12.0-1369,,
13609,2004,124,1203,87,1247,72,A,0,30,50,...,3,1,21,2004-124-1203,2004-124-1247,18.0,2004-18.0-1203,2004-18.0-1247,,


In [21]:
# Getting ranking differential to use as a variable - negative values are underdog wins
results_df['rank_diff'] = results_df['lrank'] - results_df['wrank']

# Getting Score differential just in case
results_df['spread'] = results_df['wscore'] - results_df['lscore']

# Outcome column for if the lower teamID won, as that is the submission format for Kaggle
results_df['lowidwin'] = np.where((results_df['wteamid'] < results_df['lteamid']), 1, 0)

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wseason-Day-Team,lseason-Day-Team,weeknum,wseason-week-team,lseason-week-team,wrank,lrank,rank_diff,spread,lowidwin
7905,2004,57,1368,75,1288,61,N,0,25,44,...,2004-57-1368,2004-57-1288,9.0,2004-9.0-1368,2004-9.0-1288,,,,14,0
112714,2021,61,1266,64,1207,60,A,0,25,64,...,2021-61-1266,2021-61-1207,9.0,2021-9.0-1266,2021-9.0-1207,53.0,110.0,57.0,4,0
92121,2017,96,1363,78,1307,68,A,0,27,50,...,2017-96-1363,2017-96-1307,14.0,2017-14.0-1363,2017-14.0-1307,217.0,98.0,-119.0,10,0
6442,2004,28,1194,89,1198,87,H,0,32,79,...,2004-28-1194,2004-28-1198,4.0,2004-4.0-1194,2004-4.0-1198,223.0,314.0,91.0,2,1
1786,2003,65,1220,70,1206,67,H,0,29,50,...,2003-65-1220,2003-65-1206,10.0,2003-10.0-1220,2003-10.0-1206,303.0,76.0,-227.0,3,0


In [22]:
results_df.columns

Index(['season', 'daynum', 'wteamid', 'wscore', 'lteamid', 'lscore', 'wloc',
       'numot', 'wfgm', 'wfga', 'wfgm3', 'wfga3', 'wftm', 'wfta', 'wor', 'wdr',
       'wast', 'wto', 'wstl', 'wblk', 'wpf', 'lfgm', 'lfga', 'lfgm3', 'lfga3',
       'lftm', 'lfta', 'lor', 'ldr', 'last', 'lto', 'lstl', 'lblk', 'lpf',
       'wseason-Day-Team', 'lseason-Day-Team', 'weeknum', 'wseason-week-team',
       'lseason-week-team', 'wrank', 'lrank', 'rank_diff', 'spread',
       'lowidwin'],
      dtype='object')

In [23]:
# Getting shooting percentages

results_df['wfgpct'] = results_df['wfgm'] / results_df['wfga']
results_df['lfgpct'] = results_df['lfgm'] / results_df['lfga']

results_df['wfg3pct'] = results_df['wfgm3'] / results_df['wfga3']
results_df['lfg3pct'] = results_df['lfgm3'] / results_df['lfga3']

results_df['wefgpct'] = (results_df['wfgm'] + 0.5 * results_df['wfga3']) / results_df['wfga']
results_df['lefgpct'] = (results_df['lfgm'] + 0.5 * results_df['lfgm3']) / results_df['lfga']

results_df['wftpct'] = results_df['wftm'] / results_df['wfta']
results_df['lftpct'] = results_df['lftm'] / results_df['lfta']

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,spread,lowidwin,wfgpct,lfgpct,wfg3pct,lfg3pct,wefgpct,lefgpct,wftpct,lftpct
72619,2014,57,1338,58,1107,46,H,0,24,54,...,12,0,0.444444,0.313725,0.1,0.333333,0.537037,0.362745,0.5625,0.9
22510,2006,94,1139,72,1453,66,H,1,23,57,...,6,1,0.403509,0.5,0.333333,0.388889,0.666667,0.564815,0.761905,0.555556
38108,2008,123,1135,78,1217,62,A,0,28,46,...,16,1,0.608696,0.471698,0.526316,0.272727,0.815217,0.528302,0.923077,0.6
54069,2011,25,1424,69,1293,55,N,0,27,46,...,14,0,0.586957,0.347826,0.466667,0.352941,0.75,0.413043,0.727273,0.772727
94598,2018,17,1385,79,1304,56,H,0,30,69,...,23,0,0.434783,0.280702,0.294118,0.230769,0.557971,0.333333,0.7,0.75


In [24]:
# getting differentials between teams for stats

results_df['fgpct_diff'] = results_df['wfgpct'] - results_df['lfgpct']
results_df['fg3pct_diff'] = results_df['wfg3pct'] - results_df['lfg3pct']
results_df['efgpct_diff'] = results_df['wefgpct'] - results_df['lefgpct']
results_df['ftpct_diff'] = results_df['wftpct'] - results_df['lftpct']
results_df['or_diff'] = results_df['wor'] - results_df['lor']
results_df['dr_diff'] = results_df['wdr'] - results_df['ldr']
results_df['ast_diff'] = results_df['wast'] - results_df['last']
results_df['to_diff'] = results_df['wto'] - results_df['lto']
results_df['stl_diff'] = results_df['wstl'] - results_df['lstl']
results_df['blk_diff'] = results_df['wblk'] - results_df['lblk']
results_df['pf_diff'] = results_df['wpf'] - results_df['lpf']

results_df.columns

Index(['season', 'daynum', 'wteamid', 'wscore', 'lteamid', 'lscore', 'wloc',
       'numot', 'wfgm', 'wfga', 'wfgm3', 'wfga3', 'wftm', 'wfta', 'wor', 'wdr',
       'wast', 'wto', 'wstl', 'wblk', 'wpf', 'lfgm', 'lfga', 'lfgm3', 'lfga3',
       'lftm', 'lfta', 'lor', 'ldr', 'last', 'lto', 'lstl', 'lblk', 'lpf',
       'wseason-Day-Team', 'lseason-Day-Team', 'weeknum', 'wseason-week-team',
       'lseason-week-team', 'wrank', 'lrank', 'rank_diff', 'spread',
       'lowidwin', 'wfgpct', 'lfgpct', 'wfg3pct', 'lfg3pct', 'wefgpct',
       'lefgpct', 'wftpct', 'lftpct', 'fgpct_diff', 'fg3pct_diff',
       'efgpct_diff', 'ftpct_diff', 'or_diff', 'dr_diff', 'ast_diff',
       'to_diff', 'stl_diff', 'blk_diff', 'pf_diff'],
      dtype='object')

In [25]:
# Drop NA's for a nice and pretty DF

clean_results_df = results_df.dropna()

clean_results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,fg3pct_diff,efgpct_diff,ftpct_diff,or_diff,dr_diff,ast_diff,to_diff,stl_diff,blk_diff,pf_diff
792,2003,36,1113,76,1305,63,H,0,25,57,...,0.031579,0.02556,0.32381,-4,4,2,-8,3,-1,-7
793,2003,36,1116,72,1256,60,H,0,25,58,...,0.038363,0.16931,-0.117794,1,-2,4,-7,-1,0,-1
794,2003,36,1130,85,1235,78,A,0,28,57,...,0.052632,0.224071,-0.05335,-10,0,-1,-1,3,-5,3
795,2003,36,1139,75,1133,70,H,0,23,56,...,-0.478469,0.019133,0.235577,4,-1,-5,-4,3,0,-4
796,2003,36,1143,67,1364,60,H,0,18,42,...,0.076923,0.052857,0.233333,4,2,-8,-1,2,1,-16


# Model Assembly and Training

In [26]:
# X = ranks, y = win?

y = clean_results_df['lowidwin']
X = clean_results_df[['rank_diff', 'efgpct_diff', 'ftpct_diff', 'or_diff', 'dr_diff',
       'ast_diff', 'to_diff', 'stl_diff', 'blk_diff', 'pf_diff']]

In [27]:
# Do TTS 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(82959, 10)

In [28]:
# Set up the Balanced Random Forest

from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=250, random_state=1)
brf.fit(X_train, y_train)


BalancedRandomForestClassifier(n_estimators=250, random_state=1)

In [29]:
prob_predictions = brf.predict_proba(X_test)
print (prob_predictions)

[[0.892 0.108]
 [0.66  0.34 ]
 [0.388 0.612]
 ...
 [0.356 0.644]
 [0.936 0.064]
 [0.388 0.612]]


In [30]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.620705865335937


In [31]:
# Get importances and features
importances = brf.feature_importances_
cols = X.columns

# Store in a DataFrame
feature_importances_df = pd.DataFrame({'feature':cols, 'importance': importances})
feature_importances_df

Unnamed: 0,feature,importance
0,rank_diff,0.12665
1,efgpct_diff,0.146385
2,ftpct_diff,0.145825
3,or_diff,0.089197
4,dr_diff,0.082253
5,ast_diff,0.088167
6,to_diff,0.082906
7,stl_diff,0.076741
8,blk_diff,0.078299
9,pf_diff,0.083577


## Generating 2022 bracket predictions

In [32]:
# Now we need to get these predictions for all tournament games

# 1. Get season averages for teams meeting in the tournament GROUP BY WHERE SEASON = 2022, Get 64 tourney teams
# 2. Calculate the differentials to use as inputs for predictions for all potential matchups

In [33]:
# Getting Team stats for a season (uses clean_results so may exclude a few games bc of rankings)

# Grab results for a team where it was the winner
w_teams = clean_results_df.loc[:, ['season', 'wteamid', 'wfgm','wfga','wfgm3',
                                   'wfga3','wftm', 'wfta', 'wor', 'wdr','wast', 
                                   'wto', 'wstl', 'wblk', 'wpf']]

w_teams.columns = ['Season', 'TeamID', 'FGM','FGA','FGM3','FGA3','FTM','FTA','OR','DR',
             'Ast','TO','Stl','Blk','PF']

# Grab results for a team where it was the loser
l_teams = clean_results_df.loc[:, ['season', 'lteamid', 'lfgm', 'lfga', 'lfgm3',
                                   'lfga3','lftm', 'lfta', 'lor', 'ldr', 'last',
                                   'lto', 'lstl', 'lblk', 'lpf']]

l_teams.columns = ['Season', 'TeamID', 'FGM','FGA','FGM3','FGA3','FTM','FTA','OR','DR',
             'Ast','TO','Stl','Blk','PF']

# Merge those together then get shooting effeciency
detail = pd.concat([w_teams,l_teams])
detail['FGPct'] = detail.FGM / detail.FGA 
detail['FG3Pct'] = detail.FGM3 / detail.FGA3  
detail['eFGPct'] = (detail.FGM + 0.5 * detail.FGM3)/detail.FGA
detail['FTPct'] = detail.FTM  / detail.FTA  

# Bring it together by Season and Team
team_stats = detail.groupby(['Season','TeamID'])['FGM','FGA','FGM3','FGA3','FTM','FTA',
                                         'OR','DR','Ast','TO','Stl','Blk','PF',
                                          'FGPct', 'FG3Pct', 
                                         'eFGPct', 'FTPct']\
                                        .mean().fillna(0).reset_index()

del w_teams, l_teams, detail

team_stats.sample(5)



Unnamed: 0,Season,TeamID,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,FGPct,FG3Pct,eFGPct,FTPct
5010,2017,1343,25.090909,56.333333,9.545455,24.969697,11.151515,15.030303,8.424242,24.636364,13.848485,9.515152,6.606061,3.333333,15.636364,0.447919,0.377317,0.534996,0.741136
5786,2019,1417,26.945946,60.135135,7.864865,21.513514,15.162162,23.702703,11.216216,29.513514,13.243243,14.108108,5.513514,3.837838,17.810811,0.448927,0.367274,0.514915,0.631511
6788,2022,1367,25.166667,55.722222,8.666667,22.333333,12.722222,16.777778,6.333333,21.833333,13.722222,12.777778,5.888889,3.055556,19.666667,0.454047,0.397195,0.533016,0.752129
5591,2019,1219,23.962963,54.740741,5.592593,18.814815,12.962963,18.62963,11.62963,26.185185,12.666667,13.333333,5.074074,2.777778,16.148148,0.437173,0.297333,0.488878,0.691972
3756,2014,1137,22.821429,53.785714,6.5,16.571429,13.464286,18.107143,9.642857,25.928571,11.678571,10.928571,4.75,1.678571,16.892857,0.426219,0.392192,0.487136,0.72967


In [34]:
# Get just 2022 and then add rankings for final week

teams_2022 = team_stats[team_stats.Season == 2022]
teams_2022.sample(5)

Unnamed: 0,Season,TeamID,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,FGPct,FG3Pct,eFGPct,FTPct
6807,2022,1386,25.368421,59.052632,8.736842,26.263158,10.315789,15.157895,7.368421,25.421053,13.421053,13.210526,4.789474,2.631579,16.263158,0.429071,0.330925,0.503515,0.667411
6764,2022,1342,22.947368,56.526316,5.789474,20.421053,11.526316,16.473684,9.894737,19.684211,11.368421,12.368421,6.947368,2.421053,18.894737,0.406293,0.281369,0.458143,0.66718
6779,2022,1357,25.833333,59.388889,8.055556,23.333333,10.222222,14.222222,9.333333,21.666667,13.833333,12.666667,5.833333,2.388889,15.0,0.436408,0.340827,0.505056,0.714334
6573,2022,1146,27.111111,62.833333,6.666667,22.0,11.0,15.055556,8.611111,22.666667,13.333333,14.166667,6.222222,2.777778,17.166667,0.430736,0.298039,0.48457,0.733844
6552,2022,1123,25.578947,57.0,8.421053,23.526316,15.105263,21.263158,8.0,24.210526,12.789474,13.631579,5.421053,2.315789,18.736842,0.45311,0.378371,0.527516,0.704809


In [35]:
# Find last week of the season for rankings in 2022

SAG2022_ranks = SAG_ranks[SAG_ranks['season']==2022]
SAG2022_ranks['rankingweek'].max()

15.0

In [36]:
# Getting 2022 season-end rankings

end_ranks = SAG_ranks[SAG_ranks.rankingweek == 15]
end_ranks_2022 = end_ranks[end_ranks.season == 2022]
end_ranks_2022.head()

Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank,season-day-team,rankingweek,season-week-team
3318459,2022,100,SAG,1101,151,2022-100-1101,15.0,2022-15.0-1101
3318460,2022,100,SAG,1102,254,2022-100-1102,15.0,2022-15.0-1102
3318461,2022,100,SAG,1103,131,2022-100-1103,15.0,2022-15.0-1103
3318462,2022,100,SAG,1104,19,2022-100-1104,15.0,2022-15.0-1104
3318463,2022,100,SAG,1105,348,2022-100-1105,15.0,2022-15.0-1105


In [37]:
# Adding the 2022-end rankings into teams_2022

teams_2022 = teams_2022.merge(end_ranks_2022[['teamid','ordinalrank']], how='left', left_on='TeamID', right_on='teamid')
teams_2022.rename(columns={'ordinalrank': 'end_rank'},inplace=True)
teams_2022.head()

Unnamed: 0,Season,TeamID,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,...,TO,Stl,Blk,PF,FGPct,FG3Pct,eFGPct,FTPct,teamid,end_rank
0,2022,1101,25.058824,59.058824,6.882353,21.352941,17.176471,23.235294,8.588235,20.352941,...,12.352941,10.294118,1.529412,22.647059,0.424027,0.318485,0.482021,0.739285,1101,151
1,2022,1102,21.842105,49.263158,7.842105,21.894737,8.842105,13.631579,5.105263,19.736842,...,11.631579,6.263158,3.263158,18.473684,0.446294,0.35707,0.525694,0.639186,1102,254
2,2022,1103,23.555556,53.0,7.833333,22.5,13.777778,20.611111,8.0,24.055556,...,11.166667,5.0,2.777778,15.944444,0.445353,0.352604,0.520504,0.650718,1103,131
3,2022,1104,27.142857,62.428571,8.666667,29.047619,15.52381,21.619048,11.666667,24.285714,...,13.52381,7.285714,5.095238,18.52381,0.43762,0.300061,0.506297,0.709938,1104,19
4,2022,1105,21.263158,56.578947,3.736842,14.315789,11.0,16.157895,8.736842,21.736842,...,14.263158,8.315789,3.052632,16.842105,0.379589,0.264676,0.412969,0.687256,1105,348
