In [1]:
!pip install flask-sqlalchemy



In [2]:
# Import dependencies

# For data processing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# For ML
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# For Database connections
import sqlalchemy 
from sqlalchemy import create_engine, func
from sqlalchemy import inspect
# from sqlalchemy import session
import psycopg2
from sqlalchemy.ext.automap import automap_base

In [3]:
# Connect to Database - this will be done via config.py in the future
host = "group2022.cem6bfyajguw.us-east-2.rds.amazonaws.com"
database = "postgres"
user = 'postgres'
password = 'postgrespassword'

db_string = f"postgresql://{user}:{password}@{host}/{database}"

In [4]:
# Set up SQLAlchemy
engine = create_engine(db_string)
insp = inspect(engine)
insp.get_table_names()

['teams',
 'regular_season_detailed_results',
 'all_game_results',
 'conferences',
 'rankings_with_team_names',
 'conferences_with_team_names',
 'sag_system',
 'rankings']

In [5]:
# Teams raw DF
teams_df=pd.read_sql_table('teams', engine)
print(teams_df.shape)
teams_df.head()

(372, 4)


Unnamed: 0,teamid,teamname,firstd1season,lastd1season
0,1101,Abilene Chr,2014,2022
1,1102,Air Force,1985,2022
2,1103,Akron,1985,2022
3,1104,Alabama,1985,2022
4,1105,Alabama A&M,2000,2022


In [6]:
# Rankings raw DF
rankings_df=pd.read_sql_table('rankings', engine)
print(rankings_df.shape)
rankings_df.head()

(4521720, 5)


Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank
0,2010,113,BPI,1315,332
1,2010,113,BPI,1316,238
2,2010,113,BPI,1317,145
3,2010,113,BPI,1318,73
4,2010,113,BPI,1319,206


In [7]:
# Checking ranking systems
rankings_df['systemname'].value_counts()

SAG    122047
MOR    121057
POM    118566
DOK    105270
WLK     97047
        ...  
HRN       351
CRW       351
PMC       351
BP5       345
PH        326
Name: systemname, Length: 187, dtype: int64

In [8]:
# We need to get just SAG ranks for simplicity
SAG_ranks = rankings_df.loc[rankings_df['systemname']=='SAG']
SAG_ranks.sample(5)

Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank
2305430,2019,16,SAG,1402,229
817179,2013,133,SAG,1122,297
4185099,2009,36,SAG,1397,11
1271578,2015,93,SAG,1233,89
4271997,2009,93,SAG,1395,112


In [9]:
# Results raw DF

raw_results_df=pd.read_sql_table('regular_season_detailed_results', engine)
print(raw_results_df.shape)
raw_results_df.head()

(100423, 34)


Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lfga3,lftm,lfta,lor,ldr,last,lto,lstl,lblk,lpf
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [10]:
raw_results_df.columns

Index(['season', 'daynum', 'wteamid', 'wscore', 'lteamid', 'lscore', 'wloc',
       'numot', 'wfgm', 'wfga', 'wfgm3', 'wfga3', 'wftm', 'wfta', 'wor', 'wdr',
       'wast', 'wto', 'wstl', 'wblk', 'wpf', 'lfgm', 'lfga', 'lfgm3', 'lfga3',
       'lftm', 'lfta', 'lor', 'ldr', 'last', 'lto', 'lstl', 'lblk', 'lpf'],
      dtype='object')

In [11]:
# We want to drop years before 2003, because rankings go to 2003. Also drop extra 'ordinalrank'
results_df = raw_results_df[raw_results_df['season']>=2003]
results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lfga3,lftm,lfta,lor,ldr,last,lto,lstl,lblk,lpf
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


# Preprocessing & Feature Engineering


In [12]:
# Make a new column in SAG_Ranks called 'Season_Day_Team' that we will repeat in Results 

SAG_ranks['season-day-team'] = SAG_ranks['season'].map(str) + '-' + SAG_ranks['rankingdaynum'].map(str) + '-' + SAG_ranks['teamid'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank,season-day-team
6393,2010,113,SAG,1102,230,2010-113-1102
6394,2010,113,SAG,1103,101,2010-113-1103
6395,2010,113,SAG,1104,83,2010-113-1104
6396,2010,113,SAG,1105,334,2010-113-1105
6397,2010,113,SAG,1106,314,2010-113-1106


In [13]:
# Make new columns in SAG_Ranks called 'RankingWeek' and 'Season_Week_Team' because rankings only come out weekly

SAG_ranks['rankingweek'] = (SAG_ranks['rankingdaynum']/7).apply(np.ceil)

SAG_ranks['season-week-team'] = SAG_ranks['season'].map(str) + '-' + SAG_ranks['rankingweek'].map(str) + '-' + SAG_ranks['teamid'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank,season-day-team,rankingweek,season-week-team
6393,2010,113,SAG,1102,230,2010-113-1102,17.0,2010-17.0-1102
6394,2010,113,SAG,1103,101,2010-113-1103,17.0,2010-17.0-1103
6395,2010,113,SAG,1104,83,2010-113-1104,17.0,2010-17.0-1104
6396,2010,113,SAG,1105,334,2010-113-1105,17.0,2010-17.0-1105
6397,2010,113,SAG,1106,314,2010-113-1106,17.0,2010-17.0-1106


In [14]:
SAG_ranks.shape

(122047, 8)

In [15]:
# Make a new column in Results called 'WSeason_Day_Team' & 'LSeason_Day_Team' that we'll use for potential indices

results_df['wseason-Day-Team'] = results_df['season'].map(str) + '-' + results_df['daynum'].map(str) + '-' + results_df['wteamid'].map(str)
results_df['lseason-Day-Team'] = results_df['season'].map(str) + '-' + results_df['daynum'].map(str) + '-' + results_df['lteamid'].map(str)
results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lfta,lor,ldr,last,lto,lstl,lblk,lpf,wseason-Day-Team,lseason-Day-Team
0,2003,10,1104,68,1328,62,N,0,27,58,...,22,10,22,8,18,9,2,20,2003-10-1104,2003-10-1328
1,2003,10,1272,70,1393,63,N,0,26,62,...,20,20,25,7,12,8,6,16,2003-10-1272,2003-10-1393
2,2003,11,1266,73,1437,61,N,0,24,58,...,23,31,22,9,12,2,5,23,2003-11-1266,2003-11-1437
3,2003,11,1296,56,1457,50,N,0,18,38,...,15,17,20,9,19,4,3,23,2003-11-1296,2003-11-1457
4,2003,11,1400,77,1208,71,N,0,30,61,...,27,21,15,12,10,7,1,14,2003-11-1400,2003-11-1208


In [16]:
# Make a new column in Results called 'WSeason_Week_Team' & 'LSeason_Week_Team' that we'll use for loc to pull in OrdinalRank for winning and losing team

results_df['weeknum'] = (results_df['daynum']/7).apply(np.ceil)

results_df['wseason-week-team'] = results_df['season'].map(str) + '-' + results_df['weeknum'].map(str) + '-' + results_df['wteamid'].map(str)
results_df['lseason-week-team'] = results_df['season'].map(str) + '-' + results_df['weeknum'].map(str) + '-' + results_df['lteamid'].map(str)
results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,last,lto,lstl,lblk,lpf,wseason-Day-Team,lseason-Day-Team,weeknum,wseason-week-team,lseason-week-team
0,2003,10,1104,68,1328,62,N,0,27,58,...,8,18,9,2,20,2003-10-1104,2003-10-1328,2.0,2003-2.0-1104,2003-2.0-1328
1,2003,10,1272,70,1393,63,N,0,26,62,...,7,12,8,6,16,2003-10-1272,2003-10-1393,2.0,2003-2.0-1272,2003-2.0-1393
2,2003,11,1266,73,1437,61,N,0,24,58,...,9,12,2,5,23,2003-11-1266,2003-11-1437,2.0,2003-2.0-1266,2003-2.0-1437
3,2003,11,1296,56,1457,50,N,0,18,38,...,9,19,4,3,23,2003-11-1296,2003-11-1457,2.0,2003-2.0-1296,2003-2.0-1457
4,2003,11,1400,77,1208,71,N,0,30,61,...,12,10,7,1,14,2003-11-1400,2003-11-1208,2.0,2003-2.0-1400,2003-2.0-1208


In [17]:
# Merging winning team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['season-week-team','ordinalrank']], how='left', left_on='wseason-week-team', right_on='season-week-team').drop(columns=['season-week-team'])

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lto,lstl,lblk,lpf,wseason-Day-Team,lseason-Day-Team,weeknum,wseason-week-team,lseason-week-team,ordinalrank
65318,2014,122,1437,77,1462,70,A,0,23,49,...,10,5,2,22,2014-122-1437,2014-122-1462,18.0,2014-18.0-1437,2014-18.0-1462,3.0
62774,2014,68,1223,98,1322,97,H,0,35,65,...,12,9,4,22,2014-68-1223,2014-68-1322,10.0,2014-10.0-1223,2014-10.0-1322,335.0
29381,2008,75,1131,59,1392,52,A,0,19,52,...,12,5,4,20,2008-75-1131,2008-75-1392,11.0,2008-11.0-1131,2008-11.0-1392,269.0
51410,2012,59,1430,42,1214,40,N,0,15,61,...,11,5,13,15,2012-59-1430,2012-59-1214,9.0,2012-9.0-1430,2012-9.0-1214,238.0
75766,2016,110,1138,88,1132,74,H,0,29,59,...,18,6,1,26,2016-110-1138,2016-110-1132,16.0,2016-16.0-1138,2016-16.0-1132,143.0


In [18]:
# Rename OrdinalRank to WRank
results_df.rename(columns={'ordinalrank': 'wrank'},inplace=True)
results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lto,lstl,lblk,lpf,wseason-Day-Team,lseason-Day-Team,weeknum,wseason-week-team,lseason-week-team,wrank
103325,2022,11,1458,72,1453,34,H,0,24,64,...,12,3,4,20,2022-11-1458,2022-11-1453,2.0,2022-2.0-1458,2022-2.0-1453,
9871,2004,115,1194,95,1144,80,H,0,30,73,...,10,9,3,23,2004-115-1194,2004-115-1144,17.0,2004-17.0-1194,2004-17.0-1144,262.0
96953,2020,82,1202,78,1154,54,A,0,27,63,...,21,5,6,20,2020-82-1202,2020-82-1154,12.0,2020-12.0-1202,2020-12.0-1154,93.0
18343,2006,82,1362,62,1339,50,H,0,25,63,...,16,1,2,17,2006-82-1362,2006-82-1339,12.0,2006-12.0-1362,2006-12.0-1339,176.0
55502,2013,18,1153,78,1235,70,N,0,25,60,...,18,6,2,20,2013-18-1153,2013-18-1235,3.0,2013-3.0-1153,2013-3.0-1235,44.0


In [19]:
# Merging losing team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['season-week-team','ordinalrank']], how='left', left_on='lseason-week-team', right_on='season-week-team').drop(columns=['season-week-team'])

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lstl,lblk,lpf,wseason-Day-Team,lseason-Day-Team,weeknum,wseason-week-team,lseason-week-team,wrank,ordinalrank
7813,2004,56,1395,99,1212,71,H,0,34,55,...,9,0,29,2004-56-1395,2004-56-1212,8.0,2004-8.0-1395,2004-8.0-1212,207.0,310.0
113176,2021,74,1421,83,1149,75,H,0,31,58,...,4,3,0,2021-74-1421,2021-74-1149,11.0,2021-11.0-1421,2021-11.0-1149,244.0,321.0
89943,2017,47,1338,83,1349,73,H,0,29,45,...,8,2,18,2017-47-1338,2017-47-1349,7.0,2017-7.0-1338,2017-7.0-1349,70.0,109.0
56267,2011,80,1404,79,1369,69,H,0,27,65,...,3,10,20,2011-80-1404,2011-80-1369,12.0,2011-12.0-1404,2011-12.0-1369,318.0,319.0
19121,2005,129,1438,66,1274,65,N,0,22,50,...,3,0,23,2005-129-1438,2005-129-1274,19.0,2005-19.0-1438,2005-19.0-1274,78.0,58.0


In [20]:
# Rename OrdinalRank to LRank
results_df.rename(columns={'ordinalrank': 'lrank'},inplace=True)
results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lstl,lblk,lpf,wseason-Day-Team,lseason-Day-Team,weeknum,wseason-week-team,lseason-week-team,wrank,lrank
62571,2012,91,1459,82,1441,56,H,0,32,54,...,5,1,14,2012-91-1459,2012-91-1441,13.0,2012-13.0-1459,2012-13.0-1441,183.0,244.0
67792,2013,79,1191,66,1133,56,H,0,27,54,...,5,2,17,2013-79-1191,2013-79-1133,12.0,2013-12.0-1191,2013-12.0-1133,120.0,141.0
42538,2009,75,1151,82,1441,75,H,0,25,57,...,9,4,33,2009-75-1151,2009-75-1441,11.0,2009-11.0-1151,2009-11.0-1441,215.0,179.0
113964,2021,94,1429,69,1201,53,A,0,26,52,...,6,3,0,2021-94-1429,2021-94-1201,14.0,2021-14.0-1429,2021-14.0-1201,55.0,186.0
16414,2005,73,1264,78,1310,72,H,0,27,62,...,6,4,20,2005-73-1264,2005-73-1310,11.0,2005-11.0-1264,2005-11.0-1310,158.0,119.0


In [21]:
# Getting ranking differential to use as a variable - negative values are underdog wins
results_df['rank_diff'] = results_df['lrank'] - results_df['wrank']

# Getting Score differential just in case
results_df['spread'] = results_df['wscore'] - results_df['lscore']

# Outcome column for if the lower teamID won, as that is the submission format for Kaggle
results_df['lowidwin'] = np.where((results_df['wteamid'] < results_df['lteamid']), 1, 0)

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wseason-Day-Team,lseason-Day-Team,weeknum,wseason-week-team,lseason-week-team,wrank,lrank,rank_diff,spread,lowidwin
41686,2009,61,1398,69,1184,63,H,0,27,49,...,2009-61-1398,2009-61-1184,9.0,2009-9.0-1398,2009-9.0-1184,280.0,131.0,-149.0,6,0
1279,2003,54,1113,75,1304,63,H,0,28,61,...,2003-54-1113,2003-54-1304,8.0,2003-8.0-1113,2003-8.0-1304,56.0,59.0,3.0,12,1
34518,2008,63,1399,73,1287,55,H,0,24,50,...,2008-63-1399,2008-63-1287,9.0,2008-9.0-1399,2008-9.0-1287,298.0,290.0,-8.0,18,0
258,2003,22,1166,80,1323,75,N,0,31,50,...,2003-22-1166,2003-22-1323,4.0,2003-4.0-1166,2003-4.0-1323,,,,5,1
39571,2009,25,1216,50,1312,38,N,0,15,42,...,2009-25-1216,2009-25-1312,4.0,2009-4.0-1216,2009-4.0-1312,306.0,345.0,39.0,12,1


In [22]:
results_df.columns

Index(['season', 'daynum', 'wteamid', 'wscore', 'lteamid', 'lscore', 'wloc',
       'numot', 'wfgm', 'wfga', 'wfgm3', 'wfga3', 'wftm', 'wfta', 'wor', 'wdr',
       'wast', 'wto', 'wstl', 'wblk', 'wpf', 'lfgm', 'lfga', 'lfgm3', 'lfga3',
       'lftm', 'lfta', 'lor', 'ldr', 'last', 'lto', 'lstl', 'lblk', 'lpf',
       'wseason-Day-Team', 'lseason-Day-Team', 'weeknum', 'wseason-week-team',
       'lseason-week-team', 'wrank', 'lrank', 'rank_diff', 'spread',
       'lowidwin'],
      dtype='object')

In [23]:
# Getting shooting percentages

results_df['wfgpct'] = results_df['wfgm'] / results_df['wfga']
results_df['lfgpct'] = results_df['lfgm'] / results_df['lfga']

results_df['wfg3pct'] = results_df['wfgm3'] / results_df['wfga3']
results_df['lfg3pct'] = results_df['lfgm3'] / results_df['lfga3']

results_df['wefgpct'] = (results_df['wfgm'] + 0.5 * results_df['wfga3']) / results_df['wfga']
results_df['lefgpct'] = (results_df['lfgm'] + 0.5 * results_df['lfgm3']) / results_df['lfga']

results_df['wftpct'] = results_df['wftm'] / results_df['wfta']
results_df['lftpct'] = results_df['lftm'] / results_df['lfta']

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,spread,lowidwin,wfgpct,lfgpct,wfg3pct,lfg3pct,wefgpct,lefgpct,wftpct,lftpct
69147,2013,108,1149,92,1440,69,H,0,35,59,...,23,1,0.59322,0.393939,0.56,0.314286,0.805085,0.477273,0.533333,0.75
39516,2009,23,1135,80,1119,73,H,0,26,48,...,7,0,0.541667,0.483333,0.5,0.352941,0.75,0.533333,0.692308,0.6
48889,2010,39,1269,81,1212,58,H,0,32,70,...,23,0,0.457143,0.40625,0.181818,0.2,0.614286,0.4375,0.619048,0.2
22405,2006,91,1105,99,1411,75,H,0,33,68,...,24,1,0.485294,0.461538,0.44,0.444444,0.669118,0.538462,0.594595,0.475
115894,2021,132,1159,85,1259,72,H,0,31,59,...,13,1,0.525424,0.467742,0.608696,0.227273,0.720339,0.508065,0.642857,0.5625


In [24]:
# getting differentials between teams for stats

results_df['fgpct_diff'] = results_df['wfgpct'] - results_df['lfgpct']
results_df['fg3pct_diff'] = results_df['wfg3pct'] - results_df['lfg3pct']
results_df['efgpct_diff'] = results_df['wefgpct'] - results_df['lefgpct']
results_df['ftpct_diff'] = results_df['wftpct'] - results_df['lftpct']
results_df['or_diff'] = results_df['wor'] - results_df['lor']
results_df['dr_diff'] = results_df['wdr'] - results_df['ldr']
results_df['ast_diff'] = results_df['wast'] - results_df['last']
results_df['to_diff'] = results_df['wto'] - results_df['lto']
results_df['stl_diff'] = results_df['wstl'] - results_df['lstl']
results_df['blk_diff'] = results_df['wblk'] - results_df['lblk']
results_df['pf_diff'] = results_df['wpf'] - results_df['lpf']

results_df.columns

Index(['season', 'daynum', 'wteamid', 'wscore', 'lteamid', 'lscore', 'wloc',
       'numot', 'wfgm', 'wfga', 'wfgm3', 'wfga3', 'wftm', 'wfta', 'wor', 'wdr',
       'wast', 'wto', 'wstl', 'wblk', 'wpf', 'lfgm', 'lfga', 'lfgm3', 'lfga3',
       'lftm', 'lfta', 'lor', 'ldr', 'last', 'lto', 'lstl', 'lblk', 'lpf',
       'wseason-Day-Team', 'lseason-Day-Team', 'weeknum', 'wseason-week-team',
       'lseason-week-team', 'wrank', 'lrank', 'rank_diff', 'spread',
       'lowidwin', 'wfgpct', 'lfgpct', 'wfg3pct', 'lfg3pct', 'wefgpct',
       'lefgpct', 'wftpct', 'lftpct', 'fgpct_diff', 'fg3pct_diff',
       'efgpct_diff', 'ftpct_diff', 'or_diff', 'dr_diff', 'ast_diff',
       'to_diff', 'stl_diff', 'blk_diff', 'pf_diff'],
      dtype='object')

In [25]:
# Drop NA's for a nice and pretty DF

clean_results_df = results_df.dropna()

clean_results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,fg3pct_diff,efgpct_diff,ftpct_diff,or_diff,dr_diff,ast_diff,to_diff,stl_diff,blk_diff,pf_diff
792,2003,36,1113,76,1305,63,H,0,25,57,...,0.031579,0.02556,0.32381,-4,4,2,-8,3,-1,-7
793,2003,36,1116,72,1256,60,H,0,25,58,...,0.038363,0.16931,-0.117794,1,-2,4,-7,-1,0,-1
794,2003,36,1130,85,1235,78,A,0,28,57,...,0.052632,0.224071,-0.05335,-10,0,-1,-1,3,-5,3
795,2003,36,1139,75,1133,70,H,0,23,56,...,-0.478469,0.019133,0.235577,4,-1,-5,-4,3,0,-4
796,2003,36,1143,67,1364,60,H,0,18,42,...,0.076923,0.052857,0.233333,4,2,-8,-1,2,1,-16


# Model Assembly and Training

In [26]:
# X = ranks, y = win?

y = clean_results_df['lowidwin']
X = clean_results_df[['rank_diff', 'efgpct_diff', 'ftpct_diff', 'or_diff', 'dr_diff',
       'ast_diff', 'to_diff', 'stl_diff', 'blk_diff', 'pf_diff']]

In [27]:
# Do TTS 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(82959, 10)

In [28]:
# Set up the Balanced Random Forest

from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=250, random_state=1)
brf.fit(X_train, y_train)


BalancedRandomForestClassifier(n_estimators=250, random_state=1)

In [29]:
prob_predictions = brf.predict_proba(X_test)
print (prob_predictions)

[[0.892 0.108]
 [0.66  0.34 ]
 [0.388 0.612]
 ...
 [0.356 0.644]
 [0.936 0.064]
 [0.388 0.612]]


In [30]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.620705865335937


In [31]:
# Get importances and features
importances = brf.feature_importances_
cols = X.columns

# Store in a DataFrame
feature_importances_df = pd.DataFrame({'feature':cols, 'importance': importances})
feature_importances_df

Unnamed: 0,feature,importance
0,rank_diff,0.12665
1,efgpct_diff,0.146385
2,ftpct_diff,0.145825
3,or_diff,0.089197
4,dr_diff,0.082253
5,ast_diff,0.088167
6,to_diff,0.082906
7,stl_diff,0.076741
8,blk_diff,0.078299
9,pf_diff,0.083577


# Generating 2022 bracket predictions

In [32]:
# Now we need to get these predictions for all tournament games

# 1. Get season averages for teams meeting in the tournament GROUP BY WHERE SEASON = 2022, Get 64 tourney teams
# 2. Calculate the differentials to use as inputs for predictions for all potential matchups

## Getting season average stats by team

In [33]:
# Getting Team stats for a season (uses clean_results so may exclude a few games bc of rankings)

# Grab results for a team where it was the winner
w_teams = clean_results_df.loc[:, ['season', 'wteamid', 'wfgm','wfga','wfgm3',
                                   'wfga3','wftm', 'wfta', 'wor', 'wdr','wast', 
                                   'wto', 'wstl', 'wblk', 'wpf']]

w_teams.columns = ['Season', 'TeamID', 'FGM','FGA','FGM3','FGA3','FTM','FTA','OR','DR',
             'Ast','TO','Stl','Blk','PF']

# Grab results for a team where it was the loser
l_teams = clean_results_df.loc[:, ['season', 'lteamid', 'lfgm', 'lfga', 'lfgm3',
                                   'lfga3','lftm', 'lfta', 'lor', 'ldr', 'last',
                                   'lto', 'lstl', 'lblk', 'lpf']]

l_teams.columns = ['Season', 'TeamID', 'FGM','FGA','FGM3','FGA3','FTM','FTA','OR','DR',
             'Ast','TO','Stl','Blk','PF']

# Merge those together then get shooting effeciency
detail = pd.concat([w_teams,l_teams])
detail['FGPct'] = detail.FGM / detail.FGA 
detail['FG3Pct'] = detail.FGM3 / detail.FGA3  
detail['eFGPct'] = (detail.FGM + 0.5 * detail.FGM3)/detail.FGA
detail['FTPct'] = detail.FTM  / detail.FTA  

# Bring it together by Season and Team
team_stats = detail.groupby(['Season','TeamID'])['FGM','FGA','FGM3','FGA3','FTM','FTA',
                                         'OR','DR','Ast','TO','Stl','Blk','PF',
                                          'FGPct', 'FG3Pct', 
                                         'eFGPct', 'FTPct']\
                                        .mean().fillna(0).reset_index()

del w_teams, l_teams, detail

team_stats.sample(5)



Unnamed: 0,Season,TeamID,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,FGPct,FG3Pct,eFGPct,FTPct
5562,2019,1189,25.107143,57.642857,10.071429,28.857143,9.25,13.25,7.75,24.785714,14.107143,12.285714,4.107143,1.821429,16.785714,0.435897,0.348486,0.523465,0.71514
2152,2009,1269,24.45,60.8,8.0,25.525,9.925,14.525,11.125,24.0,13.4,13.275,6.475,5.175,17.0,0.403739,0.314841,0.4704,0.668998
1445,2007,1243,23.741935,53.612903,6.548387,18.645161,15.709677,22.451613,12.0,22.387097,15.290323,14.225806,6.064516,3.258065,17.903226,0.443342,0.347726,0.504838,0.6842
6444,2021,1379,23.44,55.12,4.92,16.2,10.44,14.76,6.08,24.56,12.6,13.72,6.0,2.44,4.4,0.426644,0.291976,0.471588,0.691492
4062,2014,1451,24.060606,49.575758,7.575758,18.818182,17.121212,22.818182,8.030303,24.878788,13.484848,11.848485,3.545455,3.333333,17.727273,0.486463,0.401983,0.563788,0.752104


In [34]:
# Get just 2022 and then add rankings for final week

teams_2022 = team_stats[team_stats.Season == 2022]
teams_2022.sample(5)

Unnamed: 0,Season,TeamID,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,FGPct,FG3Pct,eFGPct,FTPct
6737,2022,1314,27.380952,61.52381,8.428571,21.904762,13.857143,18.047619,9.571429,28.428571,14.333333,11.52381,5.142857,3.714286,14.571429,0.446994,0.384497,0.515554,0.758335
6630,2022,1204,23.375,54.1875,5.375,19.1875,10.875,16.5625,8.8125,23.125,10.9375,14.8125,6.5625,2.875,18.8125,0.431772,0.284029,0.481661,0.664786
6621,2022,1195,26.571429,60.047619,9.904762,28.761905,11.904762,17.333333,9.666667,24.333333,15.571429,13.52381,6.857143,4.761905,13.952381,0.441124,0.344554,0.523871,0.680863
6546,2022,1115,21.315789,52.631579,5.526316,20.315789,12.105263,17.947368,6.684211,18.473684,11.052632,13.947368,6.263158,2.736842,17.052632,0.402026,0.270616,0.455055,0.668017
6545,2022,1114,22.1875,55.0,6.8125,21.1875,14.0,18.75,7.0625,20.1875,12.6875,13.0,6.1875,2.3125,17.5,0.401928,0.318036,0.462877,0.725596


In [35]:
# Find last week of the season for rankings in 2022

SAG2022_ranks = SAG_ranks[SAG_ranks['season']==2022]
SAG2022_ranks['rankingweek'].max()

15.0

In [36]:
# Getting 2022 season-end rankings

end_ranks = SAG_ranks[SAG_ranks.rankingweek == 15]
end_ranks_2022 = end_ranks[end_ranks.season == 2022]
end_ranks_2022.head()

Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank,season-day-team,rankingweek,season-week-team
3318459,2022,100,SAG,1101,151,2022-100-1101,15.0,2022-15.0-1101
3318460,2022,100,SAG,1102,254,2022-100-1102,15.0,2022-15.0-1102
3318461,2022,100,SAG,1103,131,2022-100-1103,15.0,2022-15.0-1103
3318462,2022,100,SAG,1104,19,2022-100-1104,15.0,2022-15.0-1104
3318463,2022,100,SAG,1105,348,2022-100-1105,15.0,2022-15.0-1105


In [37]:
# Adding the 2022-end rankings into teams_2022

teams_2022 = teams_2022.merge(end_ranks_2022[['teamid','ordinalrank']], how='left', left_on='TeamID', right_on='teamid').drop(['teamid'], axis=1)
teams_2022.rename(columns={'ordinalrank': 'end_rank'},inplace=True)

teams_2022.head()

Unnamed: 0,Season,TeamID,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,FGPct,FG3Pct,eFGPct,FTPct,end_rank
0,2022,1101,25.058824,59.058824,6.882353,21.352941,17.176471,23.235294,8.588235,20.352941,13.941176,12.352941,10.294118,1.529412,22.647059,0.424027,0.318485,0.482021,0.739285,151
1,2022,1102,21.842105,49.263158,7.842105,21.894737,8.842105,13.631579,5.105263,19.736842,13.210526,11.631579,6.263158,3.263158,18.473684,0.446294,0.35707,0.525694,0.639186,254
2,2022,1103,23.555556,53.0,7.833333,22.5,13.777778,20.611111,8.0,24.055556,12.0,11.166667,5.0,2.777778,15.944444,0.445353,0.352604,0.520504,0.650718,131
3,2022,1104,27.142857,62.428571,8.666667,29.047619,15.52381,21.619048,11.666667,24.285714,13.714286,13.52381,7.285714,5.095238,18.52381,0.43762,0.300061,0.506297,0.709938,19
4,2022,1105,21.263158,56.578947,3.736842,14.315789,11.0,16.157895,8.736842,21.736842,9.421053,14.263158,8.315789,3.052632,16.842105,0.379589,0.264676,0.412969,0.687256,348


## Readying a submission file

In [38]:
# Get submission file
sub_df = pd.read_csv('Resources/MSampleSubmissionStage2.csv').drop(['Pred'],axis=1)
sub_df['Season'] = sub_df['ID'].map(lambda x: int(x.split('_')[0]))
sub_df['TeamA'] = sub_df['ID'].map(lambda x: int(x.split('_')[1]))
sub_df['TeamB'] = sub_df['ID'].map(lambda x: int(x.split('_')[2]))

sub_df.head()

Unnamed: 0,ID,Season,TeamA,TeamB
0,2022_1103_1104,2022,1103,1104
1,2022_1103_1112,2022,1103,1112
2,2022_1103_1116,2022,1103,1116
3,2022_1103_1120,2022,1103,1120
4,2022_1103_1124,2022,1103,1124


In [106]:
# Team A & B Lists

team_A = pd.DataFrame(sub_df[['ID','TeamA']].copy())
team_B = pd.DataFrame(sub_df[['ID','TeamB']].copy())

In [107]:
# Get Team A stats

merge_cols = ['TeamID', 'FGM','FGA','FGM3','FGA3','FTM','FTA','OR','DR',
             'Ast','TO','Stl','Blk','PF', 'end_rank']

team_A = team_A.merge(teams_2022[merge_cols], how='left', left_on='TeamA', right_on='TeamID').drop(['TeamID'], axis=1)

print(team_A.shape)
team_A.sample(5)

(2278, 16)


Unnamed: 0,ID,TeamA,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,end_rank
72,2022_1104_1136,1104,27.142857,62.428571,8.666667,29.047619,15.52381,21.619048,11.666667,24.285714,13.714286,13.52381,7.285714,5.095238,18.52381,19
1209,2022_1231_1277,1231,26.5,56.25,6.4,17.8,13.2,19.25,8.0,25.95,15.0,12.55,6.05,5.25,16.75,27
1523,2022_1260_1397,1260,25.263158,53.052632,8.421053,22.526316,12.421053,16.894737,5.894737,23.368421,13.736842,11.052632,5.947368,2.684211,16.105263,35
1342,2022_1240_1286,1240,25.052632,54.947368,9.0,23.0,13.210526,19.947368,8.526316,25.684211,13.263158,12.421053,6.157895,2.789474,14.894737,143
939,2022_1174_1400,1174,26.35,55.85,8.0,22.65,13.6,18.15,7.45,22.55,13.6,12.6,6.05,3.65,15.75,155


In [108]:
# Get shooting percentages and rename columns

team_A['FGPct'] = team_A.FGM / team_A.FGA 
team_A['FG3Pct'] = team_A.FGM3 / team_A.FGA3  
team_A['eFGPct'] = (team_A.FGM + 0.5 * team_A.FGM3)/team_A.FGA
team_A['FTPct'] = team_A.FTM  / team_A.FTA  

# Rename to be A_''
team_A.columns = ['ID','TeamA', 'A_FGM','A_FGA','A_FGM3','A_FGA3','A_FTM','A_FTA','A_OR','A_DR',
             'A_Ast','A_TO','A_Stl','A_Blk','A_PF', 'A_rank', 'A_FGPct', 'A_FG3Pct', 'A_eFGPct','A_FTPct']

print(team_A.shape)
team_A.sample(5)

(2278, 20)


Unnamed: 0,ID,TeamA,A_FGM,A_FGA,A_FGM3,A_FGA3,A_FTM,A_FTA,A_OR,A_DR,A_Ast,A_TO,A_Stl,A_Blk,A_PF,A_rank,A_FGPct,A_FG3Pct,A_eFGPct,A_FTPct
1489,2022_1255_1417,1255,25.764706,57.294118,7.941176,21.588235,14.176471,19.764706,9.470588,22.941176,13.0,11.764706,7.647059,2.235294,16.529412,191,0.449692,0.367847,0.518994,0.717262
274,2022_1120_1209,1120,28.571429,63.666667,8.142857,25.190476,14.619048,19.714286,10.095238,25.47619,14.571429,11.238095,8.904762,7.952381,18.095238,9,0.448766,0.323251,0.512715,0.741546
822,2022_1168_1345,1168,23.4375,55.25,5.4375,16.9375,16.5625,21.0,8.1875,22.9375,9.5625,11.5,6.875,2.125,16.5625,164,0.424208,0.321033,0.473416,0.78869
1250,2022_1234_1266,1234,28.75,64.55,8.3,24.9,14.9,19.7,10.8,22.75,15.5,8.5,7.95,4.45,16.35,21,0.445391,0.333333,0.509682,0.756345
1070,2022_1211_1293,1211,33.777778,63.444444,9.111111,23.666667,14.5,20.555556,9.055556,31.222222,18.722222,12.0,7.055556,5.722222,16.0,1,0.532399,0.384977,0.604203,0.705405


In [109]:
# Team B Stats

merge_cols = ['TeamID', 'FGM','FGA','FGM3','FGA3','FTM','FTA','OR','DR',
             'Ast','TO','Stl','Blk','PF', 'end_rank']

# Rename to be B_''
team_B = team_B.merge(teams_2022[merge_cols], how='left', left_on='TeamB', right_on='TeamID').drop(['TeamID'], axis=1)

print(team_B.shape)
team_B.sample(5)

(2278, 16)


Unnamed: 0,ID,TeamB,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,end_rank
442,2022_1129_1437,1437,24.47619,56.047619,8.952381,25.428571,14.095238,16.857143,9.0,23.238095,12.285714,9.142857,6.0,2.380952,15.285714,10
1113,2022_1222_1272,1272,27.166667,58.555556,6.166667,18.0,14.888889,22.5,11.166667,23.944444,15.722222,16.055556,8.444444,5.444444,19.277778,42
119,2022_1104_1397,1397,26.25,60.2,8.7,26.15,11.6,16.75,9.95,23.0,16.85,12.3,9.65,4.3,16.9,13
1659,2022_1274_1350,1350,26.318182,58.454545,8.545455,25.363636,11.863636,16.590909,6.636364,23.818182,15.681818,9.363636,8.409091,2.227273,13.636364,77
1085,2022_1211_1389,1389,23.0625,54.75,6.1875,17.0,15.25,21.4375,9.125,23.1875,13.0,13.1875,8.0,4.5625,19.5625,173


In [110]:
# Get shooting percentages and rename columns

team_B['FGPct'] = team_B.FGM / team_B.FGA 
team_B['FG3Pct'] = team_B.FGM3 / team_B.FGA3  
team_B['eFGPct'] = (team_B.FGM + 0.5 * team_B.FGM3)/team_B.FGA
team_B['FTPct'] = team_B.FTM  / team_B.FTA  

team_B.columns = ['ID','TeamB', 'B_FGM','B_FGA','B_FGM3','B_FGA3','B_FTM','B_FTA','B_OR','B_DR',
             'B_Ast','B_TO','B_Stl','B_Blk','B_PF', 'B_rank', 'B_FGPct', 'B_FG3Pct', 'B_eFGPct','B_FTPct']

print(team_B.shape)
team_B.sample(5)

(2278, 20)


Unnamed: 0,ID,TeamB,B_FGM,B_FGA,B_FGM3,B_FGA3,B_FTM,B_FTA,B_OR,B_DR,B_Ast,B_TO,B_Stl,B_Blk,B_PF,B_rank,B_FGPct,B_FG3Pct,B_eFGPct,B_FTPct
1619,2022_1272_1314,1314,27.380952,61.52381,8.428571,21.904762,13.857143,18.047619,9.571429,28.428571,14.333333,11.52381,5.142857,3.714286,14.571429,33,0.445046,0.384783,0.513545,0.76781
1370,2022_1240_1439,1439,25.8,54.35,9.35,22.4,9.35,12.7,7.65,21.3,14.5,10.85,5.05,3.05,15.0,38,0.474701,0.417411,0.560718,0.73622
1457,2022_1246_1463,1463,25.388889,58.444444,7.388889,21.888889,14.444444,19.5,7.888889,25.5,12.444444,12.833333,5.722222,3.166667,19.0,132,0.434411,0.337563,0.497624,0.740741
1284,2022_1234_1458,1458,25.210526,59.473684,7.052632,21.894737,13.842105,18.526316,7.789474,24.105263,11.578947,8.105263,4.842105,2.631579,16.947368,24,0.423894,0.322115,0.483186,0.747159
1755,2022_1286_1326,1326,26.1875,54.6875,8.6875,23.3125,13.75,18.0,7.625,25.5,14.4375,11.3125,4.5,4.6875,16.6875,17,0.478857,0.372654,0.558286,0.763889


In [111]:
# Reminder of features, so we know what columns to keep for the merge and in what order

# 'rank_diff', 'efgpct_diff', 'ftpct_diff', 'or_diff', 'dr_diff',
#        'ast_diff', 'to_diff', 'stl_diff', 'blk_diff', 'pf_diff'

print(feature_importances_df['feature'])

0      rank_diff
1    efgpct_diff
2     ftpct_diff
3        or_diff
4        dr_diff
5       ast_diff
6        to_diff
7       stl_diff
8       blk_diff
9        pf_diff
Name: feature, dtype: object


In [120]:
# Merge these two stat tables, so we can take the differences and run our model
team_AB = pd.DataFrame(sub_df['ID'].copy())

# Merge with team_A

merge_A = ['ID','TeamA','A_rank','A_eFGPct', 'A_FTPct', 'A_OR','A_DR','A_Ast','A_TO','A_Stl','A_Blk','A_PF']

team_AB = team_AB.merge(team_A[merge_A], how='left', left_on='ID', right_on='ID')
# Merge with team_B

merge_B = ['ID','TeamB','B_rank','B_eFGPct', 'B_FTPct', 'B_OR','B_DR','B_Ast','B_TO','B_Stl','B_Blk','B_PF']

team_AB = team_AB.merge(team_B[merge_B], how='left', left_on='ID', right_on='ID')

# Check the new team_AB. This contains all potential March Madness 2022 matchups, 
# and the teams' average stats and ending rank

team_AB.head()

Unnamed: 0,ID,TeamA,A_rank,A_eFGPct,A_FTPct,A_OR,A_DR,A_Ast,A_TO,A_Stl,...,B_rank,B_eFGPct,B_FTPct,B_OR,B_DR,B_Ast,B_TO,B_Stl,B_Blk,B_PF
0,2022_1103_1104,1103,131,0.518344,0.668464,8.0,24.055556,12.0,11.166667,5.0,...,19,0.504195,0.718062,11.666667,24.285714,13.714286,13.52381,7.285714,5.095238,18.52381
1,2022_1103_1112,1103,131,0.518344,0.668464,8.0,24.055556,12.0,11.166667,5.0,...,2,0.54252,0.725995,11.6,28.8,19.6,13.6,6.65,6.2,16.45
2,2022_1103_1116,1103,131,0.518344,0.668464,8.0,24.055556,12.0,11.166667,5.0,...,22,0.502342,0.742268,9.666667,25.904762,14.904762,12.380952,8.095238,4.47619,17.190476
3,2022_1103_1120,1103,131,0.518344,0.668464,8.0,24.055556,12.0,11.166667,5.0,...,9,0.512715,0.741546,10.095238,25.47619,14.571429,11.238095,8.904762,7.952381,18.095238
4,2022_1103_1124,1103,131,0.518344,0.668464,8.0,24.055556,12.0,11.166667,5.0,...,7,0.537773,0.677233,11.285714,22.238095,15.904762,12.571429,9.761905,3.095238,15.714286


In [132]:
# Now, take differentials (A-B) so it aligns with our model features

team_AB_diffs = pd.DataFrame(team_AB[['ID','TeamA','TeamB']]).copy()

# 'rank_diff', 'efgpct_diff', 'ftpct_diff', 'or_diff', 'dr_diff',
#        'ast_diff', 'to_diff', 'stl_diff', 'blk_diff', 'pf_diff'

team_AB_diffs['rank_diff'] = team_AB['A_rank'] - team_AB['B_rank']
team_AB_diffs['efgpct_diff'] = team_AB['A_eFGPct'] - team_AB['B_eFGPct']
team_AB_diffs['ftpct_diff'] = team_AB['A_FTPct'] - team_AB['B_FTPct']
team_AB_diffs['or_diff'] = team_AB['A_OR'] - team_AB['B_OR']
team_AB_diffs['dr_diff'] = team_AB['A_DR'] - team_AB['B_DR']
team_AB_diffs['ast_diff'] = team_AB['A_Ast'] - team_AB['B_Ast']
team_AB_diffs['to_diff'] = team_AB['A_TO'] - team_AB['B_TO']
team_AB_diffs['stl_diff'] = team_AB['A_Stl'] - team_AB['B_Stl']
team_AB_diffs['blk_diff'] = team_AB['A_Blk'] - team_AB['B_Blk']
team_AB_diffs['pf_diff'] = team_AB['A_PF'] - team_AB['B_PF']
team_AB_diffs['lowIDwin'] = ""

team_AB_diffs.head()

Unnamed: 0,ID,TeamA,TeamB,rank_diff,efgpct_diff,ftpct_diff,or_diff,dr_diff,ast_diff,to_diff,stl_diff,blk_diff,pf_diff,lowIDwin
0,2022_1103_1104,1103,1104,112,0.014149,-0.049598,-3.666667,-0.230159,-1.714286,-2.357143,-2.285714,-2.31746,-2.579365,
1,2022_1103_1112,1103,1112,129,-0.024176,-0.057532,-3.6,-4.744444,-7.6,-2.433333,-1.65,-3.422222,-0.505556,
2,2022_1103_1116,1103,1116,109,0.016002,-0.073804,-1.666667,-1.849206,-2.904762,-1.214286,-3.095238,-1.698413,-1.246032,
3,2022_1103_1120,1103,1120,122,0.005629,-0.073082,-2.095238,-1.420635,-2.571429,-0.071429,-3.904762,-5.174603,-2.150794,
4,2022_1103_1124,1103,1124,124,-0.019429,-0.00877,-3.285714,1.81746,-3.904762,-1.404762,-4.761905,-0.31746,0.230159,


In [133]:
# Run the model
X_AB = team_AB_diffs[['rank_diff', 'efgpct_diff', 'ftpct_diff', 'or_diff', 'dr_diff',
       'ast_diff', 'to_diff', 'stl_diff', 'blk_diff', 'pf_diff']]

In [134]:
y_pred = brf.predict(X_AB)
print(y_pred)

[1 0 1 ... 0 1 0]


In [139]:
columns = ['lowidwin']

predictions_AB = pd.DataFrame(y_pred, index=team_AB_diffs.ID, columns=columns)
predictions_AB.head()

Unnamed: 0_level_0,lowidwin
ID,Unnamed: 1_level_1
2022_1103_1104,1
2022_1103_1112,0
2022_1103_1116,1
2022_1103_1120,1
2022_1103_1124,0


In [140]:
y_probs = brf.predict_proba(X_AB)
print(y_probs)

[[0.408 0.592]
 [0.504 0.496]
 [0.452 0.548]
 ...
 [0.608 0.392]
 [0.476 0.524]
 [0.528 0.472]]


In [166]:
columns = ['lowidlose_prob','lowidwin_prob']

probs_AB = pd.DataFrame(y_probs, index=team_AB_diffs.ID, columns=columns)
probs_AB.head()

Unnamed: 0_level_0,lowidlose_prob,lowidwin_prob
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
2022_1103_1104,0.408,0.592
2022_1103_1112,0.504,0.496
2022_1103_1116,0.452,0.548
2022_1103_1120,0.464,0.536
2022_1103_1124,0.512,0.488


In [154]:
# Test - GA State 1209 vs Gonzaga 1211. We definitely want to see lowidlose >0.6ish

test = probs_AB.loc[probs_AB.index=='2022_1209_1211']
test

Unnamed: 0_level_0,lowidlose_prob,lowidwin_prob
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
2022_1209_1211,0.556,0.444


In [167]:
# Getting Team ID's and Team Names

probs_AB['LowIDwin'] = predictions_AB['lowidwin']
probs_AB['TeamA_ID'] = probs_AB.index.map(lambda x: int(x.split('_')[1]))
probs_AB['TeamB_ID'] = probs_AB.index.map(lambda x: int(x.split('_')[2]))


probs_AB.head()


Unnamed: 0_level_0,lowidlose_prob,lowidwin_prob,LowIDwin,TeamA_ID,TeamB_ID
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022_1103_1104,0.408,0.592,1,1103,1104
2022_1103_1112,0.504,0.496,0,1103,1112
2022_1103_1116,0.452,0.548,1,1103,1116
2022_1103_1120,0.464,0.536,1,1103,1120
2022_1103_1124,0.512,0.488,0,1103,1124


In [156]:
teams_df

Unnamed: 0,teamid,teamname,firstd1season,lastd1season
0,1101,Abilene Chr,2014,2022
1,1102,Air Force,1985,2022
2,1103,Akron,1985,2022
3,1104,Alabama,1985,2022
4,1105,Alabama A&M,2000,2022
...,...,...,...,...
367,1468,Bellarmine,2021,2022
368,1469,Dixie St,2021,2022
369,1470,Tarleton St,2021,2022
370,1471,UC San Diego,2021,2022


In [168]:
# Merge in Team names and make the bracket with this little cheat sheet!

# teams_2022 = teams_2022.merge(end_ranks_2022[['teamid','ordinalrank']], how='left', left_on='TeamID', right_on='teamid').drop(['teamid'], axis=1)

probs_AB = probs_AB.merge(teams_df[['teamid','teamname']], how='left', left_on='TeamA_ID', right_on='teamid').drop(['teamid'], axis=1)
probs_AB = probs_AB.merge(teams_df[['teamid','teamname']], how='left', left_on='TeamB_ID', right_on='teamid').drop(['teamid'], axis=1)

probs_AB.rename(columns={'teamname_x': 'TeamA_Name', 'teamname_y': 'TeamB_Name'},inplace=True)

probs_AB.head()

Unnamed: 0,lowidlose_prob,lowidwin_prob,LowIDwin,TeamA_ID,TeamB_ID,TeamA_Name,TeamB_Name
0,0.408,0.592,1,1103,1104,Akron,Alabama
1,0.504,0.496,0,1103,1112,Akron,Arizona
2,0.452,0.548,1,1103,1116,Akron,Arkansas
3,0.464,0.536,1,1103,1120,Akron,Auburn
4,0.512,0.488,0,1103,1124,Akron,Baylor


In [173]:
probs_AB['Winner'] = np.where((probs_AB['LowIDwin'] ==1), probs_AB['TeamA_Name'], probs_AB['TeamB_Name'])
                              
probs_AB.head()

Unnamed: 0,lowidlose_prob,lowidwin_prob,LowIDwin,TeamA_ID,TeamB_ID,TeamA_Name,TeamB_Name,Winner
0,0.408,0.592,1,1103,1104,Akron,Alabama,Akron
1,0.504,0.496,0,1103,1112,Akron,Arizona,Arizona
2,0.452,0.548,1,1103,1116,Akron,Arkansas,Akron
3,0.464,0.536,1,1103,1120,Akron,Auburn,Akron
4,0.512,0.488,0,1103,1124,Akron,Baylor,Baylor


In [178]:
# Export the model results as a csv

path = 'Exports/model_results.csv'

probs_AB.to_csv(path)

In [180]:
# 2022 Team Stats per game

path = 'Exports/team_stats_per_game.csv'

teams_2022.to_csv(path)