In [1]:
!pip install flask-sqlalchemy





In [2]:
# Import dependencies

# For data processing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# For ML
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# For Database connections
import sqlalchemy 
from sqlalchemy import create_engine, func
from sqlalchemy import inspect
# from sqlalchemy import session
import psycopg2
from sqlalchemy.ext.automap import automap_base

In [3]:
# Connect to Database - this will be done via config.py in the future
host = "group2022.cem6bfyajguw.us-east-2.rds.amazonaws.com"
database = "postgres"
user = 'postgres'
password = 'postgrespassword'

db_string = f"postgresql://{user}:{password}@{host}/{database}"

In [4]:
# Set up SQLAlchemy
engine = create_engine(db_string)
insp = inspect(engine)
insp.get_table_names()

['teams',
 'regular_season_detailed_results',
 'all_game_results',
 'conferences',
 'rankings_with_team_names',
 'conferences_with_team_names',
 'sag_system',
 'rankings']

In [5]:
# Teams raw DF
teams_df=pd.read_sql_table('teams', engine)
print(teams_df.shape)
teams_df.head()

(372, 4)


Unnamed: 0,teamid,teamname,firstd1season,lastd1season
0,1101,Abilene Chr,2014,2022
1,1102,Air Force,1985,2022
2,1103,Akron,1985,2022
3,1104,Alabama,1985,2022
4,1105,Alabama A&M,2000,2022


In [6]:
# Rankings raw DF
rankings_df=pd.read_sql_table('rankings', engine)
print(rankings_df.shape)
rankings_df.head()

(4521720, 5)


Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank
0,2010,113,BPI,1315,332
1,2010,113,BPI,1316,238
2,2010,113,BPI,1317,145
3,2010,113,BPI,1318,73
4,2010,113,BPI,1319,206


In [7]:
# Checking ranking systems
rankings_df['systemname'].value_counts()

SAG    122047
MOR    121057
POM    118566
DOK    105270
WLK     97047
        ...  
HRN       351
CRW       351
PMC       351
BP5       345
PH        326
Name: systemname, Length: 187, dtype: int64

In [8]:
# We need to get just SAG ranks for simplicity
SAG_ranks = rankings_df.loc[rankings_df['systemname']=='SAG']
SAG_ranks.head()

Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank
6393,2010,113,SAG,1102,230
6394,2010,113,SAG,1103,101
6395,2010,113,SAG,1104,83
6396,2010,113,SAG,1105,334
6397,2010,113,SAG,1106,314


In [9]:
# Results raw DF

raw_results_df=pd.read_sql_table('all_game_results', engine)
print(raw_results_df.shape)
raw_results_df.head()

(101604, 48)


Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wordinalrank,lordinalrank,wfgpercent,lfgpercent,wfg3percent,lfg3percent,wftpercent,lftpercent,seasonweek,ordinalrank
0,2003,134,1421,92,1411,84,N,1,32,69,...,252.0,262.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613,2003-19.0-1421,252.0
1,2003,136,1112,80,1436,51,N,0,31,66,...,1.0,166.0,0.469697,0.3125,0.304348,0.25,0.785714,1.0,2003-19.0-1112,1.0
2,2003,136,1113,84,1272,71,N,0,31,59,...,29.0,25.0,0.525424,0.362319,0.428571,0.25,0.727273,0.666667,2003-19.0-1113,29.0
3,2003,136,1141,79,1166,73,N,0,29,53,...,52.0,18.0,0.54717,0.45,0.428571,0.411765,0.72,0.705882,2003-19.0-1141,52.0
4,2003,136,1143,76,1301,74,N,1,27,64,...,35.0,51.0,0.421875,0.446429,0.35,0.428571,0.652174,0.75,2003-19.0-1143,35.0


In [10]:
raw_results_df.columns

Index(['season', 'daynum', 'wteamid', 'wscore', 'lteamid', 'lscore', 'wloc',
       'numot', 'wfgm', 'wfga', 'wfgm3', 'wfga3', 'wftm', 'wfta', 'wor', 'wdr',
       'wast', 'wto', 'wstl', 'wblk', 'wpf', 'lfgm', 'lfga', 'lfgm3', 'lfga3',
       'lftm', 'lfta', 'lor', 'ldr', 'last', 'lto', 'lstl', 'lblk', 'lpf',
       'gametype', 'weeknum', 'wseason_week_team', 'lseason_week_team',
       'wordinalrank', 'lordinalrank', 'wfgpercent', 'lfgpercent',
       'wfg3percent', 'lfg3percent', 'wftpercent', 'lftpercent', 'seasonweek',
       'ordinalrank'],
      dtype='object')

In [11]:
# We want to drop years before 2003, because rankings go to 2003
results_df = raw_results_df[raw_results_df['season']>=2003]
results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wordinalrank,lordinalrank,wfgpercent,lfgpercent,wfg3percent,lfg3percent,wftpercent,lftpercent,seasonweek,ordinalrank
0,2003,134,1421,92,1411,84,N,1,32,69,...,252.0,262.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613,2003-19.0-1421,252.0
1,2003,136,1112,80,1436,51,N,0,31,66,...,1.0,166.0,0.469697,0.3125,0.304348,0.25,0.785714,1.0,2003-19.0-1112,1.0
2,2003,136,1113,84,1272,71,N,0,31,59,...,29.0,25.0,0.525424,0.362319,0.428571,0.25,0.727273,0.666667,2003-19.0-1113,29.0
3,2003,136,1141,79,1166,73,N,0,29,53,...,52.0,18.0,0.54717,0.45,0.428571,0.411765,0.72,0.705882,2003-19.0-1141,52.0
4,2003,136,1143,76,1301,74,N,1,27,64,...,35.0,51.0,0.421875,0.446429,0.35,0.428571,0.652174,0.75,2003-19.0-1143,35.0


# Preprocessing & Feature Engineering


In [12]:
# Make a new column in SAG_Ranks called 'Season_Day_Team' that we will repeat in Results 

SAG_ranks['season-day-team'] = SAG_ranks['season'].map(str) + '-' + SAG_ranks['rankingdaynum'].map(str) + '-' + SAG_ranks['teamid'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank,season-day-team
6393,2010,113,SAG,1102,230,2010-113-1102
6394,2010,113,SAG,1103,101,2010-113-1103
6395,2010,113,SAG,1104,83,2010-113-1104
6396,2010,113,SAG,1105,334,2010-113-1105
6397,2010,113,SAG,1106,314,2010-113-1106


In [13]:
# Make new columns in SAG_Ranks called 'RankingWeek' and 'Season_Week_Team' because rankings only come out weekly

SAG_ranks['rankingweek'] = (SAG_ranks['rankingdaynum']/7).apply(np.ceil)

SAG_ranks['season-week-team'] = SAG_ranks['season'].map(str) + '-' + SAG_ranks['rankingweek'].map(str) + '-' + SAG_ranks['teamid'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank,season-day-team,rankingweek,season-week-team
6393,2010,113,SAG,1102,230,2010-113-1102,17.0,2010-17.0-1102
6394,2010,113,SAG,1103,101,2010-113-1103,17.0,2010-17.0-1103
6395,2010,113,SAG,1104,83,2010-113-1104,17.0,2010-17.0-1104
6396,2010,113,SAG,1105,334,2010-113-1105,17.0,2010-17.0-1105
6397,2010,113,SAG,1106,314,2010-113-1106,17.0,2010-17.0-1106


In [14]:
# Make a new column in Results called 'WSeason_Day_Team' & 'LSeason_Day_Team' that we'll use for potential indices

results_df['wseason-Day-Team'] = results_df['season'].map(str) + '-' + results_df['daynum'].map(str) + '-' + results_df['wteamid'].map(str)
results_df['lseason-Day-Team'] = results_df['season'].map(str) + '-' + results_df['daynum'].map(str) + '-' + results_df['lteamid'].map(str)
results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wfgpercent,lfgpercent,wfg3percent,lfg3percent,wftpercent,lftpercent,seasonweek,ordinalrank,wseason-Day-Team,lseason-Day-Team
0,2003,134,1421,92,1411,84,N,1,32,69,...,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613,2003-19.0-1421,252.0,2003-134-1421,2003-134-1411
1,2003,136,1112,80,1436,51,N,0,31,66,...,0.469697,0.3125,0.304348,0.25,0.785714,1.0,2003-19.0-1112,1.0,2003-136-1112,2003-136-1436
2,2003,136,1113,84,1272,71,N,0,31,59,...,0.525424,0.362319,0.428571,0.25,0.727273,0.666667,2003-19.0-1113,29.0,2003-136-1113,2003-136-1272
3,2003,136,1141,79,1166,73,N,0,29,53,...,0.54717,0.45,0.428571,0.411765,0.72,0.705882,2003-19.0-1141,52.0,2003-136-1141,2003-136-1166
4,2003,136,1143,76,1301,74,N,1,27,64,...,0.421875,0.446429,0.35,0.428571,0.652174,0.75,2003-19.0-1143,35.0,2003-136-1143,2003-136-1301


In [15]:
# Make a new column in Results called 'WSeason_Week_Team' & 'LSeason_Week_Team' that we'll use for loc to pull in OrdinalRank for winning and losing team

results_df['weeknum'] = (results_df['daynum']/7).apply(np.ceil)

results_df['wseason-week-team'] = results_df['season'].map(str) + '-' + results_df['weeknum'].map(str) + '-' + results_df['wteamid'].map(str)
results_df['lseason-week-team'] = results_df['season'].map(str) + '-' + results_df['weeknum'].map(str) + '-' + results_df['lteamid'].map(str)
results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wfg3percent,lfg3percent,wftpercent,lftpercent,seasonweek,ordinalrank,wseason-Day-Team,lseason-Day-Team,wseason-week-team,lseason-week-team
0,2003,134,1421,92,1411,84,N,1,32,69,...,0.37931,0.387097,0.653846,0.451613,2003-19.0-1421,252.0,2003-134-1421,2003-134-1411,2003-20.0-1421,2003-20.0-1411
1,2003,136,1112,80,1436,51,N,0,31,66,...,0.304348,0.25,0.785714,1.0,2003-19.0-1112,1.0,2003-136-1112,2003-136-1436,2003-20.0-1112,2003-20.0-1436
2,2003,136,1113,84,1272,71,N,0,31,59,...,0.428571,0.25,0.727273,0.666667,2003-19.0-1113,29.0,2003-136-1113,2003-136-1272,2003-20.0-1113,2003-20.0-1272
3,2003,136,1141,79,1166,73,N,0,29,53,...,0.428571,0.411765,0.72,0.705882,2003-19.0-1141,52.0,2003-136-1141,2003-136-1166,2003-20.0-1141,2003-20.0-1166
4,2003,136,1143,76,1301,74,N,1,27,64,...,0.35,0.428571,0.652174,0.75,2003-19.0-1143,35.0,2003-136-1143,2003-136-1301,2003-20.0-1143,2003-20.0-1301


In [16]:
# Merging winning team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['season-week-team','ordinalrank']], how='left', left_on='wseason-week-team', right_on='season-week-team').drop(columns=['season-week-team'])

results_df.sample(10)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lfg3percent,wftpercent,lftpercent,seasonweek,ordinalrank_x,wseason-Day-Team,lseason-Day-Team,wseason-week-team,lseason-week-team,ordinalrank_y
19049,2006,72,1419,78,1311,59,H,0,30,57,...,0.296296,0.590909,0.6875,2006-11.0-1419,278.0,2006-72-1419,2006-72-1311,2006-11.0-1419,2006-11.0-1311,278.0
62623,2014,28,1400,70,1435,64,H,0,25,56,...,0.238095,0.517241,0.714286,2014-4.0-1400,74.0,2014-28-1400,2014-28-1435,2014-4.0-1400,2014-4.0-1435,74.0
94667,2019,129,1163,80,1378,73,N,0,25,46,...,0.291667,0.607143,0.666667,2019-19.0-1163,96.0,2019-129-1163,2019-129-1378,2019-19.0-1163,2019-19.0-1378,97.0
61326,2013,129,1393,62,1338,59,N,0,22,47,...,0.315789,0.6,0.578947,2013-19.0-1393,12.0,2013-129-1393,2013-129-1338,2013-19.0-1393,2013-19.0-1338,12.0
20488,2006,103,1355,75,1410,58,H,0,27,53,...,0.277778,0.8,0.5,2006-15.0-1355,298.0,2006-103-1355,2006-103-1410,2006-15.0-1355,2006-15.0-1410,298.0
76158,2016,93,1205,79,1219,74,A,0,27,47,...,0.4,0.736842,0.761905,2016-14.0-1205,245.0,2016-93-1205,2016-93-1219,2016-14.0-1205,2016-14.0-1219,245.0
2322,2003,47,1276,70,1435,66,H,0,20,52,...,0.3125,0.72973,0.6875,2003-7.0-1276,221.0,2003-47-1276,2003-47-1435,2003-7.0-1276,2003-7.0-1435,221.0
11723,2004,124,1350,69,1269,65,H,0,25,56,...,0.45,0.5625,0.625,,,2004-124-1350,2004-124-1269,2004-18.0-1350,2004-18.0-1269,
104007,2021,128,1274,67,1155,64,N,0,29,54,...,0.521739,0.8,1.0,2021-19.0-1274,114.0,2021-128-1274,2021-128-1155,2021-19.0-1274,2021-19.0-1155,97.0
98639,2020,93,1236,75,1442,69,H,0,23,48,...,0.333333,0.777778,0.9,2020-14.0-1236,275.0,2020-93-1236,2020-93-1442,2020-14.0-1236,2020-14.0-1442,275.0


In [17]:
# Rename OrdinalRank to WRank
results_df.rename(columns={'ordinalrank_y': 'wrank'},inplace=True)
results_df.sample(10)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lfg3percent,wftpercent,lftpercent,seasonweek,ordinalrank_x,wseason-Day-Team,lseason-Day-Team,wseason-week-team,lseason-week-team,wrank
104959,2022,21,1211,107,1141,54,N,0,42,74,...,0.391304,0.583333,0.6875,2022-3.0-1211,1.0,2022-21-1211,2022-21-1141,2022-3.0-1211,2022-3.0-1141,1.0
102242,2021,82,1360,78,1339,70,A,0,31,55,...,0.363636,0.785714,0.736842,2021-12.0-1360,242.0,2021-82-1360,2021-82-1339,2021-12.0-1360,2021-12.0-1339,242.0
52883,2012,66,1284,73,1291,59,H,0,29,54,...,0.217391,0.6,0.555556,2012-10.0-1284,334.0,2012-66-1284,2012-66-1291,2012-10.0-1284,2012-10.0-1291,334.0
33156,2008,129,1210,94,1438,76,N,0,33,57,...,0.391304,0.833333,0.724138,2008-19.0-1210,69.0,2008-129-1210,2008-129-1438,2008-19.0-1210,2008-19.0-1438,69.0
10408,2004,103,1341,67,1238,64,H,0,21,45,...,0.36,0.733333,0.5625,2004-15.0-1341,306.0,2004-103-1341,2004-103-1238,2004-15.0-1341,2004-15.0-1238,305.0
25472,2007,100,1107,71,1263,55,A,0,26,56,...,0.15,0.571429,0.666667,2007-15.0-1107,140.0,2007-100-1107,2007-100-1263,2007-15.0-1107,2007-15.0-1263,140.0
80160,2017,59,1383,80,1136,77,H,1,29,73,...,0.2,0.6,0.703704,2017-9.0-1383,335.0,2017-59-1383,2017-59-1136,2017-9.0-1383,2017-9.0-1136,335.0
19249,2006,75,1463,75,1135,61,H,0,27,58,...,0.235294,0.75,0.641026,2006-11.0-1463,210.0,2006-75-1463,2006-75-1135,2006-11.0-1463,2006-11.0-1135,210.0
93974,2019,114,1222,99,1187,65,A,0,35,58,...,0.384615,0.666667,0.666667,2019-17.0-1222,18.0,2019-114-1222,2019-114-1187,2019-17.0-1222,2019-17.0-1187,18.0
67049,2015,11,1165,68,1206,60,A,0,21,47,...,0.259259,0.607143,0.52381,,,2015-11-1165,2015-11-1206,2015-2.0-1165,2015-2.0-1206,


In [18]:
# Merging losing team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['season-week-team','ordinalrank']], how='left', left_on='lseason-week-team', right_on='season-week-team').drop(columns=['season-week-team'])

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wftpercent,lftpercent,seasonweek,ordinalrank_x,wseason-Day-Team,lseason-Day-Team,wseason-week-team,lseason-week-team,wrank,ordinalrank
5074,2003,107,1410,45,1366,39,A,0,12,36,...,0.62069,0.7,2003-16.0-1410,305.0,2003-107-1410,2003-107-1366,2003-16.0-1410,2003-16.0-1366,302.0,325.0
75458,2014,94,1208,91,1261,78,H,0,27,45,...,0.695652,0.681818,2014-14.0-1208,145.0,2014-94-1208,2014-94-1261,2014-14.0-1208,2014-14.0-1261,145.0,53.0
89929,2017,19,1344,71,1212,54,H,0,25,54,...,0.8,0.55,2017-3.0-1344,56.0,2017-19-1344,2017-19-1212,2017-3.0-1344,2017-3.0-1212,56.0,351.0
29051,2007,90,1375,90,1317,89,H,1,30,63,...,0.653846,0.65,2007-13.0-1375,149.0,2007-90-1375,2007-90-1317,2007-13.0-1375,2007-13.0-1317,149.0,180.0
96221,2018,25,1414,77,1319,71,N,0,25,50,...,0.689655,0.826087,2018-4.0-1414,146.0,2018-25-1414,2018-25-1319,2018-4.0-1414,2018-4.0-1319,146.0,327.0


In [19]:
# Rename OrdinalRank to LRank
results_df.rename(columns={'ordinalrank': 'lrank'},inplace=True)
results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wftpercent,lftpercent,seasonweek,ordinalrank_x,wseason-Day-Team,lseason-Day-Team,wseason-week-team,lseason-week-team,wrank,lrank
10736,2004,87,1451,68,1340,63,A,0,24,49,...,0.782609,0.6875,2004-13.0-1451,166.0,2004-87-1451,2004-87-1340,2004-13.0-1451,2004-13.0-1340,158.0,190.0
89345,2016,130,1433,85,1269,70,N,0,29,72,...,0.8125,0.809524,2016-19.0-1433,40.0,2016-130-1433,2016-130-1269,2016-19.0-1433,2016-19.0-1269,39.0,154.0
119981,2022,82,1308,72,1372,58,A,0,26,59,...,0.6,0.8,2022-12.0-1308,73.0,2022-82-1308,2022-82-1372,2022-12.0-1308,2022-12.0-1372,73.0,140.0
99404,2018,100,1435,81,1208,66,H,0,27,49,...,1.0,0.631579,2018-15.0-1435,74.0,2018-100-1435,2018-100-1208,2018-15.0-1435,2018-15.0-1208,74.0,61.0
35891,2008,68,1174,81,1330,74,H,2,22,55,...,0.764706,0.68,,,2008-68-1174,2008-68-1330,2008-10.0-1174,2008-10.0-1330,,


In [20]:
# Getting ranking differential to use as a variable - negative values are underdog wins
results_df['rankdiff'] = results_df['lrank'] - results_df['wrank']

# Getting Score differential just in case
results_df['spread'] = results_df['wscore'] - results_df['lscore']

# Outcome column for if the lower teamID won, as that is the submission format for Kaggle
results_df['lowidwin'] = np.where((results_df['wteamid'] < results_df['lteamid']), 1, 0)

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,ordinalrank_x,wseason-Day-Team,lseason-Day-Team,wseason-week-team,lseason-week-team,wrank,lrank,rankdiff,spread,lowidwin
39831,2008,130,1140,63,1361,54,N,0,21,47,...,30.0,2008-130-1140,2008-130-1361,2008-19.0-1140,2008-19.0-1361,30.0,89.0,59.0,9,1
6849,2003,129,1326,66,1234,64,N,0,23,54,...,86.0,2003-129-1326,2003-129-1234,2003-19.0-1326,2003-19.0-1234,72.0,83.0,11.0,2,0
17570,2005,72,1356,69,1191,57,A,0,27,60,...,40.0,2005-72-1356,2005-72-1191,2005-11.0-1356,2005-11.0-1191,40.0,85.0,45.0,12,0
40478,2009,19,1178,62,1185,47,H,0,23,51,...,191.0,2009-19-1178,2009-19-1185,2009-3.0-1178,2009-3.0-1185,191.0,187.0,-4.0,15,1
68352,2013,65,1307,65,1424,60,H,0,22,57,...,40.0,2013-65-1307,2013-65-1424,2013-10.0-1307,2013-10.0-1424,40.0,29.0,-11.0,5,1


In [21]:
results_df.columns

Index(['season', 'daynum', 'wteamid', 'wscore', 'lteamid', 'lscore', 'wloc',
       'numot', 'wfgm', 'wfga', 'wfgm3', 'wfga3', 'wftm', 'wfta', 'wor', 'wdr',
       'wast', 'wto', 'wstl', 'wblk', 'wpf', 'lfgm', 'lfga', 'lfgm3', 'lfga3',
       'lftm', 'lfta', 'lor', 'ldr', 'last', 'lto', 'lstl', 'lblk', 'lpf',
       'gametype', 'weeknum', 'wseason_week_team', 'lseason_week_team',
       'wordinalrank', 'lordinalrank', 'wfgpercent', 'lfgpercent',
       'wfg3percent', 'lfg3percent', 'wftpercent', 'lftpercent', 'seasonweek',
       'ordinalrank_x', 'wseason-Day-Team', 'lseason-Day-Team',
       'wseason-week-team', 'lseason-week-team', 'wrank', 'lrank', 'rankdiff',
       'spread', 'lowidwin'],
      dtype='object')

In [23]:
# Getting shooting percentages

results_df['wfgpct'] = results_df['wfgm'] / results_df['wfga']
results_df['lfgpct'] = results_df['lfgm'] / results_df['lfga']

results_df['wfg3pct'] = results_df['wfgm3'] / results_df['wfga3']
results_df['lfg3pct'] = results_df['lfgm3'] / results_df['lfga3']

results_df['wefgpct'] = (results_df['wfgm'] + 0.5 * results_df['wfga3']) / results_df['wfga']
results_df['lefgpct'] = (results_df['lfgm'] + 0.5 * results_df['lfgm3']) / results_df['lfga']

results_df['wftpct'] = results_df['wftm'] / results_df['wfta']
results_df['lftpct'] = results_df['lftm'] / results_df['lfta']

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,spread,lowidwin,wfgpct,lfgpct,wfg3pct,lfg3pct,wefgpct,lefgpct,wftpct,lftpct
13991,2004,117,1157,69,1149,53,H,0,27,63,...,16,0,0.428571,0.404255,0.285714,0.25,0.595238,0.43617,0.692308,0.705882
56343,2011,52,1408,68,1380,65,A,0,19,46,...,3,0,0.413043,0.416667,0.357143,0.4,0.565217,0.483333,0.757576,0.7
83237,2015,129,1414,63,1415,54,N,0,21,50,...,9,1,0.42,0.354839,0.4,0.117647,0.57,0.370968,0.625,0.5
80648,2015,80,1112,89,1390,82,A,0,30,54,...,7,1,0.555556,0.5,0.416667,0.4,0.666667,0.58,0.666667,0.774194
64605,2012,110,1247,72,1269,71,A,0,26,60,...,1,1,0.433333,0.446154,0.347826,0.235294,0.625,0.476923,0.705882,0.5


In [24]:
# getting differentials between teams for stats

results_df['fgpct_diff'] = results_df['wfgpct'] - results_df['lfgpct']
results_df['fg3pct_diff'] = results_df['wfg3pct'] - results_df['lfg3pct']
results_df['efgpct_diff'] = results_df['wefgpct'] - results_df['lefgpct']
results_df['ftpct_diff'] = results_df['wftpct'] - results_df['lftpct']
results_df['or_diff'] = results_df['wor'] - results_df['lor']
results_df['dr_diff'] = results_df['wdr'] - results_df['ldr']
results_df['ast_diff'] = results_df['wast'] - results_df['last']
results_df['to_diff'] = results_df['wto'] - results_df['lto']
results_df['stl_diff'] = results_df['wstl'] - results_df['lstl']
results_df['blk_diff'] = results_df['wblk'] - results_df['lblk']
results_df['pf_diff'] = results_df['wpf'] - results_df['lpf']

results_df.columns

Index(['season', 'daynum', 'wteamid', 'wscore', 'lteamid', 'lscore', 'wloc',
       'numot', 'wfgm', 'wfga', 'wfgm3', 'wfga3', 'wftm', 'wfta', 'wor', 'wdr',
       'wast', 'wto', 'wstl', 'wblk', 'wpf', 'lfgm', 'lfga', 'lfgm3', 'lfga3',
       'lftm', 'lfta', 'lor', 'ldr', 'last', 'lto', 'lstl', 'lblk', 'lpf',
       'gametype', 'weeknum', 'wseason_week_team', 'lseason_week_team',
       'wordinalrank', 'lordinalrank', 'wfgpercent', 'lfgpercent',
       'wfg3percent', 'lfg3percent', 'wftpercent', 'lftpercent', 'seasonweek',
       'ordinalrank_x', 'wseason-Day-Team', 'lseason-Day-Team',
       'wseason-week-team', 'lseason-week-team', 'wrank', 'lrank', 'rankdiff',
       'spread', 'lowidwin', 'wfgpct', 'lfgpct', 'wfg3pct', 'lfg3pct',
       'wefgpct', 'lefgpct', 'wftpct', 'lftpct', 'fgpct_diff', 'fg3pct_diff',
       'efgpct_diff', 'ftpct_diff', 'or_diff', 'dr_diff', 'ast_diff',
       'to_diff', 'stl_diff', 'blk_diff', 'pf_diff'],
      dtype='object')

In [25]:
# Drop NA's for a nice and pretty DF

clean_results_df = results_df.dropna()

clean_results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,fg3pct_diff,efgpct_diff,ftpct_diff,or_diff,dr_diff,ast_diff,to_diff,stl_diff,blk_diff,pf_diff
1983,2003,36,1113,76,1305,63,H,0,25,57,...,0.031579,0.02556,0.32381,-4,4,2,-8,3,-1,-7
1984,2003,36,1116,72,1256,60,H,0,25,58,...,0.038363,0.16931,-0.117794,1,-2,4,-7,-1,0,-1
1985,2003,36,1130,85,1235,78,A,0,28,57,...,0.052632,0.224071,-0.05335,-10,0,-1,-1,3,-5,3
1986,2003,36,1139,75,1133,70,H,0,23,56,...,-0.478469,0.019133,0.235577,4,-1,-5,-4,3,0,-4
1987,2003,36,1143,67,1364,60,H,0,18,42,...,0.076923,0.052857,0.233333,4,2,-8,-1,2,1,-16


# Model Assembly and Training

In [26]:
# X = ranks, y = win?

y = clean_results_df['lowidwin']
X = clean_results_df[['rankdiff', 'efgpct_diff', 'ftpct_diff', 'or_diff', 'dr_diff',
       'ast_diff', 'to_diff', 'stl_diff', 'blk_diff', 'pf_diff']]

In [27]:
# Do TTS 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(82959, 10)

In [28]:
# Set up the Balanced Random Forest

from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=250, random_state=1)
brf.fit(X_train, y_train)


BalancedRandomForestClassifier(n_estimators=250, random_state=1)

In [33]:
predictions = brf.predict_proba(X_test)
print (predictions)

[[0.856 0.144]
 [0.676 0.324]
 [0.38  0.62 ]
 ...
 [0.384 0.616]
 [0.944 0.056]
 [0.48  0.52 ]]


In [None]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
print(accuracy_score(y_test, y_pred))

In [None]:
# Get importances and features
importances = brf.feature_importances_
cols = X.columns

# Store in a DataFrame
feature_importances_df = pd.DataFrame({'feature':cols, 'importance': importances})
feature_importances_df

In [None]:
test_df = clean_results_df.loc[clean_results_df['wteamid']== 1463][clean_results_df['daynum']== 37][clean_results_df['season']== 2003]
test_df

In [None]:
probability = 1+(feature_importances_df.loc[feature_importances_df['importance']=='efgpct_diff']*test_df['efgpct_diff'])
probability