In [2]:
!pip install flask-sqlalchemy

Collecting flask-sqlalchemy
  Downloading Flask_SQLAlchemy-2.5.1-py2.py3-none-any.whl (17 kB)




Installing collected packages: flask-sqlalchemy
Successfully installed flask-sqlalchemy-2.5.1


In [12]:
# Import dependencies

# For data processing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# For ML
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# For Database connections
import sqlalchemy 
from sqlalchemy import create_engine, func
from sqlalchemy import inspect
# from sqlalchemy import session
import psycopg2
from sqlalchemy.ext.automap import automap_base

In [13]:
# Connect to Database - this will be done via config.py in the future
host = "group2022.cem6bfyajguw.us-east-2.rds.amazonaws.com"
database = "postgres"
user = 'postgres'
password = 'postgrespassword'

db_string = f"postgresql://{user}:{password}@{host}/{database}"

In [14]:
# Set up SQLAlchemy
engine = create_engine(db_string)
insp = inspect(engine)
insp.get_table_names()

['teams',
 'all_game_results',
 'conferences',
 'rankings_with_team_names',
 'conferences_with_team_names',
 'sag_system',
 'rankings']

In [16]:
# Teams raw DF
teams_df=pd.read_sql_table('teams', engine)
print(teams_df.shape)
teams_df.head()

(372, 4)


Unnamed: 0,teamid,teamname,firstd1season,lastd1season
0,1101,Abilene Chr,2014,2022
1,1102,Air Force,1985,2022
2,1103,Akron,1985,2022
3,1104,Alabama,1985,2022
4,1105,Alabama A&M,2000,2022


In [17]:
# Rankings raw DF
rankings_df=pd.read_sql_table('rankings', engine)
print(rankings_df.shape)
rankings_df.head()

(4521720, 5)


Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank
0,2010,113,BPI,1315,332
1,2010,113,BPI,1316,238
2,2010,113,BPI,1317,145
3,2010,113,BPI,1318,73
4,2010,113,BPI,1319,206


In [19]:
# Checking ranking systems
rankings_df['systemname'].value_counts()

SAG    122047
MOR    121057
POM    118566
DOK    105270
WLK     97047
        ...  
HRN       351
CRW       351
PMC       351
BP5       345
PH        326
Name: systemname, Length: 187, dtype: int64

In [20]:
# We need to get just SAG ranks for simplicity
SAG_ranks = rankings_df.loc[rankings_df['systemname']=='SAG']
SAG_ranks.head()

Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank
6393,2010,113,SAG,1102,230
6394,2010,113,SAG,1103,101
6395,2010,113,SAG,1104,83
6396,2010,113,SAG,1105,334
6397,2010,113,SAG,1106,314


In [21]:
# Results raw DF

raw_results_df=pd.read_sql_table('all_game_results', engine)
print(raw_results_df.shape)
raw_results_df.head()

(124368, 46)


Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wseason_week_team,lseason_week_team,wordinalrank,lordinalrank,wfgpercent,lfgpercent,wfg3percent,lfg3percent,wftpercent,lftpercent
0,2003,134,1421,92,1411,84,N,1,32,69,...,2003-19.0-1421,2003-19.0-1411,252.0,262.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613
1,2003,134,1421,92,1411,84,N,1,32,69,...,2003-19.0-1421,2003-19.0-1411,252.0,257.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613
2,2003,134,1421,92,1411,84,N,1,32,69,...,2003-19.0-1421,2003-19.0-1411,251.0,262.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613
3,2003,134,1421,92,1411,84,N,1,32,69,...,2003-19.0-1421,2003-19.0-1411,251.0,257.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613
4,2003,136,1112,80,1436,51,N,0,31,66,...,2003-19.0-1112,2003-19.0-1436,1.0,166.0,0.469697,0.3125,0.304348,0.25,0.785714,1.0


In [26]:
raw_results_df.columns

Index(['season', 'daynum', 'wteamid', 'wscore', 'lteamid', 'lscore', 'wloc',
       'numot', 'wfgm', 'wfga', 'wfgm3', 'wfga3', 'wftm', 'wftaint', 'wor',
       'wdr', 'wast', 'wto', 'wstl', 'wblk', 'wpf', 'lfgm', 'lfga', 'lfgm3',
       'lfga3', 'lftm', 'lfta', 'lor', 'ldr', 'last', 'lto', 'lstl', 'lblk',
       'lpf', 'gametype', 'weeknum', 'wseason_week_team', 'lseason_week_team',
       'wordinalrank', 'lordinalrank', 'wfgpercent', 'lfgpercent',
       'wfg3percent', 'lfg3percent', 'wftpercent', 'lftpercent'],
      dtype='object')

In [24]:
# We want to drop years before 2003, because rankings go to 2003
results_df = raw_results_df[raw_results_df['season']>=2003]
results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wseason_week_team,lseason_week_team,wordinalrank,lordinalrank,wfgpercent,lfgpercent,wfg3percent,lfg3percent,wftpercent,lftpercent
0,2003,134,1421,92,1411,84,N,1,32,69,...,2003-19.0-1421,2003-19.0-1411,252.0,262.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613
1,2003,134,1421,92,1411,84,N,1,32,69,...,2003-19.0-1421,2003-19.0-1411,252.0,257.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613
2,2003,134,1421,92,1411,84,N,1,32,69,...,2003-19.0-1421,2003-19.0-1411,251.0,262.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613
3,2003,134,1421,92,1411,84,N,1,32,69,...,2003-19.0-1421,2003-19.0-1411,251.0,257.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613
4,2003,136,1112,80,1436,51,N,0,31,66,...,2003-19.0-1112,2003-19.0-1436,1.0,166.0,0.469697,0.3125,0.304348,0.25,0.785714,1.0


# Preprocessing & Feature Engineering


In [29]:
# Make a new column in SAG_Ranks called 'Season_Day_Team' that we will repeat in Results 

SAG_ranks['season-day-team'] = SAG_ranks['season'].map(str) + '-' + SAG_ranks['rankingdaynum'].map(str) + '-' + SAG_ranks['teamid'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank,Season-Day-Team,season-day-team
6393,2010,113,SAG,1102,230,2010-113-1102,2010-113-1102
6394,2010,113,SAG,1103,101,2010-113-1103,2010-113-1103
6395,2010,113,SAG,1104,83,2010-113-1104,2010-113-1104
6396,2010,113,SAG,1105,334,2010-113-1105,2010-113-1105
6397,2010,113,SAG,1106,314,2010-113-1106,2010-113-1106


In [30]:
# Make new columns in SAG_Ranks called 'RankingWeek' and 'Season_Week_Team' because rankings only come out weekly

SAG_ranks['rankingweek'] = (SAG_ranks['rankingdaynum']/7).apply(np.ceil)

SAG_ranks['season-week-team'] = SAG_ranks['season'].map(str) + '-' + SAG_ranks['rankingweek'].map(str) + '-' + SAG_ranks['teamid'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,season,rankingdaynum,systemname,teamid,ordinalrank,Season-Day-Team,season-day-team,rankingweek,season-week-team
6393,2010,113,SAG,1102,230,2010-113-1102,2010-113-1102,17.0,2010-17.0-1102
6394,2010,113,SAG,1103,101,2010-113-1103,2010-113-1103,17.0,2010-17.0-1103
6395,2010,113,SAG,1104,83,2010-113-1104,2010-113-1104,17.0,2010-17.0-1104
6396,2010,113,SAG,1105,334,2010-113-1105,2010-113-1105,17.0,2010-17.0-1105
6397,2010,113,SAG,1106,314,2010-113-1106,2010-113-1106,17.0,2010-17.0-1106


In [31]:
# Make a new column in Results called 'WSeason_Day_Team' & 'LSeason_Day_Team' that we'll use for potential indices

results_df['wseason-Day-Team'] = results_df['season'].map(str) + '-' + results_df['daynum'].map(str) + '-' + results_df['wteamid'].map(str)
results_df['lseason-Day-Team'] = results_df['season'].map(str) + '-' + results_df['daynum'].map(str) + '-' + results_df['lteamid'].map(str)
results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wordinalrank,lordinalrank,wfgpercent,lfgpercent,wfg3percent,lfg3percent,wftpercent,lftpercent,wseason-Day-Team,lseason-Day-Team
0,2003,134,1421,92,1411,84,N,1,32,69,...,252.0,262.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613,2003-134-1421,2003-134-1411
1,2003,134,1421,92,1411,84,N,1,32,69,...,252.0,257.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613,2003-134-1421,2003-134-1411
2,2003,134,1421,92,1411,84,N,1,32,69,...,251.0,262.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613,2003-134-1421,2003-134-1411
3,2003,134,1421,92,1411,84,N,1,32,69,...,251.0,257.0,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613,2003-134-1421,2003-134-1411
4,2003,136,1112,80,1436,51,N,0,31,66,...,1.0,166.0,0.469697,0.3125,0.304348,0.25,0.785714,1.0,2003-136-1112,2003-136-1436


In [32]:
# Make a new column in Results called 'WSeason_Week_Team' & 'LSeason_Week_Team' that we'll use for loc to pull in OrdinalRank for winning and losing team

results_df['weeknum'] = (results_df['daynum']/7).apply(np.ceil)

results_df['wseason-week-team'] = results_df['season'].map(str) + '-' + results_df['weeknum'].map(str) + '-' + results_df['wteamid'].map(str)
results_df['lseason-week-team'] = results_df['season'].map(str) + '-' + results_df['weeknum'].map(str) + '-' + results_df['lteamid'].map(str)
results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wfgpercent,lfgpercent,wfg3percent,lfg3percent,wftpercent,lftpercent,wseason-Day-Team,lseason-Day-Team,wseason-week-team,lseason-week-team
0,2003,134,1421,92,1411,84,N,1,32,69,...,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613,2003-134-1421,2003-134-1411,2003-20.0-1421,2003-20.0-1411
1,2003,134,1421,92,1411,84,N,1,32,69,...,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613,2003-134-1421,2003-134-1411,2003-20.0-1421,2003-20.0-1411
2,2003,134,1421,92,1411,84,N,1,32,69,...,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613,2003-134-1421,2003-134-1411,2003-20.0-1421,2003-20.0-1411
3,2003,134,1421,92,1411,84,N,1,32,69,...,0.463768,0.432836,0.37931,0.387097,0.653846,0.451613,2003-134-1421,2003-134-1411,2003-20.0-1421,2003-20.0-1411
4,2003,136,1112,80,1436,51,N,0,31,66,...,0.469697,0.3125,0.304348,0.25,0.785714,1.0,2003-136-1112,2003-136-1436,2003-20.0-1112,2003-20.0-1436


In [33]:
# Merging winning team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['season-week-team','ordinalrank']], how='left', left_on='wseason-week-team', right_on='season-week-team').drop(columns=['season-week-team'])

results_df.sample(10)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lfgpercent,wfg3percent,lfg3percent,wftpercent,lftpercent,wseason-Day-Team,lseason-Day-Team,wseason-week-team,lseason-week-team,ordinalrank
74473,2010,104,1185,66,1444,52,H,0,25,60,...,0.377778,0.263158,0.153846,0.647059,0.615385,2010-104-1185,2010-104-1444,2010-15.0-1185,2010-15.0-1444,206.0
131288,2019,59,1209,63,1426,58,A,0,16,39,...,0.357143,0.380952,0.08,0.605263,0.727273,2019-59-1209,2019-59-1426,2019-9.0-1209,2019-9.0-1426,122.0
23606,2004,117,1350,75,1203,67,A,0,25,50,...,0.403226,0.571429,0.1,0.636364,0.789474,2004-117-1350,2004-117-1203,2004-17.0-1350,2004-17.0-1203,46.0
49932,2008,61,1273,77,1244,75,A,1,31,70,...,0.353659,0.357143,0.227273,0.625,0.545455,2008-61-1273,2008-61-1244,2008-9.0-1273,2008-9.0-1244,257.0
35672,2006,117,1228,71,1234,59,H,0,23,51,...,0.446809,0.388889,0.25,0.62069,0.764706,2006-117-1228,2006-117-1234,2006-17.0-1228,2006-17.0-1234,9.0
35861,2006,121,1350,81,1269,66,H,0,27,47,...,0.489362,0.352941,0.0,0.65625,0.645161,2006-121-1350,2006-121-1269,2006-18.0-1350,2006-18.0-1269,202.0
88705,2012,129,1234,64,1228,61,N,0,23,57,...,0.435484,0.315789,0.269231,0.631579,,2012-129-1234,2012-129-1228,2012-19.0-1234,2012-19.0-1228,94.0
69055,2010,22,1412,77,1407,59,H,0,28,56,...,0.372881,0.333333,0.333333,0.636364,0.666667,2010-22-1412,2010-22-1407,2010-4.0-1412,2010-4.0-1407,50.0
31011,2005,130,1112,90,1333,59,N,0,35,59,...,0.388889,0.466667,0.238095,0.8125,0.857143,2005-130-1112,2005-130-1333,2005-19.0-1112,2005-19.0-1333,13.0
93597,2013,103,1183,79,1293,70,H,0,24,43,...,0.416667,0.583333,0.384615,0.705882,0.625,2013-103-1183,2013-103-1293,2013-15.0-1183,2013-15.0-1293,297.0


In [35]:
# Rename OrdinalRank to WRank
results_df.rename(columns={'ordinalrank': 'wrank'},inplace=True)
results_df.sample(10)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lfgpercent,wfg3percent,lfg3percent,wftpercent,lftpercent,wseason-Day-Team,lseason-Day-Team,wseason-week-team,lseason-week-team,wrank
102242,2014,130,1409,76,1292,69,N,0,23,50,...,0.403846,0.272727,0.4,0.692308,0.703704,2014-130-1409,2014-130-1292,2014-19.0-1409,2014-19.0-1292,71.0
44436,2008,16,1198,83,1244,74,H,0,28,55,...,0.38806,0.3,0.333333,0.685714,0.75,2008-16-1198,2008-16-1244,2008-3.0-1198,2008-3.0-1244,275.0
2779,2013,139,1242,70,1314,58,N,0,24,55,...,0.30137,0.357143,0.285714,0.809524,0.8,2013-139-1242,2013-139-1314,2013-20.0-1242,2013-20.0-1314,
52961,2008,75,1366,79,1255,67,H,0,28,61,...,0.339286,0.5,0.388889,0.666667,0.594595,2008-75-1366,2008-75-1255,2008-11.0-1366,2008-11.0-1255,274.0
82693,2011,130,1212,81,1238,75,N,1,22,54,...,0.410714,0.4,0.1,0.707317,0.756757,2011-130-1212,2011-130-1238,2011-19.0-1212,2011-19.0-1238,332.0
129726,2019,14,1153,78,1444,52,H,0,30,64,...,0.333333,0.384615,0.3,0.764706,0.516129,2019-14-1153,2019-14-1444,2019-2.0-1153,2019-2.0-1444,38.0
137485,2020,40,1332,71,1276,70,A,1,27,53,...,0.409836,0.5,0.428571,0.692308,0.785714,2020-40-1332,2020-40-1276,2020-6.0-1332,2020-6.0-1276,10.0
117250,2017,41,1254,74,1385,73,N,0,28,62,...,0.425926,0.5,0.409091,0.631579,0.692308,2017-41-1254,2017-41-1385,2017-6.0-1254,2017-6.0-1385,245.0
76596,2010,131,1326,88,1228,81,N,2,32,67,...,0.413333,0.36,0.333333,0.625,0.9,2010-131-1326,2010-131-1228,2010-19.0-1326,2010-19.0-1228,14.0
118853,2017,82,1198,94,1412,85,H,0,32,49,...,0.54717,0.714286,0.473684,0.869565,0.782609,2017-82-1198,2017-82-1412,2017-12.0-1198,2017-12.0-1412,291.0


In [36]:
# Merging losing team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['season-week-team','ordinalrank']], how='left', left_on='lseason-week-team', right_on='season-week-team').drop(columns=['season-week-team'])

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wfg3percent,lfg3percent,wftpercent,lftpercent,wseason-Day-Team,lseason-Day-Team,wseason-week-team,lseason-week-team,wrank,ordinalrank
97891,2009,107,1395,62,1102,50,H,0,18,41,...,0.4,0.304348,0.782609,0.818182,2009-107-1395,2009-107-1102,2009-16.0-1395,2009-16.0-1102,116.0,253.0
139125,2014,58,1153,65,1374,57,H,0,19,52,...,0.25,0.428571,0.741935,0.888889,2014-58-1153,2014-58-1374,2014-9.0-1153,2014-9.0-1374,30.0,51.0
133567,2013,103,1341,80,1290,77,H,0,32,63,...,0.416667,0.227273,0.478261,0.628571,2013-103-1341,2013-103-1290,2013-15.0-1341,2013-15.0-1290,329.0,342.0
45013,2005,129,1393,81,1353,57,N,0,28,57,...,0.4375,0.285714,0.818182,0.692308,2005-129-1393,2005-129-1353,2005-19.0-1393,2005-19.0-1353,20.0,137.0
58961,2007,129,1249,99,1358,98,N,0,31,58,...,0.391304,0.322581,0.8,0.869565,2007-129-1249,2007-129-1358,2007-19.0-1249,2007-19.0-1358,222.0,125.0


In [37]:
# Rename OrdinalRank to LRank
results_df.rename(columns={'ordinalrank': 'lrank'},inplace=True)
results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,wfg3percent,lfg3percent,wftpercent,lftpercent,wseason-Day-Team,lseason-Day-Team,wseason-week-team,lseason-week-team,wrank,lrank
76805,2008,75,1181,93,1155,80,H,0,31,58,...,0.318182,0.333333,0.75,0.434783,2008-75-1181,2008-75-1155,2008-11.0-1181,2008-11.0-1155,4.0,24.0
94124,2009,82,1136,57,1254,55,H,0,20,45,...,0.315789,0.235294,0.733333,0.833333,2009-82-1136,2009-82-1254,2009-12.0-1136,2009-12.0-1254,327.0,251.0
54152,2007,33,1247,64,1410,63,N,0,23,52,...,0.266667,0.5,0.636364,0.777778,2007-33-1247,2007-33-1410,2007-5.0-1247,2007-5.0-1410,,
110331,2010,101,1305,67,1225,66,A,0,24,54,...,0.5,0.25,0.818182,0.8,2010-101-1305,2010-101-1225,2010-15.0-1305,2010-15.0-1225,81.0,141.0
54300,2007,36,1396,76,1444,71,H,0,27,58,...,0.368421,0.36,0.6,0.761905,2007-36-1396,2007-36-1444,2007-6.0-1396,2007-6.0-1444,94.0,132.0


In [38]:
# Getting ranking differential to use as a variable - negative values are underdog wins
results_df['rankdiff'] = results_df['lrank'] - results_df['wrank']

# Getting Score differential just in case
results_df['spread'] = results_df['wscore'] - results_df['lscore']

# Outcome column for if the lower teamID won, as that is the submission format for Kaggle
results_df['lowidwin'] = np.where((results_df['wteamid'] < results_df['lteamid']), 1, 0)

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,lftpercent,wseason-Day-Team,lseason-Day-Team,wseason-week-team,lseason-week-team,wrank,lrank,rankdiff,spread,lowidwin
134337,2013,121,1390,83,1143,70,A,0,24,49,...,0.833333,2013-121-1390,2013-121-1143,2013-18.0-1390,2013-18.0-1143,53.0,43.0,-10.0,13,0
4150,2018,143,1276,99,1401,72,N,0,39,63,...,0.5,2018-143-1276,2018-143-1401,2018-21.0-1276,2018-21.0-1401,,,,27,1
71693,2008,61,1177,70,1344,65,H,0,23,58,...,0.631579,2008-61-1177,2008-61-1344,2008-9.0-1177,2008-9.0-1344,122.0,45.0,-77.0,5,1
170137,2018,24,1314,102,1339,78,N,0,39,78,...,0.7,2018-24-1314,2018-24-1339,2018-4.0-1314,2018-4.0-1339,5.0,281.0,276.0,24,1
186911,2020,25,1361,83,1234,73,N,0,27,52,...,0.9,2020-25-1361,2020-25-1234,2020-4.0-1361,2020-4.0-1234,30.0,63.0,33.0,10,0


In [45]:
results_df.columns

Index(['season', 'daynum', 'wteamid', 'wscore', 'lteamid', 'lscore', 'wloc',
       'numot', 'wfgm', 'wfga', 'wfgm3', 'wfga3', 'wftm', 'wftaint', 'wor',
       'wdr', 'wast', 'wto', 'wstl', 'wblk', 'wpf', 'lfgm', 'lfga', 'lfgm3',
       'lfga3', 'lftm', 'lfta', 'lor', 'ldr', 'last', 'lto', 'lstl', 'lblk',
       'lpf', 'gametype', 'weeknum', 'wseason_week_team', 'lseason_week_team',
       'wordinalrank', 'lordinalrank', 'wfgpercent', 'lfgpercent',
       'wfg3percent', 'lfg3percent', 'wftpercent', 'lftpercent',
       'wseason-Day-Team', 'lseason-Day-Team', 'wseason-week-team',
       'lseason-week-team', 'wrank', 'lrank', 'rankdiff', 'spread', 'lowidwin',
       'wfgpct', 'lfgpct', 'wfg3pct', 'lfg3pct', 'wefgpct', 'lefgpct'],
      dtype='object')

In [47]:
# Getting shooting percentages

results_df['wfgpct'] = results_df['wfgm'] / results_df['wfga']
results_df['lfgpct'] = results_df['lfgm'] / results_df['lfga']

results_df['wfg3pct'] = results_df['wfgm3'] / results_df['wfga3']
results_df['lfg3pct'] = results_df['lfgm3'] / results_df['lfga3']

results_df['wefgpct'] = (results_df['wfgm'] + 0.5 * results_df['wfga3']) / results_df['wfga']
results_df['lefgpct'] = (results_df['lfgm'] + 0.5 * results_df['lfgm3']) / results_df['lfga']

results_df['wftpct'] = results_df['wftm'] / results_df['wftaint']
results_df['lftpct'] = results_df['lftm'] / results_df['lfta']

results_df.sample(5)

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,spread,lowidwin,wfgpct,lfgpct,wfg3pct,lfg3pct,wefgpct,lefgpct,wftpct,lftpct
110506,2010,103,1457,66,1342,53,H,0,22,57,...,13,0,0.385965,0.404255,0.272727,0.277778,0.482456,0.457447,0.791667,0.833333
8068,2003,100,1168,74,1415,64,A,0,23,45,...,10,1,0.511111,0.489796,0.466667,0.318182,0.677778,0.561224,0.913043,0.818182
52287,2006,130,1181,80,1274,76,N,0,25,61,...,4,1,0.409836,0.428571,0.375,0.428571,0.672131,0.47619,0.782609,0.761905
27632,2004,101,1113,74,1417,62,H,1,24,60,...,12,1,0.4,0.4,0.28,0.294118,0.608333,0.441667,0.95,0.529412
121327,2011,131,1103,66,1245,65,N,1,25,61,...,1,1,0.409836,0.333333,0.421053,0.368421,0.565574,0.384058,0.470588,0.521739


In [49]:
# getting differentials between teams for stats

results_df['fgpct_diff'] = results_df['wfgpct'] - results_df['lfgpct']
results_df['fg3pct_diff'] = results_df['wfg3pct'] - results_df['lfg3pct']
results_df['efgpct_diff'] = results_df['wefgpct'] - results_df['lefgpct']
results_df['ftpct_diff'] = results_df['wftpct'] - results_df['lftpct']
results_df['or_diff'] = results_df['wor'] - results_df['lor']
results_df['dr_diff'] = results_df['wdr'] - results_df['ldr']
results_df['ast_diff'] = results_df['wast'] - results_df['last']
results_df['to_diff'] = results_df['wto'] - results_df['lto']
results_df['stl_diff'] = results_df['wstl'] - results_df['lstl']
results_df['blk_diff'] = results_df['wblk'] - results_df['lblk']
results_df['pf_diff'] = results_df['wpf'] - results_df['lpf']

results_df.columns

Index(['season', 'daynum', 'wteamid', 'wscore', 'lteamid', 'lscore', 'wloc',
       'numot', 'wfgm', 'wfga', 'wfgm3', 'wfga3', 'wftm', 'wftaint', 'wor',
       'wdr', 'wast', 'wto', 'wstl', 'wblk', 'wpf', 'lfgm', 'lfga', 'lfgm3',
       'lfga3', 'lftm', 'lfta', 'lor', 'ldr', 'last', 'lto', 'lstl', 'lblk',
       'lpf', 'gametype', 'weeknum', 'wseason_week_team', 'lseason_week_team',
       'wordinalrank', 'lordinalrank', 'wfgpercent', 'lfgpercent',
       'wfg3percent', 'lfg3percent', 'wftpercent', 'lftpercent',
       'wseason-Day-Team', 'lseason-Day-Team', 'wseason-week-team',
       'lseason-week-team', 'wrank', 'lrank', 'rankdiff', 'spread', 'lowidwin',
       'wfgpct', 'lfgpct', 'wfg3pct', 'lfg3pct', 'wefgpct', 'lefgpct',
       'wftpct', 'lftpct', 'fgpct_diff', 'fg3pct_diff', 'efgpct_diff',
       'ftpct_diff', 'or_diff', 'dr_diff', 'ast_diff', 'to_diff', 'stl_diff',
       'blk_diff', 'pf_diff'],
      dtype='object')

In [50]:
# Drop NA's for a nice and pretty DF

clean_results_df = results_df.dropna()

clean_results_df.head()

Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,fg3pct_diff,efgpct_diff,ftpct_diff,or_diff,dr_diff,ast_diff,to_diff,stl_diff,blk_diff,pf_diff
4703,2003,38,1463,71,1148,59,N,0,27,53,...,-0.166667,0.216638,0.166667,-2,5,3,-2,-1,1,-4
5517,2003,36,1113,76,1305,63,H,0,25,57,...,0.031579,0.02556,0.32381,-4,4,2,-8,3,-1,-7
5518,2003,36,1116,72,1256,60,H,0,25,58,...,0.038363,0.16931,-0.117794,1,-2,4,-7,-1,0,-1
5519,2003,36,1130,85,1235,78,A,0,28,57,...,0.052632,0.224071,-0.05335,-10,0,-1,-1,3,-5,3
5520,2003,36,1139,75,1133,70,H,0,23,56,...,-0.478469,0.019133,0.235577,4,-1,-5,-4,3,0,-4


# Model Assembly and Training

In [51]:
# X = ranks, y = win?

y = clean_results_df['lowidwin']
X = clean_results_df[['rankdiff', 'efgpct_diff', 'ftpct_diff', 'or_diff', 'dr_diff',
       'ast_diff', 'to_diff', 'stl_diff', 'blk_diff', 'pf_diff']]

In [52]:
# Do TTS 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(140586, 10)

In [53]:
# Set up the Balanced Random Forest

from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=250, random_state=1)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=250, random_state=1)

In [54]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7754091714145488


In [55]:
# Get importances and features
importances = brf.feature_importances_
cols = X.columns

# Store in a DataFrame
feature_importances_df = pd.DataFrame({'feature':cols, 'importance': importances})
feature_importances_df

Unnamed: 0,feature,importance
0,rankdiff,0.108379
1,efgpct_diff,0.149076
2,ftpct_diff,0.148531
3,or_diff,0.091627
4,dr_diff,0.087276
5,ast_diff,0.089777
6,to_diff,0.084454
7,stl_diff,0.079778
8,blk_diff,0.078442
9,pf_diff,0.08266


In [65]:
test_df = clean_results_df.loc[clean_results_df['wteamid']== 1463][clean_results_df['daynum']== 37][clean_results_df['season']== 2003]
test_df

  """Entry point for launching an IPython kernel.


Unnamed: 0,season,daynum,wteamid,wscore,lteamid,lscore,wloc,numot,wfgm,wfga,...,fg3pct_diff,efgpct_diff,ftpct_diff,or_diff,dr_diff,ast_diff,to_diff,stl_diff,blk_diff,pf_diff
5564,2003,37,1463,70,1221,66,N,0,23,47,...,-0.047619,0.287766,-0.062201,-8,5,2,4,-2,3,0


In [69]:
probability = 1+(feature_importances_df.loc[feature_importances_df['importance']=='efgpct_diff']*test_df['efgpct_diff'])
probability

Unnamed: 0,feature,importance,5564
