In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Files
teams_path = 'Resources/MTeams.csv'
rankings_path = 'Resources/MMasseyOrdinals_thruDay128.csv'
results_path = 'Resources/MRegularSeasonCompactResults.csv'

In [3]:
# Teams raw DF
teams_df=pd.read_csv(teams_path)
print(teams_df.shape)
teams_df.head()

(372, 4)


Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2022
1,1102,Air Force,1985,2022
2,1103,Akron,1985,2022
3,1104,Alabama,1985,2022
4,1105,Alabama A&M,2000,2022


In [4]:
# Rankings raw DF
rankings_df=pd.read_csv(rankings_path)
print(rankings_df.shape)
rankings_df.head()

(4601850, 5)


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
0,2003,35,SEL,1102,159
1,2003,35,SEL,1103,229
2,2003,35,SEL,1104,12
3,2003,35,SEL,1105,314
4,2003,35,SEL,1106,260


In [5]:
# Checking ranking systems
rankings_df['SystemName'].value_counts()

SAG    123479
MOR    122489
POM    119998
DOK    106702
WLK     98479
        ...  
HRN       351
CRW       351
PMC       351
BP5       345
PH        326
Name: SystemName, Length: 187, dtype: int64

In [6]:
# We need to get just SAG ranks for simplicity
SAG_ranks = rankings_df.loc[rankings_df['SystemName']=='SAG']
SAG_ranks.head()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
3622,2003,37,SAG,1102,114
3623,2003,37,SAG,1103,193
3624,2003,37,SAG,1104,11
3625,2003,37,SAG,1105,310
3626,2003,37,SAG,1106,257


In [7]:
# This is good for now - but we'll need to look up the OrdinalRank by TeamID and 
# load that into Results for the DayNum (results) corresponding to RankingDayNum

In [8]:
# Results raw DF
raw_results_df=pd.read_csv(results_path)
print(raw_results_df.shape)
raw_results_df.head()

(176080, 8)


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [9]:
# We want to drop years before 2003, because rankings go to 2003
results_df = raw_results_df[raw_results_df['Season']>=2003]
results_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
74048,2003,10,1104,68,1328,62,N,0
74049,2003,10,1272,70,1393,63,N,0
74050,2003,11,1266,73,1437,61,N,0
74051,2003,11,1296,56,1457,50,N,0
74052,2003,11,1400,77,1208,71,N,0


In [10]:
# Make a new column in SAG_Ranks called 'Season_Day_Team' that we will repeat in Results 

SAG_ranks['Season-Day-Team'] = SAG_ranks['Season'].map(str) + '-' + SAG_ranks['RankingDayNum'].map(str) + '-' + SAG_ranks['TeamID'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank,Season-Day-Team
3622,2003,37,SAG,1102,114,2003-37-1102
3623,2003,37,SAG,1103,193,2003-37-1103
3624,2003,37,SAG,1104,11,2003-37-1104
3625,2003,37,SAG,1105,310,2003-37-1105
3626,2003,37,SAG,1106,257,2003-37-1106


In [11]:
# Make new columns in SAG_Ranks called 'RankingWeek' and 'Season_Week_Team' because rankings only come out weekly

SAG_ranks['RankingWeek'] = (SAG_ranks['RankingDayNum']/7).round(decimals=0)

SAG_ranks['Season-Week-Team'] = SAG_ranks['Season'].map(str) + '-' + SAG_ranks['RankingWeek'].map(str) + '-' + SAG_ranks['TeamID'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank,Season-Day-Team,RankingWeek,Season-Week-Team
3622,2003,37,SAG,1102,114,2003-37-1102,5.0,2003-5.0-1102
3623,2003,37,SAG,1103,193,2003-37-1103,5.0,2003-5.0-1103
3624,2003,37,SAG,1104,11,2003-37-1104,5.0,2003-5.0-1104
3625,2003,37,SAG,1105,310,2003-37-1105,5.0,2003-5.0-1105
3626,2003,37,SAG,1106,257,2003-37-1106,5.0,2003-5.0-1106


In [12]:
# Make a new column in Results called 'WSeason_Day_Team' & 'LSeason_Day_Team' that we'll use for potential indices

results_df['WSeason-Day-Team'] = results_df['Season'].map(str) + '-' + results_df['DayNum'].map(str) + '-' + results_df['WTeamID'].map(str)
results_df['LSeason-Day-Team'] = results_df['Season'].map(str) + '-' + results_df['DayNum'].map(str) + '-' + results_df['LTeamID'].map(str)
results_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeason-Day-Team,LSeason-Day-Team
74048,2003,10,1104,68,1328,62,N,0,2003-10-1104,2003-10-1328
74049,2003,10,1272,70,1393,63,N,0,2003-10-1272,2003-10-1393
74050,2003,11,1266,73,1437,61,N,0,2003-11-1266,2003-11-1437
74051,2003,11,1296,56,1457,50,N,0,2003-11-1296,2003-11-1457
74052,2003,11,1400,77,1208,71,N,0,2003-11-1400,2003-11-1208


In [13]:
# Make a new column in Results called 'WSeason_Week_Team' & 'LSeason_Week_Team' that we'll use for loc to pull in OrdinalRank for winning and losing team

results_df['WeekNum'] = (results_df['DayNum']/7).round(decimals=0)

results_df['WSeason-Week-Team'] = results_df['Season'].map(str) + '-' + results_df['WeekNum'].map(str) + '-' + results_df['WTeamID'].map(str)
results_df['LSeason-Week-Team'] = results_df['Season'].map(str) + '-' + results_df['WeekNum'].map(str) + '-' + results_df['LTeamID'].map(str)
results_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team
74048,2003,10,1104,68,1328,62,N,0,2003-10-1104,2003-10-1328,1.0,2003-1.0-1104,2003-1.0-1328
74049,2003,10,1272,70,1393,63,N,0,2003-10-1272,2003-10-1393,1.0,2003-1.0-1272,2003-1.0-1393
74050,2003,11,1266,73,1437,61,N,0,2003-11-1266,2003-11-1437,2.0,2003-2.0-1266,2003-2.0-1437
74051,2003,11,1296,56,1457,50,N,0,2003-11-1296,2003-11-1457,2.0,2003-2.0-1296,2003-2.0-1457
74052,2003,11,1400,77,1208,71,N,0,2003-11-1400,2003-11-1208,2.0,2003-2.0-1400,2003-2.0-1208


In [14]:
# Get winning and losing team rank for that day in our results df

# # Create empty columns
# results_df['WRank'] = "" 
# results_df['LRank'] = ""
# results_df.head()

In [15]:
# Merging winning team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['Season-Week-Team','OrdinalRank']], how='left', left_on='WSeason-Week-Team', right_on='Season-Week-Team').drop(columns=['Season-Week-Team'])

results_df.sample(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,OrdinalRank
3390,2003,101,1362,72,1339,55,A,0,2003-101-1362,2003-101-1339,14.0,2003-14.0-1362,2003-14.0-1339,161.0
89521,2020,41,1275,79,1290,67,H,0,2020-41-1275,2020-41-1290,6.0,2020-6.0-1275,2020-6.0-1290,231.0
23722,2007,132,1326,66,1458,49,N,0,2007-132-1326,2007-132-1458,19.0,2007-19.0-1326,2007-19.0-1458,2.0
25331,2008,45,1388,64,1408,57,N,0,2008-45-1388,2008-45-1408,6.0,2008-6.0-1388,2008-6.0-1408,27.0
5783,2004,49,1131,64,1156,62,H,0,2004-49-1131,2004-49-1156,7.0,2004-7.0-1131,2004-7.0-1156,94.0
29005,2009,12,1301,65,1309,59,A,0,2009-12-1301,2009-12-1309,2.0,2009-2.0-1301,2009-2.0-1309,59.0
47565,2012,72,1372,67,1146,56,A,0,2012-72-1372,2012-72-1146,10.0,2012-10.0-1372,2012-10.0-1146,238.0
34786,2010,23,1135,70,1136,68,A,0,2010-23-1135,2010-23-1136,3.0,2010-3.0-1135,2010-3.0-1136,215.0
99638,2022,75,1329,61,1124,54,A,0,2022-75-1329,2022-75-1124,11.0,2022-11.0-1329,2022-11.0-1124,45.0
22947,2007,112,1204,81,1154,62,H,0,2007-112-1204,2007-112-1154,16.0,2007-16.0-1204,2007-16.0-1154,229.0


In [25]:
# Rename OrdinalRank to WRank
results_df.rename(columns={'OrdinalRank': 'WRank'},inplace=True)
results_df.sample(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,WRank
11917,2005,86,1453,60,1178,54,A,0,2005-86-1453,2005-86-1178,12.0,2005-12.0-1453,2005-12.0-1178,132.0
100164,2022,86,1261,70,1401,64,H,0,2022-86-1261,2022-86-1401,12.0,2022-12.0-1261,2022-12.0-1401,11.0
9542,2005,24,1330,67,1395,62,A,0,2005-24-1330,2005-24-1395,3.0,2005-3.0-1330,2005-3.0-1395,95.0
58982,2014,89,1140,84,1388,71,H,0,2014-89-1140,2014-89-1388,13.0,2014-13.0-1140,2014-13.0-1388,45.0
67878,2016,47,1182,72,1352,65,H,0,2016-47-1182,2016-47-1352,7.0,2016-7.0-1182,2016-7.0-1352,125.0
10439,2005,50,1258,64,1159,54,A,0,2005-50-1258,2005-50-1159,7.0,2005-7.0-1258,2005-7.0-1159,132.0
48651,2012,96,1161,67,1102,49,H,0,2012-96-1161,2012-96-1102,14.0,2012-14.0-1161,2012-14.0-1102,92.0
60618,2014,124,1138,88,1132,65,H,0,2014-124-1138,2014-124-1132,18.0,2014-18.0-1138,2014-18.0-1132,100.0
97150,2022,8,1272,89,1399,65,H,0,2022-8-1272,2022-8-1399,1.0,2022-1.0-1272,2022-1.0-1399,
56627,2014,26,1367,73,1398,64,H,0,2014-26-1367,2014-26-1398,4.0,2014-4.0-1367,2014-4.0-1398,105.0


In [26]:
# Merging losing team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['Season-Week-Team','OrdinalRank']], how='left', left_on='LSeason-Week-Team', right_on='Season-Week-Team').drop(columns=['Season-Week-Team'])

results_df.sample(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,WRank,OrdinalRank
77897,2018,11,1227,65,1300,55,H,0,2018-11-1227,2018-11-1300,2.0,2018-2.0-1227,2018-2.0-1300,206.0,285.0
984,2003,43,1210,88,1407,66,H,0,2003-43-1210,2003-43-1407,6.0,2003-6.0-1210,2003-6.0-1407,61.0,130.0
69038,2016,57,1323,73,1251,56,H,0,2016-57-1323,2016-57-1251,8.0,2016-8.0-1323,2016-8.0-1251,32.0,345.0
1654,2003,61,1334,66,1429,54,H,0,2003-61-1334,2003-61-1429,9.0,2003-9.0-1334,2003-9.0-1429,104.0,75.0
26591,2008,75,1307,59,1102,44,H,0,2008-75-1307,2008-75-1102,11.0,2008-11.0-1307,2008-11.0-1102,53.0,182.0
19693,2007,30,1399,86,1133,84,H,0,2007-30-1399,2007-30-1133,4.0,2007-4.0-1399,2007-4.0-1133,,
1778,2003,65,1173,76,1247,72,H,0,2003-65-1173,2003-65-1247,9.0,2003-9.0-1173,2003-9.0-1247,44.0,135.0
84824,2019,40,1243,71,1209,59,H,0,2019-40-1243,2019-40-1209,6.0,2019-6.0-1243,2019-6.0-1209,32.0,130.0
101993,2022,105,1158,79,1180,75,H,0,2022-105-1158,2022-105-1180,15.0,2022-15.0-1158,2022-15.0-1180,172.0,154.0
21841,2007,86,1273,68,1366,64,H,0,2007-86-1273,2007-86-1366,12.0,2007-12.0-1273,2007-12.0-1366,272.0,291.0


In [27]:
# Rename OrdinalRank to LRank
results_df.rename(columns={'OrdinalRank': 'LRank'},inplace=True)
results_df.sample(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,WRank,LRank
8298,2004,110,1133,76,1296,67,H,0,2004-110-1133,2004-110-1296,16.0,2004-16.0-1133,2004-16.0-1296,161.0,254.0
38463,2010,107,1139,73,1227,55,H,0,2010-107-1139,2010-107-1227,15.0,2010-15.0-1139,2010-15.0-1227,26.0,260.0
45772,2012,15,1292,86,1417,66,A,0,2012-15-1292,2012-15-1417,2.0,2012-2.0-1292,2012-2.0-1417,90.0,47.0
100977,2022,86,1255,79,1299,71,H,0,2022-86-1255,2022-86-1299,12.0,2022-12.0-1255,2022-12.0-1299,207.0,270.0
44747,2011,131,1246,72,1104,58,N,0,2011-131-1246,2011-131-1104,19.0,2011-19.0-1246,2011-19.0-1104,12.0,60.0
102972,2022,124,1308,62,1430,46,H,0,2022-124-1308,2022-124-1430,18.0,2022-18.0-1308,2022-18.0-1430,89.0,133.0
29518,2009,21,1129,74,1286,61,H,0,2009-21-1129,2009-21-1286,3.0,2009-3.0-1129,2009-3.0-1286,140.0,210.0
3776,2003,110,1241,57,1206,52,A,0,2003-110-1241,2003-110-1206,16.0,2003-16.0-1241,2003-16.0-1206,224.0,140.0
83213,2018,130,1285,91,1294,89,N,1,2018-130-1285,2018-130-1294,19.0,2018-19.0-1285,2018-19.0-1294,98.0,146.0
100009,2022,65,1283,71,1133,69,A,0,2022-65-1283,2022-65-1133,9.0,2022-9.0-1283,2022-9.0-1133,93.0,134.0


In [29]:
# Getting ranking differential to use as a variable - negative values are underdog wins
results_df['RankDiff'] = results_df['WRank'] - results_df['LRank']

# Getting Score differential just in case
results_df['Spread'] = results_df['WScore'] - results_df['LScore']

results_df.sample(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,WRank,LRank,RankDiff,Spread
28150,2008,110,1241,89,1287,57,H,0,2008-110-1241,2008-110-1287,16.0,2008-16.0-1241,2008-16.0-1287,206.0,258.0,-52.0,32
89979,2020,29,1435,90,1138,76,H,0,2020-29-1435,2020-29-1138,4.0,2020-4.0-1435,2020-4.0-1138,125.0,102.0,23.0,14
66500,2015,117,1257,81,1199,59,A,0,2015-117-1257,2015-117-1199,17.0,2015-17.0-1257,2015-17.0-1199,12.0,97.0,-85.0,22
26293,2008,68,1287,81,1404,79,A,0,2008-68-1287,2008-68-1404,10.0,2008-10.0-1287,2008-10.0-1404,272.0,269.0,3.0,2
56164,2013,124,1395,70,1328,67,H,0,2013-124-1395,2013-124-1328,18.0,2013-18.0-1395,2013-18.0-1328,231.0,40.0,191.0,3
7488,2004,89,1440,68,1457,61,H,0,2004-89-1440,2004-89-1457,13.0,2004-13.0-1440,2004-13.0-1457,313.0,225.0,88.0,7
96691,2021,100,1372,80,1394,68,H,0,2021-100-1372,2021-100-1394,14.0,2021-14.0-1372,2021-14.0-1394,153.0,342.0,-189.0,12
60727,2014,108,1377,87,1303,86,H,0,2014-108-1377,2014-108-1303,15.0,2014-15.0-1377,2014-15.0-1303,216.0,150.0,66.0,1
92678,2020,96,1423,70,1456,64,H,0,2020-96-1423,2020-96-1456,14.0,2020-14.0-1423,2020-14.0-1456,320.0,209.0,111.0,6
76796,2017,108,1170,67,1285,65,A,0,2017-108-1170,2017-108-1285,15.0,2017-15.0-1170,2017-15.0-1285,304.0,185.0,119.0,2


In [37]:
# Drop NA's

clean_results_df = results_df.dropna()

clean_results_df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,WRank,LRank,RankDiff,Spread
632,2003,32,1110,73,1347,49,H,0,2003-32-1110,2003-32-1347,5.0,2003-5.0-1110,2003-5.0-1347,204.0,300.0,-96.0,24
633,2003,32,1127,74,1176,69,N,0,2003-32-1127,2003-32-1176,5.0,2003-5.0-1127,2003-5.0-1176,225.0,212.0,13.0,5
634,2003,32,1156,81,1271,61,H,0,2003-32-1156,2003-32-1271,5.0,2003-5.0-1156,2003-5.0-1271,189.0,318.0,-129.0,20
635,2003,32,1159,87,1306,68,H,0,2003-32-1159,2003-32-1306,5.0,2003-5.0-1159,2003-5.0-1306,289.0,316.0,-27.0,19
636,2003,32,1162,58,1119,44,H,0,2003-32-1162,2003-32-1119,5.0,2003-5.0-1162,2003-5.0-1119,321.0,323.0,-2.0,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103178,2022,129,1435,82,1104,76,N,0,2022-129-1435,2022-129-1104,18.0,2022-18.0-1435,2022-18.0-1104,77.0,23.0,54.0,6
103179,2022,129,1437,66,1385,65,N,0,2022-129-1437,2022-129-1385,18.0,2022-18.0-1437,2022-18.0-1385,10.0,53.0,-43.0,1
103180,2022,129,1439,87,1323,80,N,0,2022-129-1439,2022-129-1323,18.0,2022-18.0-1439,2022-18.0-1323,36.0,49.0,-13.0,7
103181,2022,129,1451,68,1285,56,N,0,2022-129-1451,2022-129-1285,18.0,2022-18.0-1451,2022-18.0-1285,171.0,226.0,-55.0,12


In [19]:
# We'll take that DF and create a winner column
# DF columns = Day/WR/LR/Win?

In [20]:
# X = ranks, y = win?
# Do TTS 

In [21]:
# Train

In [22]:
# Predict