In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Files
teams_path = 'Resources/MTeams.csv'
rankings_path = 'Resources/MMasseyOrdinals_thruDay128.csv'
results_path = 'Resources/MRegularSeasonCompactResults.csv'

In [3]:
# Teams raw DF
teams_df=pd.read_csv(teams_path)
print(teams_df.shape)
teams_df.head()

(372, 4)


Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2022
1,1102,Air Force,1985,2022
2,1103,Akron,1985,2022
3,1104,Alabama,1985,2022
4,1105,Alabama A&M,2000,2022


In [4]:
# Rankings raw DF
rankings_df=pd.read_csv(rankings_path)
print(rankings_df.shape)
rankings_df.head()

(4601850, 5)


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
0,2003,35,SEL,1102,159
1,2003,35,SEL,1103,229
2,2003,35,SEL,1104,12
3,2003,35,SEL,1105,314
4,2003,35,SEL,1106,260


In [5]:
# Checking ranking systems
rankings_df['SystemName'].value_counts()

SAG    123479
MOR    122489
POM    119998
DOK    106702
WLK     98479
        ...  
HRN       351
CRW       351
PMC       351
BP5       345
PH        326
Name: SystemName, Length: 187, dtype: int64

In [6]:
# We need to get just SAG ranks for simplicity
SAG_ranks = rankings_df.loc[rankings_df['SystemName']=='SAG']
SAG_ranks.head()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
3622,2003,37,SAG,1102,114
3623,2003,37,SAG,1103,193
3624,2003,37,SAG,1104,11
3625,2003,37,SAG,1105,310
3626,2003,37,SAG,1106,257


In [7]:
# This is good for now - but we'll need to look up the OrdinalRank by TeamID and 
# load that into Results for the DayNum (results) corresponding to RankingDayNum

In [8]:
# Results raw DF
raw_results_df=pd.read_csv(results_path)
print(raw_results_df.shape)
raw_results_df.head()

(176080, 8)


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [9]:
# We want to drop years before 2003, because rankings go to 2003
results_df = raw_results_df[raw_results_df['Season']>=2003]
results_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
74048,2003,10,1104,68,1328,62,N,0
74049,2003,10,1272,70,1393,63,N,0
74050,2003,11,1266,73,1437,61,N,0
74051,2003,11,1296,56,1457,50,N,0
74052,2003,11,1400,77,1208,71,N,0


In [11]:
# Make a new column in SAG_Ranks called 'Season_Day_Team' that we will repeat in Results 

SAG_ranks['Season-Day-Team'] = SAG_ranks['Season'].map(str) + '-' + SAG_ranks['RankingDayNum'].map(str) + '-' + SAG_ranks['TeamID'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank,Season-Day-Team
3622,2003,37,SAG,1102,114,2003-37-1102
3623,2003,37,SAG,1103,193,2003-37-1103
3624,2003,37,SAG,1104,11,2003-37-1104
3625,2003,37,SAG,1105,310,2003-37-1105
3626,2003,37,SAG,1106,257,2003-37-1106


In [34]:
# Making two copies of SAG_ranks to use for map functions later on

ranks_index = SAG_ranks.set_index('Season-Day-Team')
ranks_index

Unnamed: 0_level_0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
Season-Day-Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2003-37-1102,2003,37,SAG,1102,114
2003-37-1103,2003,37,SAG,1103,193
2003-37-1104,2003,37,SAG,1104,11
2003-37-1105,2003,37,SAG,1105,310
2003-37-1106,2003,37,SAG,1106,257
...,...,...,...,...,...
2022-128-1468,2022,128,SAG,1468,203
2022-128-1469,2022,128,SAG,1469,294
2022-128-1470,2022,128,SAG,1470,227
2022-128-1471,2022,128,SAG,1471,256


In [24]:
# Make a new column in Results called 'WSeason_Day_Team' & 'LSeason_Day_Team' that we'll use for loc to pull in OrdinalRank for winning and losing team

results_df['WSeason-Day-Team'] = results_df['Season'].map(str) + '_' + results_df['DayNum'].map(str) + '_' + results_df['WTeamID'].map(str)
results_df['LSeason-Day-Team'] = results_df['Season'].map(str) + '_' + results_df['DayNum'].map(str) + '_' + results_df['LTeamID'].map(str)
results_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeason-Day-Team,LSeason-Day-Team
74048,2003,10,1104,68,1328,62,N,0,2003_10_1104,2003_10_1328
74049,2003,10,1272,70,1393,63,N,0,2003_10_1272,2003_10_1393
74050,2003,11,1266,73,1437,61,N,0,2003_11_1266,2003_11_1437
74051,2003,11,1296,56,1457,50,N,0,2003_11_1296,2003_11_1457
74052,2003,11,1400,77,1208,71,N,0,2003_11_1400,2003_11_1208


In [32]:
# Get winning and losing team rank for that day in our results df

# Create empty columns
results_df['WRank'] = "" 
results_df['LRank'] = ""
results_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeason-Day-Team,LSeason-Day-Team,WRank,LRank
74048,2003,10,1104,68,1328,62,N,0,2003_10_1104,2003_10_1328,,
74049,2003,10,1272,70,1393,63,N,0,2003_10_1272,2003_10_1393,,
74050,2003,11,1266,73,1437,61,N,0,2003_11_1266,2003_11_1437,,
74051,2003,11,1296,56,1457,50,N,0,2003_11_1296,2003_11_1457,,
74052,2003,11,1400,77,1208,71,N,0,2003_11_1400,2003_11_1208,,


In [43]:
# Now make a w_results with that season-day-team as index to map to the ranks_index DF
# trying this method: https://towardsdatascience.com/vlookup-implementation-in-python-in-three-simple-steps-93b5a290fd72
# SHANE NOTE: I am stuck here - will work on it this week

w_results = results_df.set_index('WSeason-Day-Team')
w_results['WRank'] = w_results.index.map(ranks_index['OrdinalRank'])
w_results.sample(20)

Unnamed: 0_level_0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,LSeason-Day-Team,WRank,LRank
WSeason-Day-Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013_115_1447,2013,115,1447,84,1192,66,A,0,2013_115_1192,,
2013_42_1356,2013,42,1356,74,1309,61,H,0,2013_42_1309,,
2011_25_1166,2011,25,1166,75,1244,57,H,0,2011_25_1244,,
2020_36_1320,2020,36,1320,79,1160,76,A,0,2020_36_1160,,
2012_13_1133,2012,13,1133,68,1282,58,H,0,2012_13_1282,,
2005_118_1437,2005,118,1437,67,1207,56,A,0,2005_118_1207,,
2009_115_1201,2009,115,1201,68,1305,66,H,0,2009_115_1305,,
2007_43_1278,2007,43,1278,74,1416,63,H,0,2007_43_1416,,
2012_96_1163,2012,96,1163,69,1371,46,H,0,2012_96_1371,,
2013_26_1150,2013,26,1150,76,1187,64,H,0,2013_26_1187,,


In [35]:
# # Rename OrdinalRank to WRank
# results_df.rename(columns={'OrdinalRank': 'WRank'},inplace=True)
# results_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeason-Day-Team,LSeason-Day-Team,WRank,LRank
74048,2003,10,1104,68,1328,62,N,0,2003_10_1104,2003_10_1328,,
74049,2003,10,1272,70,1393,63,N,0,2003_10_1272,2003_10_1393,,
74050,2003,11,1266,73,1437,61,N,0,2003_11_1266,2003_11_1437,,
74051,2003,11,1296,56,1457,50,N,0,2003_11_1296,2003_11_1457,,
74052,2003,11,1400,77,1208,71,N,0,2003_11_1400,2003_11_1208,,


In [15]:
# We need to find winner rank, loser rank and probably day number then load into a new DF

In [16]:
# We'll take that DF and create a winner column
# DF columns = Day/WR/LR/Win?

In [17]:
# X = ranks, y = win?
# Do TTS 

In [18]:
# Train

In [19]:
# Predict