In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Approach 1: Top Down (Average Team Stats and attach to the Match Data) 

## Load in and Prepare the Player Data

In [2]:
#Load in the player data
df_players = pd.read_csv('../data/PlayerData_v1.csv',index_col=0)
df_players

Unnamed: 0,Player,Position,T,Ti50,PA,DHPA,S,CDOOO,CDL,CDL%,...,SI,SL,CC,SC,HTA,HTW%,HTA%,RC,lPlayer,Team
0,Joel Corey,MID,12,0,24,11,0,0,0,0.0,...,3,1,2,2,0,0.0,0.0,0,joel corey,Geelong
1,Michael Barlow,MID,11,3,27,14,2,0,0,0.0,...,4,0,1,1,0,0.0,0.0,0,michael barlow,Fremantle
2,Jimmy Bartel,MID,8,1,17,9,1,0,0,0.0,...,4,0,1,4,0,0.0,0.0,0,jimmy bartel,Geelong
3,Paul Chapman,FWD,8,3,20,5,2,0,0,0.0,...,3,1,1,3,0,0.0,0.0,0,paul chapman,Geelong
4,Chris Mayne,FWD,8,4,19,8,0,0,0,0.0,...,2,1,0,0,0,0.0,0.0,0,chris mayne,Fremantle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111116,Bobby Hill,FWD,1,0,17,1,0,0,0,0.0,...,5,1,1,1,0,0.0,0.0,0,bobby hill,Greater Western Sydney
111117,Bobby Hill,FWD,4,1,13,3,1,0,0,0.0,...,2,1,0,0,0,0.0,0.0,0,bobby hill,Greater Western Sydney
111118,Bobby Hill,FWD,3,1,22,11,0,0,0,0.0,...,5,0,0,0,0,0.0,0.0,0,bobby hill,Greater Western Sydney
111119,Bobby Hill,FWD,4,0,17,9,1,0,0,0.0,...,4,0,0,0,0,0.0,0.0,0,bobby hill,Greater Western Sydney


In [3]:
#Remove the 'lPlayer' column and keep only rows with ToG% > 0 from seasons before 2024
#errors='ignore' keeps this cell re-runnable once the column has already been dropped
df_players = df_players.drop(columns='lPlayer',errors='ignore')
#.copy() makes the filtered frame independent, so later column assignments
#do not act on a slice of the raw data
df_players = df_players[(df_players['ToG%']>0)&(df_players['Season']<2024)].copy()

In [None]:
df_players

In [None]:
#Remove plural 's' from 'Round' Column
df_players.loc[:,'Round'] = df_players['Round'].str.replace(r's$','',regex=True) 

In [None]:
df_players['Round'].unique()

## Load in and Prepare the Match Data

In [None]:
#Read the match-level data, using the first CSV column as the index
matches_path = '../data/MatchData_v2.csv'
df_matches = pd.read_csv(matches_path,index_col=0)
#'City' is not used downstream
df_matches.drop(columns='City',inplace=True)
#Convert the weather fields to numeric; unparseable entries become NaN
weather_cols = [
    'Max Temp (C)','Min Temp (C)','Max Humid (%)','Min Humid (%)',
    'Wind Speed (m/s)','Solar Rad (MJ/sq m)','Rain (mm)',
]
df_matches[weather_cols] = df_matches[weather_cols].apply(pd.to_numeric,errors='coerce')

In [None]:
#Keep only pre 2024
#.copy() so the in-place relabelling of 'Round' further down acts on an
#independent frame rather than a slice of the loaded data
df_matches = df_matches[df_matches['Season']<2024].copy()

In [None]:
df_matches['Round'].unique()

In [None]:
#Commonise the 'Round' column: both first-week final types share one label
first_week_finals = df_matches['Round'].isin(['Qualifying Final','Elimination Final'])
df_matches.loc[first_week_finals,'Round'] = 'Finals Week 1'

## Aggregate player data and join onto the Match Data

In [None]:
#Collect the numeric player columns, then add the two non-numeric grouping keys
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num_cols = df_players.select_dtypes(include=numerics).columns.to_list()
num_cols += ['Round','Team']

In [None]:
#Keep only numeric and aggregation variables
df_play_num = df_players[num_cols]

## NOTE: May have to approach some values (such as num goals) with sum instead of mean

In [None]:
df_play_agg = df_play_num.groupby(['Season','Round','Team']).agg('mean').reset_index()

### Join to match data

In [None]:
#Join on team data: attach the per-(Season, Round, Team) player averages to each match row.
#how='left' keeps every match row even when no aggregated player data matches it.
df_matches_v2 = df_matches.merge(df_play_agg,on=['Season','Round','Team'],how='left')
#Join on opponent data: the same aggregates again, this time matched on the match row's
#'Opponent'. Columns that now exist on both sides are told apart by the
#_team (left) and _opp (right) suffixes.
df_matches_v2 = df_matches_v2.merge(df_play_agg,left_on=['Season','Round','Opponent'],right_on=['Season','Round','Team'],\
                                 how='left',suffixes=('_team','_opp'))
#Rename Team_team to Team and drop Team_opp
#('Team' itself is suffixed because it is present in both frames of the second merge)
df_matches_v2.rename(columns={'Team_team':'Team'},inplace=True)
df_matches_v2.drop('Team_opp',axis=1,inplace=True)

In [None]:
#Rename Round for ease of sorting
#Finals are numbered after the highest value mapped for the numbered rounds
round_dict = {'Finals Week 1':25,'Semi Final':26,'Preliminary Final':27,
              'Grand Final':28}
#Define function to change values based on dict
def rename_round(x):
    """Convert a round label into an integer that sorts chronologically.

    Parameters
    ----------
    x : str
        A round label, either one of the finals names in ``round_dict``
        or a label containing the round number (e.g. ``'Round 7'``).

    Returns
    -------
    int
        The mapped finals number, or the first run of digits in the label.

    Raises
    ------
    ValueError
        If the label is not a known final and contains no digits.
    """
    #Finals are looked up first so 'Finals Week 1' is not parsed as round 1
    if x in round_dict:
        return round_dict[x]
    match = re.search(r'(\d+)',str(x))
    if match is None:
        #Fail with a readable message instead of subscripting None
        raise ValueError('Unrecognised round label: {!r}'.format(x))
    return int(match[0])

In [None]:
df_matches_v2

In [None]:
#Apply changes to round names
df_matches_v2['RoundNum'] = df_matches_v2['Round'].apply(rename_round) 

In [None]:
#Sort Match data by Team, Season, Round
with pd.option_context('display.max_row',None):
    print(df_matches_v2.sort_values(['Team','Season','RoundNum'])[['Team','Season','Round']])

In [None]:
df_matches_v2 = df_matches_v2.sort_values(['Team','Season','RoundNum'])

## Extract all columns with _team or _opp, plus attendance, and lag them by 1 to 3 matches

In [None]:
#Gather the columns to lag: everything matching the _team/_opp pattern, plus Attendance
suffixed_stats = df_matches_v2.filter(regex=(".+_opp|.+_team"))
df_team_stats = pd.concat([suffixed_stats,df_matches_v2['Attendance']],axis=1)

In [None]:
#Try the last 3 matches as predictors
num_lags = 3
#Lag within each team. The frame is sorted by Team/Season/RoundNum, so a plain
#shift over the whole frame would give the first matches of one team the last
#matches of the team sorted before it.
team_key = df_matches_v2['Team']
lagged_frames = []
for lag in range(1,num_lags+1):
    #Lag the stats by `lag` matches inside each team's own history
    lagged = df_team_stats.groupby(team_key).shift(lag)
    #Define new column names with lag number as suffix
    lagged.columns = [col+str(lag) for col in lagged.columns]
    lagged_frames.append(lagged)
#Concatenate all lags with the match data in one pass (rows align on the shared index)
df_matches_v2 = pd.concat([df_matches_v2]+lagged_frames,axis=1)

In [None]:
#Present-match stats end exactly in _team/_opp (the lagged copies end in a digit);
#Attendance for the present match is removed with them
present_cols = [*df_matches_v2.filter(regex=(".+_opp$|.+_team$")).columns,'Attendance']
#Drop from the data
df_matches_v2.drop(columns=present_cols,inplace=True)

In [None]:
#Remove 2012 Round 1-3 to remove NAN values
#(presumably the earliest matches in the data, with no prior matches to lag from - confirm)
early_2012 = (df_matches_v2['Season']==2012)&(df_matches_v2['Round'].isin(['Round 1','Round 2','Round 3']))
#.copy() so the imputation below assigns into an independent frame, not a slice
df_matches_v3 = df_matches_v2[~early_2012].copy()

## Check and impute missing values

In [None]:
with pd.option_context('display.max_row',None):
    print(df_matches_v3.isna().sum())

We have seen previously that missing attendance is associated with the COVID period, so we will set the missing attendance values to 0. For missing weather information, we will assume 0 rainfall and impute the other weather variables with their average values.

In [None]:
#Missing rainfall and missing lagged attendance are set to zero
zero_fill_cols = ['Rain (mm)','Attendance1','Attendance2','Attendance3']
df_matches_v3.loc[:,zero_fill_cols] = df_matches_v3[zero_fill_cols].fillna(0.0)
#The remaining weather fields take their own column mean
impute_cols = ['Max Temp (C)','Min Temp (C)','Max Humid (%)','Min Humid (%)','Wind Speed (m/s)','Solar Rad (MJ/sq m)']
col_means = df_matches_v3[impute_cols].mean()
df_matches_v3.loc[:,impute_cols] = df_matches_v3[impute_cols].fillna(col_means)

### Final Analysis Version Data: 

In [None]:
df_matches_v3

### Remove Non Predictors

In [None]:
#Remover the time based data and points for and against at present match
#Remove Season later after separating into IT/OOT
df_ML = df_matches_v3.drop(['PointsF','PointsA','Round','Date','RoundNum'],axis=1)

## Pre Encoding Preprocessing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def split_preprocess(data,cutoff_season=2019,test_size=0.3,random_state=5432):
    """Split match data into in-time / out-of-time sets and a train/test split.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Season' column and the target column 'Outcome'.
    cutoff_season : int, default 2019
        Last season counted as in-time; later seasons are out-of-time.
    test_size : float, default 0.3
        Share of the in-time rows held out as the test set.
    random_state : int, default 5432
        Seed for the shuffled, stratified split.

    Returns
    -------
    data_IT : pandas.DataFrame
        In-time rows with 'Outcome' removed.
    data_OOT : pandas.DataFrame
        Out-of-time rows; 'Outcome' is still present.
    X_train, X_test, y_train, y_test
        Stratified split of the in-time rows and their outcomes.
    """
    #Copies keep the pop below (and any later in-place edits) away from `data`
    data_IT = data[data['Season']<=cutoff_season].copy()
    data_OOT = data[data['Season']>cutoff_season].copy()
    #Get the dependent variable (removes it from data_IT)
    y = data_IT.pop('Outcome')
    #Split the in time data, stratify by outcome and shuffle
    X_train, X_test, y_train, y_test = train_test_split(
        data_IT,y,test_size=test_size,shuffle=True,random_state=random_state,stratify=y)
    #Return the split data
    return data_IT, data_OOT, X_train, X_test, y_train, y_test

### Encode Categorical Data

In [None]:
#Find the object-typed columns and one-hot encode them as categories
cat_cols = df_ML.select_dtypes(include=['object']).columns
df_categories = pd.get_dummies(df_ML[cat_cols].astype('category'),dtype=int)
#Swap the raw categorical columns for their indicator columns
df_ML.drop(cat_cols,axis=1,inplace=True)
df_ML = df_ML.join(df_categories)

### Separate into in Time and OOT

In [None]:
df_ML_IT, df_ML_OOT, X_train, X_test, y_train, y_test = split_preprocess(df_ML)
#Season has done its job (the in-time / out-of-time split); remove it everywhere
df_ML_IT = df_ML_IT.drop(columns='Season')
df_ML_OOT = df_ML_OOT.drop(columns='Season')
for split in (X_train,X_test):
    split.drop(columns='Season',inplace=True)

In [None]:
#Extract Feature names for diagnostics
feature_names = df_ML_IT.columns.to_list()

## Variable Importance

Note that variable importance here is specific to the model we produce with RF. As such, if the model is bad, the feature selection may be useless. Only consider these results in parallel with the model performance and with other measures such as information values (IVs).

In [None]:
#Import random forest for variable importance
from sklearn.ensemble import RandomForestClassifier
#Set up model
forest = RandomForestClassifier(random_state=5432)
#Fit the model
forest.fit(X_train,y_train)

#### Check performance on test data

In [None]:
y_pred = forest.predict(X_test)

confusion_matrix = pd.crosstab(y_test,y_pred,rownames=['Actual'],colnames=['Pred'])

In [None]:
confusion_matrix

**Definitions Reminder:**

- Precision: %Predicted wins that were actually wins.
- Recall: %Wins that were predicted

We want to be sure the predicted wins are correct, precision should be our target metric.

In [None]:
#DataFrame.sum() returns one total per column, so reduce the underlying array
#instead to get the overall count for the accuracy denominator
cm_values = confusion_matrix.to_numpy()
acc = np.trace(cm_values)/cm_values.sum()
#Label 1 is treated as the positive class: column 1 = predicted, row 1 = actual
prec = confusion_matrix.loc[1,1]/confusion_matrix.loc[:,1].sum()
rec = confusion_matrix.loc[1,1]/confusion_matrix.loc[1,:].sum()

#All three metrics are reported as percentages
print('Accuracy: ',acc*100,'\nPrecision: ',prec*100,'\nRecall: ',rec*100)

Results aren't fantastic, so take the following importances with a grain of salt.

In [None]:
#Pair each importance with its feature name and rank from most to least important
importances = pd.Series(forest.feature_importances_,index=feature_names)
importances = importances.sort_values(ascending=False)
#Plot the 50 most important features
fig, ax = plt.subplots(figsize=(10,5))
importances.head(50).plot.bar(ax=ax)
ax.set_title('Feature importance')
#feature_importances_ is the impurity-based importance, not a permutation
#(accuracy-decrease) importance, so label the axis accordingly
ax.set_ylabel('Mean decrease in impurity')
fig.tight_layout()
plt.show()

In [None]:
with pd.option_context('display.max_row',None):
    print(importances)

### Player Stats

At a glance it appears **SI** (Score Involvement), **SL** (Score Launches), **R** (Rating Points), **DHPA** (defensive half pressure acts), **SAG** (shots at goal), **i50s** (Inside 50s), **HTA%** (Hitouts to Advantage percent), **HTW%** (Hitouts to Win percent), **KE%** (kick efficiency), **KHB** (kick to handball ratio), **CP** (contested possessions), **Mi50** (Marks Inside 50), **GA%** (shots at goal accuracy), **CDL%** (contested defensive losses) are important player stats.

### Weather Data

**Max/Min Temp**, **Solar Radiation**, **Wind Speed** seem important, however **Rain** and **Humidity** do not.

### Other

**Attendance** seems very important. Keep **home/away** as well, although it seems less important. **Venue**, **Team**, **Opponent** and **Day** appear to be quite unimportant.

## Information Values:

Get the information values for all variables, binning the numerical variables into deciles.

In [None]:
df_matches_v3_IT, df_matches_v3_OOT, Xv2_train, Xv2_test, yv2_train, yv2_test = split_preprocess(df_matches_v3)
#Remove the present-match scores and the time-based fields from every split
non_predictors = ['PointsF','PointsA','Round','Date','RoundNum','Season']
df_matches_v3_IT = df_matches_v3_IT.drop(columns=non_predictors)
df_matches_v3_OOT = df_matches_v3_OOT.drop(columns=non_predictors)
for split in (Xv2_train,Xv2_test):
    split.drop(columns=non_predictors,inplace=True)

In [None]:
#Join the outcome variable back to the data
Xv2_train = pd.concat([Xv2_train,yv2_train],axis=1)

### Bin the numerical variables as quantiles 

In [None]:
#Get numerical columns
#Extract only ther numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num_cols = list(Xv2_train.select_dtypes(include=numerics).columns)
num_cols.remove('Outcome')

In [None]:
Xv2_train_copy = Xv2_train.copy(deep=True)

In [None]:
#Bin every numeric column into (up to) deciles; duplicate bin edges are dropped
skipped_cols = []
for col in num_cols:
    try:
        #Replace the whole column so it can take the interval dtype
        Xv2_train_copy[col] = pd.qcut(Xv2_train[col],10,duplicates='drop')
    except ValueError as err:
        #qcut could not build bins for this column: leave it unbinned, but say so
        #rather than silently ignoring every kind of error
        skipped_cols.append(col)
        print('Could not bin',col,'-',err)

In [None]:
num_cols = list(Xv2_train.select_dtypes(include=numerics).columns)

In [None]:
num_cols

## IV Functions

In [None]:
import sys
sys.path.append('../functions')
from IVfunctions import * 

In [None]:
df_IV,Xout = IV_summary(Xv2_train_copy,'Outcome')

In [None]:
df_IV[df_IV['IV']>=0.07].plot.bar(x = 'Variable', y = 'IV', fontsize='9')

Seems the important variables are 'Team', 'Opponent', 'SAG', 'SI','SL', 'i50s','R', 'G','DHPA','Mi50'

## Univariate Analysis

In [None]:
#Extract only the relevant columns
Xv2_train = Xv2_train[['Team','Opponent','SAG_team1','SAG_team2','SAG_team3','SAG_opp1','SAG_opp2','SAG_opp3',
                    'SL_team1','SL_team2','SL_team3','SL_opp1','SL_opp2','SL_opp3',
                    'SI_team1','SI_team2','SI_team3','SI_opp1','SI_opp2','SI_opp3',
                    'i50s_team1','i50s_team2','i50s_team3','i50s_opp1','i50s_opp2','i50s_opp3',
                    'R_team1','R_team2','R_team3','R_opp1','R_opp2','R_opp3',
                    'G_team1','G_team2','G_team3','G_opp1','G_opp2','G_opp3',
                    'DHPA_team1','DHPA_team2','DHPA_team3','DHPA_opp1','DHPA_opp2','DHPA_opp3',
                    'Mi50_team1','Mi50_team2','Mi50_team3','Mi50_opp1','Mi50_opp2','Mi50_opp3','Outcome']]

In [None]:
#Numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num_cols = list(Xv2_train.select_dtypes(include=numerics).columns)
num_cols.remove('Outcome')
#Category columns
cat_cols = list(Xv2_train.select_dtypes(include='object').columns)


In [None]:
for col in num_cols:
    #Round to 1 d.p. so nearby values share a point on the curve.
    #The rounded values are kept in a separate Series rather than added to Xv2_train.
    rounded = Xv2_train[col].round(1).rename('rounded_'+col)
    #Mean outcome at each rounded value, ordered by that value
    agg = Xv2_train.groupby(rounded).agg({'Outcome':'mean'})
    agg = agg.sort_index()
    agg = agg.reset_index()
    fig,ax = plt.subplots()
    agg.plot(x='rounded_'+col,y='Outcome',ax=ax)
    ax.set_ylabel('Mean Outcome')
    plt.show()
    #Release the figure; this loop creates one per numeric column
    plt.close(fig)

### Summary

Team Positive relationships:
- SAG
- SL
- SI
- i50s
- R
- G
- Mi50s

Team Negative relationships:
- DHPA

In [None]:
#Remove the rounded columns created from earlier
rounded_cols = Xv2_train.columns[Xv2_train.columns.str.match(r'^rounded_')].to_list()

#Remove from the dataframe
Xv2_train=Xv2_train.drop(rounded_cols,axis=1)

In [None]:
for col in cat_cols:
    #Mean outcome per level of the category, ordered from lowest to highest
    agg = (
        Xv2_train.groupby(col)
        .agg({'Outcome':'mean'})
        .sort_values('Outcome')
        .reset_index()
    )
    fig,ax = plt.subplots()
    agg.plot.bar(x=col,y='Outcome',ax=ax)
    plt.show()

## Use Decision Tree to find Optimal Cut Points for Numeric Variables

In [None]:
Xv2_train.pop('Outcome')

## Get Correlations between remaining variables

In [None]:
dep_vars = ['SAG_team1','SAG_opp3','SL_team1','SL_opp3',\
                       'SI_team1','SI_team2','SI_team3','SI_opp1','SI_opp2',\
                       'SI_opp3','i50s_team1','i50s_team3','R_team3']

corr = Xv2_train[dep_vars].corr()

corr.style.background_gradient(cmap='coolwarm')

**High correlations >80%:**

- SAG_team1 with SL_team1 and SI_team1
- SAG_opp3 with SL_opp3 and SI_opp3
- SL with SI



**Moderate Correlation > 60%:**
- SAG_team1 and i50s_team1
- SL and i50s
- SI with R and i50s
- i50s and R

Conclusion: Remove **SI**, **SL**, and **R**

In [None]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=3,random_state=5432)

In [None]:
for col in num_cols:
    clf.fit(Xv2_train[col].values.reshape(-1,1),yv2_train)
    fig,ax = plt.subplots(figsize=(10,10))
    fig.suptitle(col)
    tree.plot_tree(clf,ax=ax)
    plt.show()

## Bin the Numeric variables

In [None]:
#Hand-picked cut points for each statistic (presumably read off the single-variable
#decision trees plotted earlier - confirm)
bins_SAG = pd.IntervalIndex.from_tuples([(-np.inf, 0.8), (0.8, 1.0), (1.0, 1.25),(1.25,1.5),(1.5,np.inf)])
bins_SL = pd.IntervalIndex.from_tuples([(-np.inf, 0.8),(0.8, 1.5),(1.5,1.7),(1.7,np.inf)])
bins_SI = pd.IntervalIndex.from_tuples([(-np.inf, 2.5), (2.5, 3.0),(3.0, 4.5),(4.5,5.5),(5.5,np.inf)])
bins_i50s = pd.IntervalIndex.from_tuples([(-np.inf, 1.5), (1.5, 1.9),(1.9, 2.5),(2.5,3),(3,np.inf)])
bins_R = pd.IntervalIndex.from_tuples([(-np.inf, 7.5), (7.5, 8.5),(8.5, 10),(10,12),(12,np.inf)])
bins_G = pd.IntervalIndex.from_tuples([(-np.inf, 0.25), (0.25, 0.35),(0.35, 0.45),(0.45,0.6),(0.6,np.inf)])
bins_DHPA = pd.IntervalIndex.from_tuples([(-np.inf, 4.5), (4.5, 5.5),(5.5, 6.5),(6.5,7.5),(7.5,np.inf)])
#NOTE(review): the Mi50 edges are identical to the DHPA edges - confirm this is intended
bins_Mi50 = pd.IntervalIndex.from_tuples([(-np.inf, 4.5), (4.5, 5.5),(5.5, 6.5),(6.5,7.5),(7.5,np.inf)])

#Statistic prefix -> bins; dict order fixes the order in which columns are added
bins_by_stat = {
    'SAG':bins_SAG,'SL':bins_SL,'SI':bins_SI,'i50s':bins_i50s,
    'R':bins_R,'G':bins_G,'DHPA':bins_DHPA,'Mi50':bins_Mi50,
}

#Add a '<stat>_<side><lag>_binned' column for every statistic, side and lag
for stat,stat_bins in bins_by_stat.items():
    for side in ('team','opp'):
        for lag in (1,2,3):
            source_col = stat+'_'+side+str(lag)
            Xv2_train[source_col+'_binned'] = pd.cut(Xv2_train[source_col],stat_bins)

In [None]:
#Join back on the outcome variable
Xv2_train = pd.concat([Xv2_train,yv2_train],axis=1)

## Logistic Regression

In [None]:
#Xv2_train = Xv2_train.drop('Outcome',axis=1)

In [None]:
import statsmodels.formula.api as smf
log_reg = smf.logit("Outcome ~ \
SAG_opp3 +\
i50s_team1 + i50s_team3 + Team + Opponent", data=Xv2_train).fit()

In [None]:
print(log_reg.summary())

## Training Data

In [None]:
ypred = log_reg.predict(Xv2_train)

In [None]:
from sklearn.metrics import roc_curve,roc_auc_score
fpr,tpr, thresholds = roc_curve(yv2_train,ypred)

In [None]:
fig,ax = plt.subplots()

#Diagonal reference line
ax.axline((0,0),slope=1,color='red',linestyle='--')
#fpr is plotted on the x axis and tpr on the y axis, so label them that way round
ax.plot(fpr,tpr)
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('Training ROC Curve')

### ROC SCORE and Gini SCORE

In [None]:
AUC = roc_auc_score(yv2_train,ypred)
Gini = 2*AUC-1
print('ROC AUC: ',AUC*100,'\nGini: ',Gini*100)

## Test Data

In [None]:
ypred_test = log_reg.predict(Xv2_test)
fpr,tpr, thresholds = roc_curve(yv2_test,ypred_test)

In [None]:
fig,ax = plt.subplots()

#Diagonal reference line
ax.axline((0,0),slope=1,color='red',linestyle='--')
#fpr is plotted on the x axis and tpr on the y axis, so label them that way round
ax.plot(fpr,tpr)
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('Test ROC Curve')

In [None]:
AUC = roc_auc_score(yv2_test,ypred_test)
Gini = 2*AUC-1
print('ROC AUC: ',AUC*100,'\nGini: ',Gini*100)

## Out of Time Data

In [None]:
y_pred_OOT = log_reg.predict(df_matches_v3_OOT)
y_OOT = df_matches_v3_OOT['Outcome']
fpr,tpr, thresholds = roc_curve(y_OOT,y_pred_OOT)

In [None]:
fig,ax = plt.subplots()

#Diagonal reference line
ax.axline((0,0),slope=1,color='red',linestyle='--')
#fpr is plotted on the x axis and tpr on the y axis, so label them that way round
ax.plot(fpr,tpr)
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
#This curve is built from the out-of-time predictions, not the test split
ax.set_title('Out of Time ROC Curve')

In [None]:
AUC = roc_auc_score(y_OOT,y_pred_OOT)
Gini = 2*AUC-1
print('ROC AUC: ',AUC*100,'\nGini: ',Gini*100)