# Proof of Concept 5. Double Linear Regression with Bootstrap using Normalised Data/Both Teams #
## For Brownlow Predictor Project ##

Trains one model using Normalised Data / Both-Team columns only (feature-selection threshold FS_Val = 0.2; Winloss is included as a candidate feature)

Experiments with applying PCA to the selected features (reducing them to 5 principal components) before fitting the regression


**Author: `Lang (Ron) Chen` 2021.12-2022.1**

___

**0. Import Libraries**

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn.decomposition import PCA

from BrownlowPredictorPCA.predict import predict
from BrownlowPredictorTools.test import test
from BrownlowPredictorTools.return_tp import return_tp
from BrownlowPredictorPCA.wholeseason import wholeseason

In [2]:
# Name of the sub-folder under ./Data whose files are read throughout this notebook
choice = 'NormalisedData'

In [3]:
# List the game files for the chosen dataset.
# os.listdir returns entries in arbitrary order, so slicing off "the first entry" with [1:] is not
# a reliable way to discard the hidden .ipynb_checkpoints folder: when it is not listed first, a
# real game file is dropped and the checkpoint folder is kept. Hidden entries are therefore
# filtered by name; the relative order of the remaining files is left untouched.
filelist = [name for name in os.listdir(f'./Data/{choice}') if not name.startswith('.')]

**1. Feature Selection**

In [4]:
# Every 2021 file is held out as the empirical test set (the full 2021 season)
final_test_games = list(filter(lambda name: '2021' in name, filelist))

In [5]:
# All remaining (non-2021) games go through one single 80/20 Train-Test Split
# (note: a single split, unlike the KFold used previously)
test_train_games = list(filter(lambda name: '2021' not in name, filelist))
train_games, test_games = train_test_split(
    test_train_games,
    train_size = 0.8,
    test_size = 0.2,
    random_state = 42,
)

In [6]:
# Read in the pre-prepared sample of training data only
# (per the original note: the same rows as concatenating all the data from the train_games list)
# NOTE(review): nothing in this notebook checks that the CSV still matches the split above —
# confirm it was generated from the same file listing and random_state.
train_data = pd.read_csv('Train_Data (N).csv')

In [7]:
# Initialises PCA, keeping 5 components. It is fitted later on the selected training features,
# and that same fitted object is reused to transform the test data.
pca = PCA(n_components = 5)

In [8]:
# Candidate features: every column whose name contains 'BTN' or 'Winloss'
cols = [col for col in train_data.columns if any(tag in col for tag in ('BTN', 'Winloss'))]

# Pearson correlation between each candidate column and the Brownlow votes
corr_by_col = {
    col: train_data[[col, 'Brownlow Votes']].corr(method = 'pearson').loc[col, 'Brownlow Votes']
    for col in cols
}

# Stored as a list of (column name, correlation) pairs for the selection step
corr = list(corr_by_col.items())

In [9]:
# Keep only the columns whose correlation with the votes is above the 0.2 threshold
selected_features = [name for name, r_value in corr if r_value > 0.2]
selected_features

['Kicks BTN',
 'Handballs BTN',
 'Disposals BTN',
 'Goals BTN',
 'Inside 50s BTN',
 'Clearances BTN',
 'Contested Possessions BTN',
 'Uncontested Possessions BTN',
 'Effective Disposals BTN',
 'Centre Clearances BTN',
 'Stoppage Clearances BTN',
 'Score Involvements BTN',
 'Metres Gained BTN',
 'Behind Assists BTN',
 'Ineffective Disposals BTN']

**2.Trains Models**

0. Demonstration of functions (completely same as those in libraries)

Only the parts that are different (PCA-specific) are commented; see PoC1 for more details.

In [10]:
# import pandas as pd
# import numpy as np

def predict(test_games, lm, selected_features, choice, pca):
    """ Predicts 3-2-1 votes for every game in test_games using the model and the fitted pca object.

    For each game file the selected features are cleaned (inf / NaN replaced by 0), projected
    with pca, scored with lm, and the highest-scoring rows receive 3, 2 and 1 votes; every other
    row receives 0.

    Parameters:
        test_games: iterable of file names located in ./Data/{choice}/
        lm: fitted model exposing .predict
        selected_features: list of column names passed to pca
        choice: name of the sub-folder of ./Data to read from
        pca: fitted object exposing .transform (must be the one used when training lm)

    Returns:
        (prediction, testdata_y)
        prediction: flat list of predicted votes, one entry per row, in file order
        testdata_y: DataFrame holding the observed 'Brownlow Votes' of every file, stacked in
                    the same order (each file keeps its own row labels)
    """

    votes_on_offer = (3, 2, 1)

    prediction = list()

    testdata_y = pd.DataFrame()

    for file in test_games:

        df = pd.read_csv(f'./Data/{choice}/{file}')

        x_final = df[selected_features].replace([np.inf, -np.inf, np.nan], 0).reset_index(drop=True)
        y_pred = lm.predict(pca.transform(x_final)) # must transform the data using same pca object as training object

        # Accumulated exactly as before so the returned frame keeps the layout callers already receive
        testdata_y = pd.concat([testdata_y, df['Brownlow Votes']], axis=0)

        # Row positions ordered from highest to lowest score (stable, so ties keep file order)
        ranked = sorted(range(len(y_pred)), key = lambda i: y_pred[i], reverse = True)

        tmp = [0] * len(df)

        # zip stops at the shorter sequence, so a game with fewer than three rows hands out
        # only the votes it has rows for instead of raising IndexError
        for votes, row in zip(votes_on_offer, ranked):
            tmp[row] = votes

        # extend in place rather than rebuilding the whole list for every game
        prediction.extend(tmp)

    return prediction, testdata_y

Also available is the PCA-specific predict_mass function used in the scripts - it is not used in the Proofs of Concept

In [1]:
def predict_mass(test_games, lm, selected_features, choice, pca):
    """ Predicts 3-2-1 votes for every game in test_games and pairs them with the observed votes.

    For each game file the selected features are cleaned (inf / NaN replaced by 0), projected
    with pca, scored with lm, and the highest-scoring rows receive 3, 2 and 1 votes; every other
    row receives 0.

    Parameters:
        test_games: iterable of file names located in ./Data/{choice}/
        lm: fitted model exposing .predict
        selected_features: list of column names passed to pca
        choice: name of the sub-folder of ./Data to read from
        pca: fitted object exposing .transform (must be the one used when training lm)

    Returns:
        DataFrame with one row per player-game, in file order, and two columns:
        'predictions' (predicted votes) and 'observations' (the file's 'Brownlow Votes').
        An empty test_games gives an empty frame with those two columns.
    """

    votes_on_offer = (3, 2, 1)

    prediction = list()
    observations = list()

    for file in test_games:

        df = pd.read_csv(f'./Data/{choice}/{file}')

        x_final = df[selected_features].replace([np.inf, -np.inf, np.nan], 0).reset_index(drop=True)
        y_pred = lm.predict(pca.transform(x_final)) # must transform the data using same pca object as training object

        # Only the values are needed for the output, so collect them in a plain list
        # instead of growing a DataFrame with concat on every iteration
        observations.extend(df['Brownlow Votes'].tolist())

        # Row positions ordered from highest to lowest score (stable, so ties keep file order)
        ranked = sorted(range(len(y_pred)), key = lambda i: y_pred[i], reverse = True)

        tmp = [0] * len(df)

        # zip stops at the shorter sequence, so a game with fewer than three rows hands out
        # only the votes it has rows for instead of raising IndexError
        for votes, row in zip(votes_on_offer, ranked):
            tmp[row] = votes

        prediction.extend(tmp)

    out = pd.DataFrame({'predictions': prediction, 'observations': observations})

    return out

1. Regular model

In [11]:
# Trains the LR model on the PCA projection of the selected features

# Features: inf / NaN become 0 and the rows are renumbered from 0
traindata_x_1 = (
    train_data[selected_features]
    .replace((np.inf, -np.inf, np.nan), 0)
    .reset_index(drop=True)
)

# Target, renumbered the same way so it lines up row-for-row with the features
traindata_y_1 = train_data['Brownlow Votes'].reset_index(drop=True)

# fits the pca model on the training features and transforms them with it
principalComponents = pca.fit_transform(traindata_x_1)

lm_1 = linear_model.LinearRegression()
model_1 = lm_1.fit(principalComponents, traindata_y_1)

In [12]:
# shows the pca explained variance ratio: the fraction of the total variance
# captured by each of the 5 retained components
pca.explained_variance_ratio_

array([0.49335605, 0.13844796, 0.12263638, 0.06489825, 0.03992478])

In [13]:
# Get predictions and observations for the held-out test_games (the 20% side of the split),
# projected with the pca object that was fitted on the training data.
# Note: `predict` here is the function defined in this notebook, which shadows the imported one.
predictions_1, testdata_y_1 = predict(test_games, lm_1, selected_features, choice, pca)

In [14]:
# Get True Positive/True Negative results
# NOTE(review): `test` comes from BrownlowPredictorTools and is not shown here; the 4 is
# presumably the number of vote classes (0, 1, 2, 3) — confirm against the library.
result1_1, result2_1 = test(predictions_1, testdata_y_1, 4)

In [15]:
# TP/TN based on what was predicted
# NOTE(review): the recorded output is a 4x4 table whose rows each sum to 1; presumably row i is
# predicted vote class i and the columns are the observed classes 0-3 — confirm in the library.
result1_1

[[0.967195654489296,
  0.014272020449462136,
  0.012141868143572265,
  0.006390456917669613],
 [0.62882096069869,
  0.13973799126637554,
  0.1222707423580786,
  0.1091703056768559],
 [0.5021834061135371,
  0.13973799126637554,
  0.1615720524017467,
  0.1965065502183406],
 [0.21397379912663755,
  0.13537117903930132,
  0.2183406113537118,
  0.43231441048034935]]

In [16]:
# TP/TN based on what was observed
# NOTE(review): the recorded output is a 4x4 table whose rows each sum to 1; presumably row i is
# observed vote class i and the columns are the predicted classes 0-3 — confirm in the library.
result2_1

[[0.967195654489296,
  0.015337096602407072,
  0.012248375758866758,
  0.005218873149430184],
 [0.5851528384279476,
  0.13973799126637554,
  0.13973799126637554,
  0.13537117903930132],
 [0.4978165938864629,
  0.1222707423580786,
  0.1615720524017467,
  0.2183406113537118],
 [0.26200873362445415,
  0.1091703056768559,
  0.1965065502183406,
  0.43231441048034935]]

In [17]:
# Only the True Positive Values
# (in the recorded output these equal the diagonal entries of result1_1)
return_tp(result1_1)

(0.967195654489296,
 0.13973799126637554,
 0.1615720524017467,
 0.43231441048034935)

**3. Summary Observations**

0. Demonstration of functions (completely same as those in libraries)

Only the parts that are different are commented; see PoC1 for more details.

In [18]:
# import pandas as pd
from collections import defaultdict as dd
# import numpy as np

def wholeseason(final_test_games, lm, selected_features, choice, pca):
    """ Helper function for running the empirical test - builds a season leaderboard of predicted votes.

    For each game file the selected features are cleaned (inf / NaN replaced by 0), projected
    with pca and scored with lm; the highest-scoring rows earn 3, 2 and 1 votes, which are added
    to the running total of the name found in that row's 'Player' column.

    Parameters:
        final_test_games: iterable of file names located in ./Data/{choice}/
        lm: fitted model exposing .predict
        selected_features: list of column names passed to pca
        choice: name of the sub-folder of ./Data to read from
        pca: fitted object exposing .transform (must be the one used when training lm)

    Returns:
        list of (player, total predicted votes) tuples, highest total first; players on the same
        total stay in the order in which they first received votes
    """

    votes_on_offer = (3, 2, 1)

    players = dd(int)

    for file in final_test_games:

        df = pd.read_csv(f'./Data/{choice}/{file}')

        x_final = df[selected_features].replace([np.inf, -np.inf, np.nan], 0).reset_index(drop=True)
        y_pred = lm.predict(pca.transform(x_final)) # must transform the data using same pca object as training object

        # Row positions ordered from highest to lowest score (stable, so ties keep file order)
        ranked = sorted(range(len(y_pred)), key = lambda i: y_pred[i], reverse = True)

        # Positions in y_pred are row positions in the file, so look names up positionally
        names = df['Player']

        # zip stops at the shorter sequence, so a game with fewer than three rows hands out
        # only the votes it has rows for instead of raising IndexError
        for votes, row in zip(votes_on_offer, ranked):
            players[names.iloc[row]] += votes

    leaderboard = sorted(players.items(), key = lambda entry: entry[1], reverse = True)

    return leaderboard

1. Empirical Experiment

In [19]:
# Run every 2021 game (final_test_games) through the trained model and total the predicted votes
# per player. `wholeseason` here is the function defined in this notebook, shadowing the import.
leaderboard1 = wholeseason(final_test_games, lm_1, selected_features, choice, pca)

In [20]:
# Top 15 players by total predicted votes
leaderboard1[0:15]

[('Clayton Oliver', 40),
 ('Oliver Wines', 37),
 ('Christian Petracca', 34),
 ('Jack Steele', 34),
 ('Darcy Parish', 31),
 ('Rory Laird', 29),
 ('Touk Miller', 28),
 ('Travis Boak', 27),
 ('Jackson Macrae', 26),
 ('Marcus Bontempelli', 26),
 ('Cameron Guthrie', 24),
 ('Jarryd Lyons', 24),
 ('Tom Mitchell', 24),
 ('Sam Walsh', 24),
 ('Callum Mills', 21)]

2. Predictor's R² score (coefficient of determination, measured on the training data)

In [21]:
# LinearRegression.score returns the coefficient of determination (R^2). It is evaluated here on
# the PCA-projected training features the model was fitted on, not on the held-out games.
print(lm_1.score(principalComponents, traindata_y_1))

0.21231256572941914
