# Proof of Concept 1. (Basic) Linear Regression using Normalised Data/Both Teams #
## For Brownlow Predictor Project ##

Trains 4 models, one for each of the 4 Micro Rules of Feature Selection, using Normalised Data/Both Team Columns Only (FS_Val = 0.2 and Includes Winloss)

**Author: `Lang (Ron) Chen` 2021.12-2022.1**

___

**0. Import Libraries**

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

from BrownlowPredictorTools.predict import predict
from BrownlowPredictorTools.test import test
from BrownlowPredictorTools.return_tp import return_tp
from BrownlowPredictorTools.wholeseason import wholeseason
from BrownlowPredictorTools.feature_selection2 import feature_selection2

In [2]:
# Name of the sub-folder of ./Data whose per-game files are used throughout this notebook
choice = 'NormalisedData'

In [3]:
# List the per-game files of the chosen dataset.
# Hidden entries (e.g. the '.ipynb_checkpoints' folder Jupyter creates) are filtered out by name
# rather than by dropping the first entry: os.listdir returns entries in arbitrary order, so the
# checkpoint folder is not guaranteed to be first (or to exist at all).
# NOTE(review): the listing order feeds train_test_split below, so the split is only reproducible
# if the order is stable on this machine - consider sorting once the pre-prepared training CSV
# has been regenerated from the same ordering.
filelist = [name for name in os.listdir(f'./Data/{choice}') if not name.startswith('.')]

**1. Feature Selection**

In [4]:
# Games held back for the empirical test: every file whose name contains '2021'
final_test_games = list(filter(lambda name: '2021' in name, filelist))

In [5]:
# Every game whose file name does not contain '2021' goes through one Train-Test Split
# (note: different from the previous KFold approach)
test_train_games = [name for name in filelist if not ('2021' in name)]
train_games, test_games = train_test_split(
    test_train_games,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Load the pre-prepared training table from the working directory.
# Per the author's note it holds the same rows as concatenating every file in train_games.
# NOTE(review): nothing in this notebook checks that - confirm the CSV was built from this exact split.
train_data = pd.read_csv('Train_Data (N).csv')

In [7]:
# Candidate columns: those whose name contains 'BTN', plus 'Winloss'
wanted_tags = ('BTN', 'Winloss')
cols = [col for col in train_data.columns if any(tag in col for tag in wanted_tags)]
cols

['Kicks BTN',
 'Handballs BTN',
 'Disposals BTN',
 'Marks BTN',
 'Goals BTN',
 'Behinds BTN',
 'Tackles BTN',
 'Hitouts BTN',
 'Goal Assists BTN',
 'Inside 50s BTN',
 'Clearances BTN',
 'Clangers BTN',
 'Rebound 50s BTN',
 'Frees For BTN',
 'Frees Agains BTN',
 'Contested Possessions BTN',
 'Uncontested Possessions BTN',
 'Effective Disposals BTN',
 'Contested Marks BTN',
 'Marks Inside 50 BTN',
 'One Percenters BTN',
 'Bounces BTN',
 'Centre Clearances BTN',
 'Stoppage Clearances BTN',
 'Score Involvements BTN',
 'Metres Gained BTN',
 'Turnovers BTN',
 'Intercepts BTN',
 'Tackles Inside 50 BTN',
 'Time On Ground % BTN',
 'Winloss',
 'Uncontested Marks BTN',
 'Marks Outside 50 BTN',
 'Tackles Outside 50 BTN',
 'Behind Assists BTN',
 'Ineffective Disposals BTN']

In [8]:
# Pearson correlation of every candidate column with 'Brownlow Votes'
# (the threshold itself is applied in a later cell)
corr = {
    col: train_data[[col, 'Brownlow Votes']].corr(method='pearson').loc[col]['Brownlow Votes']
    for col in cols
}
corr

{'Kicks BTN': 0.3313800478557236,
 'Handballs BTN': 0.28243847810008826,
 'Disposals BTN': 0.3848565320688732,
 'Marks BTN': 0.16831020314626322,
 'Goals BTN': 0.251159108505028,
 'Behinds BTN': 0.09716864795537491,
 'Tackles BTN': 0.15068595304633947,
 'Hitouts BTN': 0.015309775486725926,
 'Goal Assists BTN': 0.1261109371515187,
 'Inside 50s BTN': 0.2586408367327942,
 'Clearances BTN': 0.30912151700818563,
 'Clangers BTN': -0.12145252365952025,
 'Rebound 50s BTN': 0.030801676287350914,
 'Frees For BTN': 0.15666458356544574,
 'Frees Agains BTN': -0.03362877080488017,
 'Contested Possessions BTN': 0.3522270859288417,
 'Uncontested Possessions BTN': 0.273743999465288,
 'Effective Disposals BTN': 0.34924408871707774,
 'Contested Marks BTN': 0.09026198685114628,
 'Marks Inside 50 BTN': 0.15444090495732277,
 'One Percenters BTN': -0.05126795234854484,
 'Bounces BTN': 0.07938363883800453,
 'Centre Clearances BTN': 0.25838896514746523,
 'Stoppage Clearances BTN': 0.2705823274328926,
 'Score I

In [9]:
# Turn the correlations into a list of (column, correlation) pairs.
# Going through dict(...) keeps this cell idempotent: `corr` is rebound from a dict to a list here,
# so a plain corr.items() would fail if the cell were executed a second time.
corr = list(dict(corr).items())
corr

[('Kicks BTN', 0.3313800478557236),
 ('Handballs BTN', 0.28243847810008826),
 ('Disposals BTN', 0.3848565320688732),
 ('Marks BTN', 0.16831020314626322),
 ('Goals BTN', 0.251159108505028),
 ('Behinds BTN', 0.09716864795537491),
 ('Tackles BTN', 0.15068595304633947),
 ('Hitouts BTN', 0.015309775486725926),
 ('Goal Assists BTN', 0.1261109371515187),
 ('Inside 50s BTN', 0.2586408367327942),
 ('Clearances BTN', 0.30912151700818563),
 ('Clangers BTN', -0.12145252365952025),
 ('Rebound 50s BTN', 0.030801676287350914),
 ('Frees For BTN', 0.15666458356544574),
 ('Frees Agains BTN', -0.03362877080488017),
 ('Contested Possessions BTN', 0.3522270859288417),
 ('Uncontested Possessions BTN', 0.273743999465288),
 ('Effective Disposals BTN', 0.34924408871707774),
 ('Contested Marks BTN', 0.09026198685114628),
 ('Marks Inside 50 BTN', 0.15444090495732277),
 ('One Percenters BTN', -0.05126795234854484),
 ('Bounces BTN', 0.07938363883800453),
 ('Centre Clearances BTN', 0.25838896514746523),
 ('Stoppage

In [10]:
# Keep the columns whose correlation with the votes is above 0.2 (FS_Val)
selected_features = [name for name, coefficient in corr if coefficient > 0.2]
selected_features

['Kicks BTN',
 'Handballs BTN',
 'Disposals BTN',
 'Goals BTN',
 'Inside 50s BTN',
 'Clearances BTN',
 'Contested Possessions BTN',
 'Uncontested Possessions BTN',
 'Effective Disposals BTN',
 'Centre Clearances BTN',
 'Stoppage Clearances BTN',
 'Score Involvements BTN',
 'Metres Gained BTN',
 'Behind Assists BTN',
 'Ineffective Disposals BTN']

**2. Train Models**

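0. Demonstration of the helper functions (intended to mirror those in the BrownlowPredictorTools library). Note that running these cells replaces the imported versions for the rest of the notebook.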

In [11]:
def feature_selection2(cols, select_mode, BT_OT):
    """ Helper function for feature selection based on the rules of Feature Selection.

    Parameters:
        cols: list of column names, in priority order. Each name is expected to end with a
            tag word (e.g. 'Kicks BTN'); the tag is removed with strip_end before any rule is applied.
        select_mode: the micro rule of feature selection (1, 2, 3 or 4)
            1 - keep every column
            2 - once a column is kept, the columns it depends on / summarises are excluded
            3 - as 2, but the summary statistics are banned up front
            4 - as 2, but 'Disposals' is banned up front
        BT_OT: whether the macro rule of feature selection is 'BT_OT'. If True, a statistic
            that has been kept once cannot be kept again under a different tag.

    Returns the list of selected column names, in their original order. For select_mode 1 with
    BT_OT False the input list itself is returned unchanged.
    """

    # Statistics (tag removed) that cannot coexist with each other.
    # Keys must be tag-free because they are compared against strip_end(col).
    nonallowed = {'Kicks': ["Disposals"],
         'Handballs': ["Disposals"],
         'Disposals': ["Kicks", "Handballs", 'Ineffective Disposals', 'Contested Possessions', 'Uncontested Possessions', 'Effective Disposals'],
         'Marks': ["Uncontested Marks", 'Marks Outside 50', 'Marks Inside 50', 'Contested Marks'],
         'Tackles': ['Tackles Inside 50', 'Tackles Outside 50'],
         'Goal Assists': ["Score Involvements"],
         'Clearances': ["Centre Clearances", "Stoppage Clearances"],
         'Contested Possessions': ["Disposals"],
         'Uncontested Possessions': ["Disposals"],
         'Effective Disposals': ["Disposals"],
         'Contested Marks': ['Marks'],
         'Marks Inside 50': ['Marks'],
         'Centre Clearances': ['Clearances'],
         'Stoppage Clearances': ['Clearances'],
         'Score Involvements': ['Behind Assists', 'Goal Assists'],
         'Tackles Inside 50': ["Tackles"],
         'Uncontested Marks': ["Marks"],
         'Marks Outside 50': ["Marks"],
         'Tackles Outside 50': ["Tackles"],
         # was keyed 'Behind Assists BTN', which could never match a tag-free name
         'Behind Assists': ["Score Involvements"],
         'Ineffective Disposals': ["Disposals"]}

    if select_mode == 1 and not BT_OT: # Micro mode 1: return all that is seen
        return cols

    # Tag-free statistic names that may no longer be selected
    illegal = set()

    if select_mode == 3: # Micro mode 3: ban summary statistics
        illegal.update(['Disposals', 'Marks', 'Tackles', 'Clearances', 'Score Involvements'])

    elif select_mode == 4: # Micro mode 4: ban Disposals
        illegal.add('Disposals')

    # Micro mode 1 never applies the coexistence table
    apply_dependencies = select_mode != 1

    out = list()

    for col in cols:
        col_ = strip_end(col)

        if col_ in illegal:
            continue

        if apply_dependencies:
            # Keeping this statistic rules out everything it cannot coexist with
            illegal.update(nonallowed.get(col_, ()))

        if BT_OT:
            # BT or OT: once a statistic is used under one tag, its root name is banned
            # so the same statistic cannot be selected again under another tag
            illegal.add(col_)

        out.append(col)

    return out






def strip_end(string):
    """ Helper function to remove the trailing tag (e.g. 'BTS', 'BTN') from a column name.

    Drops the last whitespace-separated word and joins the rest with single spaces,
    so a one-word name (or an empty string) gives ''.
    """

    return ' '.join(string.split()[:-1])

In [12]:
# import pandas as pd
# import numpy as np

def predict(test_games, lm, selected_features, choice):
    """ Scores every game in test_games with the model and converts the scores to 3/2/1 votes.

    For each file, the rows with the three highest model outputs receive 3, 2 and 1 votes
    (every other row gets 0).

    Returns a pair:
        - one flat list of predicted votes covering all games, in file order then row order
        - the 'Brownlow Votes' observations of all games, accumulated with pd.concat onto an
          initially empty DataFrame (same row order as the predictions)
    """

    testdata_y = pd.DataFrame()
    prediction = []

    for file in test_games:

        # Load the game and clean its feature columns (inf/-inf/NaN become 0)
        game = pd.read_csv(f'./Data/{choice}/{file}')
        features = game[selected_features].replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)

        # Raw regression outputs, one per row of the game
        scores = lm.predict(features)

        # Keep the actual observations alongside
        testdata_y = pd.concat([testdata_y, game['Brownlow Votes']], axis=0)

        # Row positions ordered from highest to lowest score
        ranking = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

        # Top three rows get 3, 2, 1 votes; everyone else stays at 0
        votes = [0] * len(game)
        for place in range(3):
            votes[ranking[place][0]] = 3 - place

        # Can un-comment to observe what actual scores the linear regression is outputting
#         print(ranking[0:3])
#         print(ranking[-3:])

        prediction.extend(votes)

    return prediction, testdata_y

In [13]:
def test(predictions, testdata_y, nchoice):
    """ Calculates the tp/tn for 0 votes, 1 vote, 2 votes and 3 votes. Polymorphic as to how many choices there are """
    
    # instantiate tally
    result1 = list() # tp/tn/tp/fp calculated with respect to predictions (i.e. predicted 1, obs 0 => contribute to fp 1)
    result2 = list() # tp/tn/tp/fp calculated with respect to observations (i.e. predicted 1, obs 0 => contribute to fp 0)
    
    # Initialise the result1 and result2 tallys based on nchoice
    for i in range(nchoice):
        tmp1 = [0 for j in range(nchoice)]
        tmp2 = [0 for j in range(nchoice)]
        result1.append(tmp1)
        result2.append(tmp2)
    
    # Run through the predictions and add to tally according to whether it is tp/tn/fp/fn
    for i in range(len(predictions)):
        result1[predictions[i]][int(testdata_y.iloc[i][0])] += 1
        result2[int(testdata_y.iloc[i][0])][predictions[i]] += 1
    
    # Find the sum of each row and then take percentage based on it (because we are taking tp/fp/tn/fn with respect to either predictions or observations rather than total)
    for i in range(nchoice):
        
        sum_row1 = sum(result1[i])
        sum_row2 = sum(result2[i])
        
        for j in range(nchoice):
            result1[i][j] = result1[i][j]/sum_row1
            result2[i][j] = result2[i][j]/sum_row2
            
    return result1, result2

In [14]:
def return_tp(result):
    """ Helper function returning only the diagonal of a square result table
    (the true positive/true negative values), as a tuple """

    diagonal = []
    for position, row in enumerate(result):
        diagonal.append(row[position])

    return tuple(diagonal)

Also available are the predict_mass and test_mass functions used in the scripts - they are not used in the Proofs of Concept

In [15]:
# import pandas as pd
# import numpy as np

def predict_mass(test_games, lm, selected_features, choice):
    """ Same procedure as predict, but returns one DataFrame with a 'predictions' column
    (the 3/2/1/0 votes allocated from the model scores) and an 'observations' column
    (the 'Brownlow Votes' read from the game files) """

    testdata_y = pd.DataFrame()
    prediction = []

    for file in test_games:

        # Load the game and clean its feature columns (inf/-inf/NaN become 0)
        game = pd.read_csv(f'./Data/{choice}/{file}')
        features = game[selected_features].replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)

        # Raw regression outputs, one per row of the game
        scores = lm.predict(features)

        # Keep the actual observations alongside
        testdata_y = pd.concat([testdata_y, game['Brownlow Votes']], axis=0)

        # Row positions ordered from highest to lowest score
        ranking = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

        # Top three rows get 3, 2, 1 votes; everyone else stays at 0
        votes = [0] * len(game)
        for place in range(3):
            votes[ranking[place][0]] = 3 - place

        # Can un-comment to observe what actual scores the linear regression is outputting
#         print(ranking[0:3])
#         print(ranking[-3:])

        prediction.extend(votes)

    # The observations sit in the first (only) column of the accumulated frame
    first_column = testdata_y.columns[0]
    observations = list(testdata_y[first_column])

    out = pd.DataFrame({'predictions': prediction, 'observations': observations})

    return out

In [16]:
def test_mass(out): 
    """ Helper function for returning tp/tn for mass testing (more efficient as does not collect other stats)"""
    
    tp0 = len(out[(out['predictions'] == out['observations'].astype(int)) & (out['predictions'] == 0)])/len(out[(out['predictions'] == 0)])
    tp1 = len(out[(out['predictions'] == out['observations'].astype(int)) & (out['predictions'] == 1)])/len(out[(out['predictions'] == 1)])
    tp2 = len(out[(out['predictions'] == out['observations'].astype(int)) & (out['predictions'] == 2)])/len(out[(out['predictions'] == 2)])
    tp3 = len(out[(out['predictions'] == out['observations'].astype(int)) & (out['predictions'] == 3)])/len(out[(out['predictions'] == 3)])
    
    return (tp0, tp1, tp2, tp3,)

1. Micro Rule of Feature Selection 1: 

*All cols that passed FS_val selected*

In [17]:
# Trains LR model on every column that passed FS_val
lm_1 = linear_model.LinearRegression()

# Features: inf/-inf/NaN replaced by 0, rows renumbered from 0
traindata_x_1 = (
    train_data[selected_features]
    .replace((np.inf, -np.inf, np.nan), 0)
    .reset_index(drop=True)
)
# Target: the votes column, rows renumbered from 0
traindata_y_1 = train_data['Brownlow Votes'].reset_index(drop=True)

model_1 = lm_1.fit(traindata_x_1, traindata_y_1)

In [18]:
# Score the held-out test games with model 1: predicted 3/2/1/0 votes plus the observed votes
predictions_1, testdata_y_1 = predict(test_games, lm_1, selected_features, choice)

In [19]:
# Get True Positive/True Negative results for the 4 vote classes (0, 1, 2, 3):
# result1_1 is normalised per predicted class, result2_1 per observed class
result1_1, result2_1 = test(predictions_1, testdata_y_1, 4)

In [20]:
# TP/TN based on what was predicted
result1_1

[[0.9698583448716583,
  0.013632974757695175,
  0.01139631483651081,
  0.00511236553413569],
 [0.6200873362445415,
  0.17903930131004367,
  0.09606986899563319,
  0.10480349344978165],
 [0.4148471615720524,
  0.1222707423580786,
  0.21397379912663755,
  0.24890829694323144],
 [0.20087336244541484,
  0.13973799126637554,
  0.22270742358078602,
  0.4366812227074236]]

In [21]:
# TP/TN based on what was observed
result2_1

[[0.9698583448716583,
  0.015124081371818084,
  0.010118223452976888,
  0.004899350303546704],
 [0.5589519650655022,
  0.17903930131004367,
  0.1222707423580786,
  0.13973799126637554],
 [0.4672489082969432,
  0.09606986899563319,
  0.21397379912663755,
  0.22270742358078602],
 [0.2096069868995633,
  0.10480349344978165,
  0.24890829694323144,
  0.4366812227074236]]

In [22]:
# Only the True Positive Values
return_tp(result1_1)

(0.9698583448716583,
 0.17903930131004367,
 0.21397379912663755,
 0.4366812227074236)

2. Micro Rule of Feature Selection 2

*For features with a dependency (triangle) relationship (e.g. A = Disposals, B = Kicks, C = Handballs): if A comes first then B and C are excluded; if B or C comes first then A is excluded*

In [23]:
# Apply micro rule 2 to the correlation-filtered columns (BT_OT macro rule switched off)
selected_features_2 = feature_selection2(selected_features, 2, False)
selected_features_2

# All following steps (fit, predict, test) repeat those of rule 1.

['Kicks BTN',
 'Handballs BTN',
 'Goals BTN',
 'Inside 50s BTN',
 'Clearances BTN',
 'Contested Possessions BTN',
 'Uncontested Possessions BTN',
 'Effective Disposals BTN',
 'Score Involvements BTN',
 'Metres Gained BTN',
 'Ineffective Disposals BTN']

In [24]:
# Trains LR model on the rule-2 columns
lm_2 = linear_model.LinearRegression()

# Features: inf/-inf/NaN replaced by 0, rows renumbered from 0
traindata_x_2 = (
    train_data[selected_features_2]
    .replace((np.inf, -np.inf, np.nan), 0)
    .reset_index(drop=True)
)
# Target: the votes column, rows renumbered from 0
traindata_y_2 = train_data['Brownlow Votes'].reset_index(drop=True)

model_2 = lm_2.fit(traindata_x_2, traindata_y_2)

In [25]:
# Score the held-out test games with model 2: predicted 3/2/1/0 votes plus the observed votes
predictions_2, testdata_y_2 = predict(test_games, lm_2, selected_features_2, choice)

In [26]:
# Confusion rates for the 4 vote classes: result1_2 per predicted class, result2_2 per observed class
result1_2, result2_2 = test(predictions_2, testdata_y_2, 4)

In [27]:
result1_2

[[0.9698583448716583,
  0.013952497603578655,
  0.01107679199062733,
  0.00511236553413569],
 [0.6200873362445415,
  0.1703056768558952,
  0.1091703056768559,
  0.10043668122270742],
 [0.4366812227074236,
  0.1222707423580786,
  0.2052401746724891,
  0.23580786026200873],
 [0.17903930131004367,
  0.13537117903930132,
  0.2314410480349345,
  0.45414847161572053]]

In [28]:
result2_2

[[0.9698583448716583,
  0.015124081371818084,
  0.010650761529449356,
  0.004366812227074236],
 [0.5720524017467249,
  0.1703056768558952,
  0.1222707423580786,
  0.13537117903930132],
 [0.45414847161572053,
  0.1091703056768559,
  0.2052401746724891,
  0.2314410480349345],
 [0.2096069868995633,
  0.10043668122270742,
  0.23580786026200873,
  0.45414847161572053]]

In [29]:
return_tp(result1_2)

(0.9698583448716583,
 0.1703056768558952,
 0.2052401746724891,
 0.45414847161572053)

3. Micro Rule of Feature Selection 3: 

*All cols that passed FS_val selected but abandon all 'summary' cols such as Disposal/Tackles/Marks*

In [30]:
# Apply micro rule 3 to the correlation-filtered columns (BT_OT macro rule switched off)
selected_features_3 = feature_selection2(selected_features, 3, False)
selected_features_3

['Kicks BTN',
 'Handballs BTN',
 'Goals BTN',
 'Inside 50s BTN',
 'Contested Possessions BTN',
 'Uncontested Possessions BTN',
 'Effective Disposals BTN',
 'Centre Clearances BTN',
 'Stoppage Clearances BTN',
 'Metres Gained BTN',
 'Behind Assists BTN',
 'Ineffective Disposals BTN']

In [31]:
# Trains LR model on the rule-3 columns
lm_3 = linear_model.LinearRegression()

# Features: inf/-inf/NaN replaced by 0, rows renumbered from 0
traindata_x_3 = (
    train_data[selected_features_3]
    .replace((np.inf, -np.inf, np.nan), 0)
    .reset_index(drop=True)
)
# Target: the votes column, rows renumbered from 0
traindata_y_3 = train_data['Brownlow Votes'].reset_index(drop=True)

model_3 = lm_3.fit(traindata_x_3, traindata_y_3)

In [32]:
# Score the held-out test games with model 3: predicted 3/2/1/0 votes plus the observed votes
predictions_3, testdata_y_3 = predict(test_games, lm_3, selected_features_3, choice)

In [33]:
# Confusion rates for the 4 vote classes: result1_3 per predicted class, result2_3 per observed class
result1_3, result2_3 = test(predictions_3, testdata_y_3, 4)

In [34]:
result1_3

[[0.9696453296410693,
  0.014165512834167643,
  0.011183299605921824,
  0.005005857918841197],
 [0.6419213973799127,
  0.14847161572052403,
  0.11353711790393013,
  0.09606986899563319],
 [0.42358078602620086,
  0.13100436681222707,
  0.19213973799126638,
  0.25327510917030566],
 [0.17903930131004367,
  0.13973799126637554,
  0.23580786026200873,
  0.44541484716157204]]

In [35]:
result2_3

[[0.9696453296410693,
  0.015656619448290554,
  0.010331238683565875,
  0.004366812227074236],
 [0.5807860262008734,
  0.14847161572052403,
  0.13100436681222707,
  0.13973799126637554],
 [0.4585152838427948,
  0.11353711790393013,
  0.19213973799126638,
  0.23580786026200873],
 [0.2052401746724891,
  0.09606986899563319,
  0.25327510917030566,
  0.44541484716157204]]

In [36]:
return_tp(result1_3)

(0.9696453296410693,
 0.14847161572052403,
 0.19213973799126638,
 0.44541484716157204)

4. Micro Rule of Feature Selection 4: 

*Exclude Disposals, otherwise as per rule 2*

In [37]:
# Apply micro rule 4 to the correlation-filtered columns (BT_OT macro rule switched off)
selected_features_4 = feature_selection2(selected_features, 4, False)
selected_features_4

['Kicks BTN',
 'Handballs BTN',
 'Goals BTN',
 'Inside 50s BTN',
 'Clearances BTN',
 'Contested Possessions BTN',
 'Uncontested Possessions BTN',
 'Effective Disposals BTN',
 'Score Involvements BTN',
 'Metres Gained BTN',
 'Ineffective Disposals BTN']

In [38]:
# Trains LR model on the rule-4 columns
lm_4 = linear_model.LinearRegression()

# Features: inf/-inf/NaN replaced by 0, rows renumbered from 0
traindata_x_4 = (
    train_data[selected_features_4]
    .replace((np.inf, -np.inf, np.nan), 0)
    .reset_index(drop=True)
)
# Target: the votes column, rows renumbered from 0
traindata_y_4 = train_data['Brownlow Votes'].reset_index(drop=True)

model_4 = lm_4.fit(traindata_x_4, traindata_y_4)

In [39]:
# Score the held-out test games with model 4: predicted 3/2/1/0 votes plus the observed votes
predictions_4, testdata_y_4 = predict(test_games, lm_4, selected_features_4, choice)

In [40]:
# Confusion rates for the 4 vote classes: result1_4 per predicted class, result2_4 per observed class
result1_4, result2_4 = test(predictions_4, testdata_y_4, 4)

In [41]:
result1_4

[[0.9698583448716583,
  0.013952497603578655,
  0.01107679199062733,
  0.00511236553413569],
 [0.6200873362445415,
  0.1703056768558952,
  0.1091703056768559,
  0.10043668122270742],
 [0.4366812227074236,
  0.1222707423580786,
  0.2052401746724891,
  0.23580786026200873],
 [0.17903930131004367,
  0.13537117903930132,
  0.2314410480349345,
  0.45414847161572053]]

In [42]:
result2_4

[[0.9698583448716583,
  0.015124081371818084,
  0.010650761529449356,
  0.004366812227074236],
 [0.5720524017467249,
  0.1703056768558952,
  0.1222707423580786,
  0.13537117903930132],
 [0.45414847161572053,
  0.1091703056768559,
  0.2052401746724891,
  0.2314410480349345],
 [0.2096069868995633,
  0.10043668122270742,
  0.23580786026200873,
  0.45414847161572053]]

In [43]:
return_tp(result1_4)

(0.9698583448716583,
 0.1703056768558952,
 0.2052401746724891,
 0.45414847161572053)

**3. Summary Observations**

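0. Demonstration of the helper function (intended to mirror the one in the BrownlowPredictorTools library). Note that running this cell replaces the imported version for the rest of the notebook.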

In [44]:
# import pandas as pd
from collections import defaultdict as dd
# import numpy as np

def wholeseason(final_test_games, lm, selected_features, choice):
    """ Helper function for running the empirical test.

    Scores every game file with the model, gives the three highest-scoring rows of each game
    3, 2 and 1 votes, and adds those votes to the player named in the 'Player' column.

    Returns the leaderboard: a list of (player, total votes) tuples, highest total first.
    """

    tally = dd(int) # player name -> accumulated votes

    for file in final_test_games:

        # Load the game and clean its feature columns (inf/-inf/NaN become 0)
        game = pd.read_csv(f'./Data/{choice}/{file}')
        features = game[selected_features].replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)

        # Raw regression outputs, one per row of the game
        scores = lm.predict(features)

        # Row positions ordered from highest to lowest score
        ranking = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

        # Best row earns 3 votes, second 2, third 1
        for place in range(3):
            row = ranking[place][0]
            tally[game.loc[row]['Player']] += 3 - place

    # Sort the leaderboard so top pollers are ranked first
    leaderboard = sorted(tally.items(), key=lambda entry: entry[1], reverse=True)

    return leaderboard

1. Empirical Experiment

In [45]:
# Runs the 2021 season through each of the four predictors and collects their leaderboards
leaderboard1, leaderboard2, leaderboard3, leaderboard4 = [
    wholeseason(final_test_games, fitted_model, feature_set, choice)
    for fitted_model, feature_set in (
        (lm_1, selected_features),
        (lm_2, selected_features_2),
        (lm_3, selected_features_3),
        (lm_4, selected_features_4),
    )
]

In [46]:
leaderboard1[0:15]

[('Oliver Wines', 34),
 ('Jack Steele', 33),
 ('Christian Petracca', 30),
 ('Clayton Oliver', 28),
 ('Darcy Parish', 28),
 ('Jarryd Lyons', 27),
 ('Tom Mitchell', 27),
 ('Jackson Macrae', 27),
 ('Marcus Bontempelli', 26),
 ('Rory Laird', 23),
 ('Jake Stringer', 21),
 ('Cameron Guthrie', 20),
 ('Touk Miller', 20),
 ('Luke Parker', 20),
 ('Sam Walsh', 19)]

In [47]:
leaderboard2[0:15]

[('Oliver Wines', 34),
 ('Jack Steele', 34),
 ('Clayton Oliver', 32),
 ('Jarryd Lyons', 28),
 ('Darcy Parish', 26),
 ('Marcus Bontempelli', 26),
 ('Rory Laird', 26),
 ('Christian Petracca', 25),
 ('Jackson Macrae', 25),
 ('Tom Mitchell', 24),
 ('Cameron Guthrie', 21),
 ('Touk Miller', 21),
 ('Sam Walsh', 20),
 ('Luke Parker', 20),
 ('Dominic Sheed', 19)]

In [48]:
leaderboard3[0:15]

[('Oliver Wines', 34),
 ('Jack Steele', 34),
 ('Clayton Oliver', 31),
 ('Tom Mitchell', 30),
 ('Christian Petracca', 29),
 ('Darcy Parish', 28),
 ('Marcus Bontempelli', 27),
 ('Jackson Macrae', 26),
 ('Jarryd Lyons', 25),
 ('Rory Laird', 25),
 ('Touk Miller', 21),
 ('Cameron Guthrie', 20),
 ('Dominic Sheed', 20),
 ('Thomas Liberatore', 19),
 ('Sam Walsh', 19)]

In [49]:
leaderboard4[0:15]

[('Oliver Wines', 34),
 ('Jack Steele', 34),
 ('Clayton Oliver', 32),
 ('Jarryd Lyons', 28),
 ('Darcy Parish', 26),
 ('Marcus Bontempelli', 26),
 ('Rory Laird', 26),
 ('Christian Petracca', 25),
 ('Jackson Macrae', 25),
 ('Tom Mitchell', 24),
 ('Cameron Guthrie', 21),
 ('Touk Miller', 21),
 ('Sam Walsh', 20),
 ('Luke Parker', 20),
 ('Dominic Sheed', 19)]

2. Predictors' R² scores (computed on the training data)

In [50]:
# Print each model's .score() on the same data it was fitted on, one line per model
fitted = (
    (lm_1, traindata_x_1, traindata_y_1),
    (lm_2, traindata_x_2, traindata_y_2),
    (lm_3, traindata_x_3, traindata_y_3),
    (lm_4, traindata_x_4, traindata_y_4),
)
for fitted_model, features, target in fitted:
    print(fitted_model.score(features, target))

0.23298806127226634
0.23093336046383292
0.23127768790264824
0.23093336046383292
