# Proof of Concept 3. Double Linear Regression using Normalised Data/Both Teams #
## For Brownlow Predictor Project ##

Trains up 4 models using the 4 Macro Rules of Feature Selection using Normalised Data/Both Team Columns Only (FS_Val = 0.2 and Includes Winloss)

Uses experimental method of doing two steps of Linear Regression: the first to pick out the three players most likely to get votes, and the second to allocate those three players 3, 2 and 1. (This Proof of Concept only demonstrates adj_votes = 2) 

**Author: `Lang (Ron) Chen` 2021.12-2022.1**

___

**0. Import Libraries**

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

from BrownlowPredictorTools2.predict import predict1, predict2
from BrownlowPredictorTools2.test import test1, test2
from BrownlowPredictorTools.return_tp import return_tp
from BrownlowPredictorTools2.wholeseason import wholeseason
from BrownlowPredictorTools.feature_selection2 import feature_selection2

In [2]:
choice = 'NormalisedData'

In [3]:
filelist = os.listdir(f'./Data/{choice}')[1:]
# Remove the first file (an ipynb checkpoint file)

**1. Feature Selection**

In [4]:
# Gets list of emperical test games (full 2021 season)
final_test_games = [file for file in filelist if '2021' in file]

In [5]:
# Gathers full games list (except 2021) and performs a single Train-Test Split (note different from previous KFold)
test_train_games = [file for file in filelist if '2021' not in file]
train_games, test_games = train_test_split(test_train_games, train_size = 0.8, test_size = 0.2, random_state = 42)

In [6]:
# Read in pre-prepared sample data of trained data only 
# (the same rows as if we used concatenated all the data from the train_games list)
train_data = pd.read_csv('Train_Data (N).csv')

In [7]:
ADJ_VOTES = 2

# Means replace the all rows with votes's labels (Brownlow Votes) to 2 votes (the mean).
# Alternatives: 1 vote (min), 3 votes (max) 

Bootstrap

In [8]:
# Bootstraps data
        
# Picks out data labelled 1 vote, 2 votes, 3 votes 
zero = train_data[train_data['Brownlow Votes'] == 0]
one = train_data[train_data['Brownlow Votes'] == 1]
two = train_data[train_data['Brownlow Votes'] == 2]
three = train_data[train_data['Brownlow Votes'] == 3]
tagged1 = train_data[train_data['Brownlow Votes'] > 0]

# Replaces votes with 2
tagged1['Brownlow Votes'] = tagged1['Brownlow Votes'].replace([1, 2, 3], ADJ_VOTES)

# Concatenates to get training data for first step of linear regression
first_lr_data = pd.concat([zero, tagged1], axis = 0)

# Concatenates to get training data for second step of linear regression
second_lr_data = pd.concat([one, two, three], axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Feature Selection

*First LR*

In [9]:
cols = [col for col in first_lr_data.columns if 'BTN' in col]
cols

['Kicks BTN',
 'Handballs BTN',
 'Disposals BTN',
 'Marks BTN',
 'Goals BTN',
 'Behinds BTN',
 'Tackles BTN',
 'Hitouts BTN',
 'Goal Assists BTN',
 'Inside 50s BTN',
 'Clearances BTN',
 'Clangers BTN',
 'Rebound 50s BTN',
 'Frees For BTN',
 'Frees Agains BTN',
 'Contested Possessions BTN',
 'Uncontested Possessions BTN',
 'Effective Disposals BTN',
 'Contested Marks BTN',
 'Marks Inside 50 BTN',
 'One Percenters BTN',
 'Bounces BTN',
 'Centre Clearances BTN',
 'Stoppage Clearances BTN',
 'Score Involvements BTN',
 'Metres Gained BTN',
 'Turnovers BTN',
 'Intercepts BTN',
 'Tackles Inside 50 BTN',
 'Time On Ground % BTN',
 'Uncontested Marks BTN',
 'Marks Outside 50 BTN',
 'Tackles Outside 50 BTN',
 'Behind Assists BTN',
 'Ineffective Disposals BTN']

In [10]:
corr1 = dict()
for col in cols:
    corr1[col] = first_lr_data[[col, 'Brownlow Votes']].corr(method = 'pearson').loc[col]['Brownlow Votes']
corr1

{'Kicks BTN': 0.34254897608648405,
 'Handballs BTN': 0.29030397576598344,
 'Disposals BTN': 0.3967014843577754,
 'Marks BTN': 0.17775265182547748,
 'Goals BTN': 0.2523598064549557,
 'Behinds BTN': 0.09662104485601558,
 'Tackles BTN': 0.15930180626603438,
 'Hitouts BTN': 0.02092510488870956,
 'Goal Assists BTN': 0.12528034321935894,
 'Inside 50s BTN': 0.2632271751903158,
 'Clearances BTN': 0.3103631289602141,
 'Clangers BTN': -0.12867189957860564,
 'Rebound 50s BTN': 0.0361561147704681,
 'Frees For BTN': 0.158697279769273,
 'Frees Agains BTN': -0.03440033114228382,
 'Contested Possessions BTN': 0.3570183744319988,
 'Uncontested Possessions BTN': 0.2861417731894224,
 'Effective Disposals BTN': 0.3600097454375318,
 'Contested Marks BTN': 0.0923023114230299,
 'Marks Inside 50 BTN': 0.1551229988631366,
 'One Percenters BTN': -0.05231145263181946,
 'Bounces BTN': 0.08086235948731897,
 'Centre Clearances BTN': 0.2570033007396064,
 'Stoppage Clearances BTN': 0.2731161066848122,
 'Score Involve

In [11]:
corr1 = list(corr1.items())
corr1

[('Kicks BTN', 0.34254897608648405),
 ('Handballs BTN', 0.29030397576598344),
 ('Disposals BTN', 0.3967014843577754),
 ('Marks BTN', 0.17775265182547748),
 ('Goals BTN', 0.2523598064549557),
 ('Behinds BTN', 0.09662104485601558),
 ('Tackles BTN', 0.15930180626603438),
 ('Hitouts BTN', 0.02092510488870956),
 ('Goal Assists BTN', 0.12528034321935894),
 ('Inside 50s BTN', 0.2632271751903158),
 ('Clearances BTN', 0.3103631289602141),
 ('Clangers BTN', -0.12867189957860564),
 ('Rebound 50s BTN', 0.0361561147704681),
 ('Frees For BTN', 0.158697279769273),
 ('Frees Agains BTN', -0.03440033114228382),
 ('Contested Possessions BTN', 0.3570183744319988),
 ('Uncontested Possessions BTN', 0.2861417731894224),
 ('Effective Disposals BTN', 0.3600097454375318),
 ('Contested Marks BTN', 0.0923023114230299),
 ('Marks Inside 50 BTN', 0.1551229988631366),
 ('One Percenters BTN', -0.05231145263181946),
 ('Bounces BTN', 0.08086235948731897),
 ('Centre Clearances BTN', 0.2570033007396064),
 ('Stoppage Clear

In [12]:
selected_features1 = [col[0] for col in corr1 if col[1] > 0.2]
selected_features1

['Kicks BTN',
 'Handballs BTN',
 'Disposals BTN',
 'Goals BTN',
 'Inside 50s BTN',
 'Clearances BTN',
 'Contested Possessions BTN',
 'Uncontested Possessions BTN',
 'Effective Disposals BTN',
 'Centre Clearances BTN',
 'Stoppage Clearances BTN',
 'Score Involvements BTN',
 'Metres Gained BTN',
 'Behind Assists BTN',
 'Ineffective Disposals BTN']

*Second LR*

In [13]:
corr2 = dict()
for col in cols:
    corr2[col] = second_lr_data[[col, 'Brownlow Votes']].corr(method = 'pearson').loc[col]['Brownlow Votes']
corr2

{'Kicks BTN': 0.16806498165519884,
 'Handballs BTN': 0.11410782023284825,
 'Disposals BTN': 0.19265967566658948,
 'Marks BTN': 0.040029902523200135,
 'Goals BTN': 0.1118167497113381,
 'Behinds BTN': 0.06519021127478913,
 'Tackles BTN': 0.031672694163300666,
 'Hitouts BTN': -0.03054408421422031,
 'Goal Assists BTN': 0.08259384485804165,
 'Inside 50s BTN': 0.130298620435625,
 'Clearances BTN': 0.15623123854389537,
 'Clangers BTN': -0.025923232382053286,
 'Rebound 50s BTN': -0.023148361144809285,
 'Frees For BTN': 0.08207718476473534,
 'Frees Agains BTN': -0.017991706345267772,
 'Contested Possessions BTN': 0.19214196056197758,
 'Uncontested Possessions BTN': 0.09103031676176843,
 'Effective Disposals BTN': 0.17264034262690256,
 'Contested Marks BTN': 0.0378216125045446,
 'Marks Inside 50 BTN': 0.07494450916094313,
 'One Percenters BTN': -0.03729972021031512,
 'Bounces BTN': 0.036411041209316375,
 'Centre Clearances BTN': 0.13804380527703444,
 'Stoppage Clearances BTN': 0.1315271273965410

In [14]:
sort_corr2 = list(corr2.items())
sort_corr2

[('Kicks BTN', 0.16806498165519884),
 ('Handballs BTN', 0.11410782023284825),
 ('Disposals BTN', 0.19265967566658948),
 ('Marks BTN', 0.040029902523200135),
 ('Goals BTN', 0.1118167497113381),
 ('Behinds BTN', 0.06519021127478913),
 ('Tackles BTN', 0.031672694163300666),
 ('Hitouts BTN', -0.03054408421422031),
 ('Goal Assists BTN', 0.08259384485804165),
 ('Inside 50s BTN', 0.130298620435625),
 ('Clearances BTN', 0.15623123854389537),
 ('Clangers BTN', -0.025923232382053286),
 ('Rebound 50s BTN', -0.023148361144809285),
 ('Frees For BTN', 0.08207718476473534),
 ('Frees Agains BTN', -0.017991706345267772),
 ('Contested Possessions BTN', 0.19214196056197758),
 ('Uncontested Possessions BTN', 0.09103031676176843),
 ('Effective Disposals BTN', 0.17264034262690256),
 ('Contested Marks BTN', 0.0378216125045446),
 ('Marks Inside 50 BTN', 0.07494450916094313),
 ('One Percenters BTN', -0.03729972021031512),
 ('Bounces BTN', 0.036411041209316375),
 ('Centre Clearances BTN', 0.13804380527703444),


In [15]:
selected_features2 = [col[0] for col in sort_corr2 if col[1] > 0.2]
selected_features2

['Score Involvements BTN', 'Behind Assists BTN']

**2.Trains Models**

0. Demonstration of functions (completely same as those in libraries)

In [16]:
# import pandas as pd
# import numpy as np

def predict1(test_games, lm, selected_features, choice):
    """ takes test_games and model to output predictions (of whether a player is a votegetter) as a list and observations as a Series """
    
    prediction = list()
    
    testdata_y = pd.DataFrame()
    
    for file in test_games:
        
        # Open test game file
        df = pd.read_csv(f'./Data/{choice}/{file}')
        
        # Initialise a list for this game
        tmp = [0 for i in range(len(df))]
        
        # Make predictions for 3 players most likely to get votes
        x_final = df[selected_features].replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
        y_pred = lm.predict(x_final)
        
        # prepares test data by replacing the votes with 1 (1 means True/votegetter)
        df['Brownlow Votes'] = df['Brownlow Votes'].replace([1, 2, 3], 1)
        testdata_y = pd.concat([testdata_y, df['Brownlow Votes']], axis=0)
        
        # records the top three scorers with 1 (denoting 'marked as True/votegetter')
        enumerated = [(i, score) for i, score in enumerate(y_pred)]
        enumerated.sort(key = lambda x:x[1], reverse = True)
        
        for i in range(3):
            tmp[enumerated[i][0]] = 1
        
        # Can un-comment to observe what actual scores the linear regression is outputting  
#         print(enumerated[0:3])
#         print(enumerated[-3:])
        
        prediction.extend(tmp)
        
    return prediction, testdata_y


In [17]:
# import pandas as pd
# import numpy as np

def predict2(test_games, lm, selected_features, choice):
    """ takes test_games and model to output predictions as a list and observations as a Series """
    
    prediction = list()
    
    testdata_y = pd.DataFrame()
    
    for file in test_games:
        
        # Open test game file
        df = pd.read_csv(f'./Data/{choice}/{file}')
        
        # slice out just the votegetters
        df = df[df['Brownlow Votes'] > 0]
        
        # initialises list
        tmp = [0 for i in range(len(df))]
        
        # Make predictions
        x_final = df[selected_features].replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
        y_pred = lm.predict(x_final)
        
        # Collect the actual observations
        testdata_y = pd.concat([testdata_y, df['Brownlow Votes']], axis=0)
        
        # Find top 3 scoring players for the game and allocate 3, 2, 1 votes to their respective index on the tmp list
        enumerated = [(i, score) for i, score in enumerate(y_pred)]
        enumerated.sort(key = lambda x:x[1], reverse = True)
        
        tmp[enumerated[0][0]] = 3
        tmp[enumerated[1][0]] = 2
        tmp[enumerated[2][0]] = 1
        
        # Can un-comment to observe what actual scores the linear regression is outputting 
#         print(enumerated[0:3])
#         print(enumerated[-3:])
        
        prediction.extend(tmp)
    
    testdata_y.index = range(0, len(testdata_y))
    
    return prediction, testdata_y

In [18]:
def test1(predictions, testdata_y, nchoice):
    """ Calculates the tp/tn for no votes, have vote. Polymorphic as to how many choices there are """
    
    # instantiate tally
    result1 = list() # tp/tn/tp/fp calculated with respect to predictions (i.e. predicted 1, obs 0 => contribute to fp 1)
    result2 = list() # tp/tn/tp/fp calculated with respect to observations (i.e. predicted 1, obs 0 => contribute to fp 0)
    
    # Initialise the result1 and result2 tallys based on nchoice
    for i in range(nchoice):
        tmp1 = [0 for j in range(nchoice)]
        tmp2 = [0 for j in range(nchoice)]
        result1.append(tmp1)
        result2.append(tmp2)
        
    # Run through the predictions and add to tally according to whether it is tp/tn/fp/fn
    for i in range(len(predictions)):
        result1[predictions[i]][int(testdata_y.iloc[i][0])] += 1
        result2[int(testdata_y.iloc[i][0])][predictions[i]] += 1
    
     # Find the sum of each row and then take percentage based on it (because we are taking tp/fp/tn/fn with respect to either predictions or observations rather than total)
    for i in range(nchoice):
        
        sum_row1 = sum(result1[i])
        sum_row2 = sum(result2[i])
        
        for j in range(nchoice):
            result1[i][j] = result1[i][j]/sum_row1
            result2[i][j] = result2[i][j]/sum_row2
            
    return result1, result2

In [19]:
def test2(predictions, testdata_y, nchoice):
    """ Same as above except special adjustment for 1 vote, 2 votes 3 votes"""
    
    result1 = list()
    result2 = list()
    
    for i in range(nchoice):
        tmp1 = [0 for j in range(nchoice)]
        tmp2 = [0 for j in range(nchoice)]
        result1.append(tmp1)
        result2.append(tmp2)
    
    for i in range(len(predictions)):
        result1[predictions[i]-1][int(testdata_y.iloc[i][0])-1] += 1 # the use of -1 is a special adjustment
        result2[int(testdata_y.iloc[i][0])-1][predictions[i]-1] += 1
    
    for i in range(nchoice):
        
        sum_row1 = sum(result1[i])
        sum_row2 = sum(result2[i])
        
        for j in range(nchoice):
            result1[i][j] = result1[i][j]/sum_row1
            result2[i][j] = result2[i][j]/sum_row2
            
    return result1, result2

Also avaliable is the predict1_mass, predict2_mass, test1_mass and test2_mass function used in the scripts - not used in Proof of Concepts

In [20]:
# import pandas as pd
# import numpy as np

def predict1_mass(test_games, lm, selected_features, choice):
    """ takes test_games and model to output predictions (of whether a player is a votegetter) as a list and observations as a Series """
    
    prediction = list()
    
    testdata_y = pd.DataFrame()
    
    for file in test_games:
        
        # Open test game file
        df = pd.read_csv(f'./Data/{choice}/{file}')
        
        # Initialise a list for this game
        tmp = [0 for i in range(len(df))]
        
        # Make predictions for 3 players most likely to get votes
        x_final = df[selected_features].replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
        y_pred = lm.predict(x_final)
        
        # prepares test data by replacing the votes with 1 (1 means True/votegetter)
        df['Brownlow Votes'] = df['Brownlow Votes'].replace([1, 2, 3], 1)
        testdata_y = pd.concat([testdata_y, df['Brownlow Votes']], axis=0)
        
        # records the top three scorers with 1 (denoting 'marked as True/votegetter')
        enumerated = [(i, score) for i, score in enumerate(y_pred)]
        enumerated.sort(key = lambda x:x[1], reverse = True)
        
        for i in range(3):
            tmp[enumerated[i][0]] = 1
        
        # Can un-comment to observe what actual scores the linear regression is outputting  
#         print(enumerated[0:3])
#         print(enumerated[-3:])
        
        prediction.extend(tmp)
        
    out = pd.DataFrame({'predictions': prediction, 'observations': list(testdata_y[testdata_y.columns[0]])})
    
    return out

In [21]:
# import pandas as pd
# import numpy as np

def predict2_mass(test_games, lm, selected_features, choice):
    """ takes test_games and model to output predictions as a list and observations as a Series """
    
    prediction = list()
    
    testdata_y = pd.DataFrame()
    
    for file in test_games:
        
        # Open test game file
        df = pd.read_csv(f'./Data/{choice}/{file}')
        
        # slice out just the votegetters
        df = df[df['Brownlow Votes'] > 0]
        
        # initialises list
        tmp = [0 for i in range(len(df))]
        
        # Make predictions
        x_final = df[selected_features].replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
        y_pred = lm.predict(x_final)
        
        # Collect the actual observations
        testdata_y = pd.concat([testdata_y, df['Brownlow Votes']], axis=0)
        
        # Find top 3 scoring players for the game and allocate 3, 2, 1 votes to their respective index on the tmp list
        enumerated = [(i, score) for i, score in enumerate(y_pred)]
        enumerated.sort(key = lambda x:x[1], reverse = True)
        
        tmp[enumerated[0][0]] = 3
        tmp[enumerated[1][0]] = 2
        tmp[enumerated[2][0]] = 1
        
        # Can un-comment to observe what actual scores the linear regression is outputting 
#         print(enumerated[0:3])
#         print(enumerated[-3:])
        
        prediction.extend(tmp)
    
    testdata_y.index = range(0, len(testdata_y))
    
    out = pd.DataFrame({'predictions': prediction, 'observations': list(testdata_y[testdata_y.columns[0]])})
    
    return out

In [22]:
# import pandas as pd

def test1_mass(out, adj_vote):
    """ Helper function for returning tp/tn for mass testing for step 1 (more efficient as does not collect other stats)"""
    
    tp0 = len(out[(out['predictions'] == out['observations'].astype(int)) & (out['predictions'] == 0)])/len(out[(out['predictions'] == 0)])
    tp1 = len(out[(out['predictions'] == out['observations'].astype(int)) & (out['predictions'] == 1)])/len(out[(out['predictions'] == adj_vote)])
    
    return (tp0, tp1,)

In [23]:
# import pandas as pd

def test2_mass(out):
    """ Helper function for returning tp/tn for mass testing for step 2 (more efficient as does not collect other stats)"""
    
    tp1 = len(out[(out['predictions'] == out['observations'].astype(int)) & (out['predictions'] == 1)])/len(out[(out['predictions'] == 1)])
    tp2 = len(out[(out['predictions'] == out['observations'].astype(int)) & (out['predictions'] == 2)])/len(out[(out['predictions'] == 2)])
    tp3 = len(out[(out['predictions'] == out['observations'].astype(int)) & (out['predictions'] == 3)])/len(out[(out['predictions'] == 3)])
    
    return (tp1, tp2, tp3,)

1. Micro Rule of Feature Selection 1: 

*All cols that passed FS_val selected*

In [24]:
# Trains LR model for step 1
traindataf_x_1 = first_lr_data[selected_features1]
traindataf_x_1.index = range(0,len(first_lr_data))
traindataf_y_1 = first_lr_data['Brownlow Votes']
traindataf_y_1.index = range(0,len(first_lr_data))

lm_f_1 = linear_model.LinearRegression()
traindataf_x_1 = traindataf_x_1.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
modelf_1 = lm_f_1.fit(traindataf_x_1, traindataf_y_1)

In [25]:
# Get predictions and observations for step 1
predictionsf_1, testdataf_y_1 = predict1(test_games, lm_f_1, selected_features1, choice)

In [26]:
# Get True Positive/True Negative results for step 1
resultf1_1, resultf2_1 = test1(predictionsf_1, testdataf_y_1, 2)

In [27]:
# TP/TN based on what was predicted for step 1
resultf1_1

[[0.9702843753328363, 0.029715624667163702],
 [0.40611353711790393, 0.5938864628820961]]

In [28]:
# TP/TN based on what was observed for step 1
resultf2_1

[[0.9702843753328363, 0.029715624667163702],
 [0.40611353711790393, 0.5938864628820961]]

In [29]:
# Only the True Positive Values for step 1
return_tp(resultf1_1)

(0.9702843753328363, 0.5938864628820961)

In [30]:
# Trains LR model for step 2
traindatas_x_1 = second_lr_data[selected_features2]
traindatas_x_1.index = range(0,len(second_lr_data))
traindatas_y_1 = second_lr_data['Brownlow Votes']
traindatas_y_1.index = range(0,len(second_lr_data))

lm_s_1 = linear_model.LinearRegression()
traindatas_x_1 = traindatas_x_1.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
models_1 = lm_s_1.fit(traindatas_x_1, traindatas_y_1)

In [31]:
# Get predictions and observations for step 2
predictionss_1, testdatas_y_1 = predict2(test_games, lm_s_1, selected_features2, choice)

In [32]:
# Get True Positive/True Negative results for step 2
results1_1, results2_1 = test2(predictionss_1, testdatas_y_1, 3)

In [33]:
# TP/TN based on what was predicted for step 2
results1_1

[[0.42358078602620086, 0.3406113537117904, 0.23580786026200873],
 [0.31877729257641924, 0.34934497816593885, 0.3318777292576419],
 [0.2576419213973799, 0.31004366812227074, 0.43231441048034935]]

In [34]:
# TP/TN based on what was observed for step 2
results2_1

[[0.42358078602620086, 0.31877729257641924, 0.2576419213973799],
 [0.3406113537117904, 0.34934497816593885, 0.31004366812227074],
 [0.23580786026200873, 0.3318777292576419, 0.43231441048034935]]

In [35]:
# Only the True Positive Values for step 2
return_tp(results1_1)

(0.42358078602620086, 0.34934497816593885, 0.43231441048034935)

2. Micro Rule of Feature Selection 2

*-For those with dependency/triangle relationships (i.e. A=Disposals/B=Kicks/C=Handballs), if A comes first then B, C excluded. If B or C comes first then A excluded*

In [36]:
# Selects feature according to micro FS_Rule for step-1
selected_features1_2 = feature_selection2(selected_features1, 2, False)
selected_features1_2

['Kicks BTN',
 'Handballs BTN',
 'Goals BTN',
 'Inside 50s BTN',
 'Clearances BTN',
 'Contested Possessions BTN',
 'Uncontested Possessions BTN',
 'Effective Disposals BTN',
 'Score Involvements BTN',
 'Metres Gained BTN',
 'Ineffective Disposals BTN']

In [37]:
# Selects feature according to micro FS_Rule for step-2
selected_features2_2 = feature_selection2(selected_features2, 2, False)
selected_features2_2
#All other operations hereonin same as 1.

['Score Involvements BTN']

In [38]:
traindataf_x_2 = first_lr_data[selected_features1_2]
traindataf_x_2.index = range(0,len(first_lr_data))
traindataf_y_2 = first_lr_data['Brownlow Votes']
traindataf_y_2.index = range(0,len(first_lr_data))

lm_f_2 = linear_model.LinearRegression()
traindataf_x_2 = traindataf_x_2.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
modelf_2 = lm_f_2.fit(traindataf_x_2, traindataf_y_2)

predictionsf_2, testdataf_y_2 = predict1(test_games, lm_f_2, selected_features1_2, choice)

resultf1_2, resultf2_2 = test1(predictionsf_2, testdataf_y_2, 2)

In [39]:
resultf1_2

[[0.9697518372563638, 0.03024816274363617],
 [0.413391557496361, 0.586608442503639]]

In [40]:
resultf2_2

[[0.9697518372563638, 0.03024816274363617],
 [0.413391557496361, 0.586608442503639]]

In [41]:
return_tp(resultf1_2)

(0.9697518372563638, 0.586608442503639)

In [42]:
traindatas_x_2 = second_lr_data[selected_features2_2]
traindatas_x_2.index = range(0,len(second_lr_data))
traindatas_y_2 = second_lr_data['Brownlow Votes']
traindatas_y_2.index = range(0,len(second_lr_data))

lm_s_2 = linear_model.LinearRegression()
traindatas_x_2 = traindatas_x_2.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
models_2 = lm_s_2.fit(traindatas_x_2, traindatas_y_2)

In [43]:
predictionss_2, testdatas_y_2 = predict2(test_games, lm_s_2, selected_features2_2, choice)

In [44]:
results1_2, results2_2 = test2(predictionss_2, testdatas_y_2, 3)

In [45]:
results1_2

[[0.43231441048034935, 0.32751091703056767, 0.24017467248908297],
 [0.31877729257641924, 0.38427947598253276, 0.29694323144104806],
 [0.24890829694323144, 0.28820960698689957, 0.462882096069869]]

In [46]:
results2_2

[[0.43231441048034935, 0.31877729257641924, 0.24890829694323144],
 [0.32751091703056767, 0.38427947598253276, 0.28820960698689957],
 [0.24017467248908297, 0.29694323144104806, 0.462882096069869]]

In [47]:
return_tp(results1_2)

(0.43231441048034935, 0.38427947598253276, 0.462882096069869)

3. Micro Rule of Feature Selection 3: 

*All cols that passed FS_val selected but abandon all 'summary' cols such as Disposal/Tackles/Marks*

In [48]:
selected_features1_3 = feature_selection2(selected_features1, 3, False)
selected_features1_3

['Kicks BTN',
 'Handballs BTN',
 'Goals BTN',
 'Inside 50s BTN',
 'Contested Possessions BTN',
 'Uncontested Possessions BTN',
 'Effective Disposals BTN',
 'Centre Clearances BTN',
 'Stoppage Clearances BTN',
 'Metres Gained BTN',
 'Behind Assists BTN',
 'Ineffective Disposals BTN']

In [49]:
selected_features2_3 = feature_selection2(selected_features2, 3, False)
selected_features2_3

['Behind Assists BTN']

In [50]:
traindataf_x_3 = first_lr_data[selected_features1_3]
traindataf_x_3.index = range(0,len(first_lr_data))
traindataf_y_3 = first_lr_data['Brownlow Votes']
traindataf_y_3.index = range(0,len(first_lr_data))

lm_f_3 = linear_model.LinearRegression()
traindataf_x_3 = traindataf_x_3.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
modelf_3 = lm_f_3.fit(traindataf_x_3, traindataf_y_3)

predictionsf_3, testdataf_y_3 = predict1(test_games, lm_f_3, selected_features1_3, choice)

resultf1_3, resultf2_3 = test1(predictionsf_3, testdataf_y_3, 2)

In [51]:
resultf1_3

[[0.9695388220257748, 0.030461177974225158],
 [0.4163027656477438, 0.5836972343522562]]

In [52]:
resultf2_3

[[0.9695388220257748, 0.030461177974225158],
 [0.4163027656477438, 0.5836972343522562]]

In [53]:
return_tp(resultf1_3)

(0.9695388220257748, 0.5836972343522562)

In [54]:
traindatas_x_3 = second_lr_data[selected_features2_3]
traindatas_x_3.index = range(0,len(second_lr_data))
traindatas_y_3 = second_lr_data['Brownlow Votes']
traindatas_y_3.index = range(0,len(second_lr_data))

lm_s_3 = linear_model.LinearRegression()
traindatas_x_3 = traindatas_x_3.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
models_3 = lm_s_3.fit(traindatas_x_3, traindatas_y_3)

predictionss_3, testdatas_y_3 = predict2(test_games, lm_s_3, selected_features2_3, choice)

results1_3, results2_3 = test2(predictionss_3, testdatas_y_3, 3)

In [55]:
results1_3

[[0.4148471615720524, 0.33624454148471616, 0.24890829694323144],
 [0.3624454148471616, 0.3537117903930131, 0.2838427947598253],
 [0.22270742358078602, 0.31004366812227074, 0.4672489082969432]]

In [56]:
results2_3

[[0.4148471615720524, 0.3624454148471616, 0.22270742358078602],
 [0.33624454148471616, 0.3537117903930131, 0.31004366812227074],
 [0.24890829694323144, 0.2838427947598253, 0.4672489082969432]]

In [57]:
return_tp(results1_3)

(0.4148471615720524, 0.3537117903930131, 0.4672489082969432)

4. Micro Rule of Feature Selection 4: 

*Exclude Disposals, otherwise as per rule 2*

In [58]:
selected_features1_4 = feature_selection2(selected_features1, 4, False)
selected_features1_4

['Kicks BTN',
 'Handballs BTN',
 'Goals BTN',
 'Inside 50s BTN',
 'Clearances BTN',
 'Contested Possessions BTN',
 'Uncontested Possessions BTN',
 'Effective Disposals BTN',
 'Score Involvements BTN',
 'Metres Gained BTN',
 'Ineffective Disposals BTN']

In [59]:
selected_features2_4 = feature_selection2(selected_features1, 4, False)
selected_features2_4

['Kicks BTN',
 'Handballs BTN',
 'Goals BTN',
 'Inside 50s BTN',
 'Clearances BTN',
 'Contested Possessions BTN',
 'Uncontested Possessions BTN',
 'Effective Disposals BTN',
 'Score Involvements BTN',
 'Metres Gained BTN',
 'Ineffective Disposals BTN']

In [60]:
traindataf_x_4 = first_lr_data[selected_features1_4]
traindataf_x_4.index = range(0,len(first_lr_data))
traindataf_y_4 = first_lr_data['Brownlow Votes']
traindataf_y_4.index = range(0,len(first_lr_data))

lm_f_4 = linear_model.LinearRegression()
traindataf_x_4 = traindataf_x_4.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
modelf_4 = lm_f_4.fit(traindataf_x_4, traindataf_y_4)

predictionsf_4, testdataf_y_4 = predict1(test_games, lm_f_4, selected_features1_4, choice)

resultf1_4, resultf2_4 = test1(predictionsf_4, testdataf_y_4, 2)

In [61]:
resultf1_4

[[0.9697518372563638, 0.03024816274363617],
 [0.413391557496361, 0.586608442503639]]

In [62]:
resultf2_4

[[0.9697518372563638, 0.03024816274363617],
 [0.413391557496361, 0.586608442503639]]

In [63]:
return_tp(resultf1_4)

(0.9697518372563638, 0.586608442503639)

In [64]:
traindatas_x_4 = second_lr_data[selected_features2_4]
traindatas_x_4.index = range(0,len(second_lr_data))
traindatas_y_4 = second_lr_data['Brownlow Votes']
traindatas_y_4.index = range(0,len(second_lr_data))

lm_s_4 = linear_model.LinearRegression()
traindatas_x_4 = traindatas_x_4.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
models_4 = lm_s_4.fit(traindatas_x_4, traindatas_y_4)

predictionss_4, testdatas_y_4 = predict2(test_games, lm_s_4, selected_features2_4, choice)

results1_4, results2_4 = test2(predictionss_4, testdatas_y_4, 3)

In [65]:
results1_4

[[0.5545851528384279, 0.3318777292576419, 0.11353711790393013],
 [0.25327510917030566, 0.37117903930131, 0.37554585152838427],
 [0.19213973799126638, 0.29694323144104806, 0.5109170305676856]]

In [66]:
results2_4

[[0.5545851528384279, 0.25327510917030566, 0.19213973799126638],
 [0.3318777292576419, 0.37117903930131, 0.29694323144104806],
 [0.11353711790393013, 0.37554585152838427, 0.5109170305676856]]

In [67]:
return_tp(results1_4)

(0.5545851528384279, 0.37117903930131, 0.5109170305676856)

**3. Summary Observations**

0. Demonstration of functions (completely same as those in libraries)

In [68]:
# import pandas as pd
from collections import defaultdict as dd
# import numpy as np

def wholeseason(final_test_games, lm1, lm2, selected_features1, selected_features2, choice):
    """ Helper function for running emperical test - returns tuple of leaderboard of players for the season (with votes) """
    
    players = dd(int)
    
    for file in final_test_games:
        
        # Open each final test season's game
        df = pd.read_csv(f'./Data/{choice}/{file}')
        
        # Run predictions for step 1
        x_final = df[selected_features1].replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
        y_pred = lm1.predict(x_final)
        
        # find top 3 scoring players of the game for step 1 - the predicted votegetters
        enumerated = [(i, score) for i, score in enumerate(y_pred)]
        enumerated.sort(key = lambda x:x[1], reverse = True)
        
        # Find index for the predicted votegetters and make relevant dataframe
        secondround = [x[0] for x in enumerated[0:3]]
        
        df2 = df.iloc[secondround]
        df2.index = list(range(3))
        
        # Run predictions for step 2
        y_pred2 = lm2.predict(df2[selected_features2])
        
        # Rank the top 3 players of the game from highest score to lowest
        enumerated2 = [(i, score) for i, score in enumerate(y_pred2)]
        enumerated2.sort(key = lambda x:x[1], reverse = True)
        
         # Find their (top 3 players) names and insert them into the tally by adding 3 votes, 2 votes and 1 vote respectively
        for j in range(3):
            index = secondround[enumerated2[j][0]]
            players[df.loc[index]['Player']] += (3-j)
    
    # Sort the leaderboard so top pollers are ranked first
    leaderboard = sorted(list(players.items()), reverse = True, key = lambda x:x[1])
    
    return leaderboard

1. Emperical Experiment

In [69]:
leaderboard1 = wholeseason(final_test_games, lm_f_1, lm_s_1, selected_features1, selected_features2, choice)
leaderboard2 = wholeseason(final_test_games, lm_f_2, lm_s_2, selected_features1_2, selected_features2_2, choice)
leaderboard3 = wholeseason(final_test_games, lm_f_3, lm_s_3, selected_features1_3, selected_features2_3, choice)
leaderboard4 = wholeseason(final_test_games, lm_f_4, lm_s_4, selected_features1_4, selected_features2_4, choice)

In [70]:
leaderboard1[0:15]

[('Christian Petracca', 32),
 ('Oliver Wines', 31),
 ('Clayton Oliver', 27),
 ('Darcy Parish', 27),
 ('Jarryd Lyons', 26),
 ('Jack Steele', 26),
 ('Marcus Bontempelli', 24),
 ('Tom Mitchell', 21),
 ('Jackson Macrae', 21),
 ('David Mundy', 21),
 ('Sam Walsh', 20),
 ('Luke Parker', 20),
 ('Jake Stringer', 20),
 ('Cameron Guthrie', 18),
 ('Travis Boak', 18)]

In [71]:
leaderboard2[0:15]

[('Clayton Oliver', 30),
 ('Christian Petracca', 29),
 ('Oliver Wines', 27),
 ('Jack Steele', 27),
 ('Jarryd Lyons', 26),
 ('Darcy Parish', 26),
 ('Sam Walsh', 24),
 ('Marcus Bontempelli', 23),
 ('Thomas Liberatore', 21),
 ('Travis Boak', 21),
 ('David Mundy', 21),
 ('Cameron Guthrie', 19),
 ('Jackson Macrae', 19),
 ('Tom Mitchell', 18),
 ('Rory Laird', 18)]

In [72]:
leaderboard3[0:15]

[('Christian Petracca', 34),
 ('Oliver Wines', 31),
 ('Clayton Oliver', 28),
 ('Darcy Parish', 28),
 ('Jack Steele', 26),
 ('Jarryd Lyons', 25),
 ('Sam Walsh', 22),
 ('Tom Mitchell', 21),
 ('Jackson Macrae', 21),
 ('Marcus Bontempelli', 21),
 ('Cameron Guthrie', 20),
 ('David Mundy', 20),
 ('Rory Laird', 19),
 ('Callum Mills', 18),
 ('Thomas Liberatore', 18)]

In [73]:
leaderboard4[0:15]

[('Oliver Wines', 33),
 ('Jack Steele', 32),
 ('Darcy Parish', 30),
 ('Jarryd Lyons', 29),
 ('Clayton Oliver', 29),
 ('Christian Petracca', 27),
 ('Tom Mitchell', 26),
 ('Touk Miller', 24),
 ('Jackson Macrae', 24),
 ('Marcus Bontempelli', 23),
 ('Rory Laird', 22),
 ('Cameron Guthrie', 21),
 ('Sam Walsh', 21),
 ('Travis Boak', 20),
 ('Taylor Adams', 18)]

2. Predictor's r scores

In [74]:
print(lm_f_1.score(traindataf_x_1, traindataf_y_1))
print(lm_s_1.score(traindatas_x_1, traindatas_y_1))
print(lm_f_2.score(traindataf_x_2, traindataf_y_2))
print(lm_s_2.score(traindatas_x_2, traindatas_y_2))
print(lm_f_3.score(traindataf_x_3, traindataf_y_3))
print(lm_s_3.score(traindatas_x_3, traindatas_y_3))
print(lm_f_4.score(traindataf_x_4, traindataf_y_4))
print(lm_s_4.score(traindatas_x_4, traindatas_y_4))

0.24076122459350024
0.05801737930048212
0.23898619444693103
0.057988964505007116
0.2391997414305683
0.050570156027257
0.23898619444693103
0.1330176528809971


## Note: A few improvements could be made on this notebook: ##

*1. A good way to get real tp1, tp2, tp3 stats may be to use the actual predicted indexes from predict1 in predict2. Current model only tests lm2's ability to rank correctly given the top 3 players, and assumes it has similar capabilities for any three players*