# 7.3 Building Final Model: ML

In [1]:
import pandas as pd
import pickle
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error

In [2]:
import lightgbm as LGB
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
import warnings
warnings.simplefilter('ignore')

## Read in and Prepare Data

In [4]:
train_data3 = pd.read_csv('../data/curated/modelling/N_Train_3.csv')
val_data3 = pd.read_csv('../data/curated/modelling/N_Validate_3.csv')
test_data3 = pd.read_csv('../data/curated/modelling/N_Test_3.csv')

train_data2 = pd.read_csv('../data/curated/modelling/N_Train_2.csv')
val_data2 = pd.read_csv('../data/curated/modelling/N_Validate_2.csv')
test_data2 = pd.read_csv('../data/curated/modelling/N_Test_2.csv')

train_data1 = pd.read_csv('../data/curated/modelling/N_Train_1.csv')
val_data1 = pd.read_csv('../data/curated/modelling/N_Validate_1.csv')
test_data1 = pd.read_csv('../data/curated/modelling/N_Test_1.csv')

In [5]:
train3_x = train_data3.drop(['3'], axis=1)
train3_y = train_data3['3']
val3_x = val_data3.drop(['3'], axis=1)
val3_y = val_data3['3']

train2_x = train_data2.drop(['2'], axis=1)
train2_y = train_data2['2']
val2_x = val_data2.drop(['2'], axis=1)
val2_y = val_data2['2']

train1_x = train_data1.drop(['1'], axis=1)
train1_y = train_data1['1']
val1_x = val_data1.drop(['1'], axis=1)
val1_y = val_data1['1']

## Train Models

### 3 Votes

In [6]:
gbr = GradientBoostingRegressor
lgb = LGB

In [7]:
model3 = gbr(learning_rate=0.01, 
    n_estimators=200, 
    subsample = 0.5, 
    max_features=0.5, 
    ccp_alpha=0, 
    max_depth=5, 
    random_state = 19260817)

model3.fit(train3_x, train3_y)

In [8]:
pred_3 = model3.predict(val3_x)
r2_score(val3_y, pred_3)

0.12857760288686337

### 2 Votes

In [9]:
params={'num_iterations': 50, 'max_depth': 5, 'bagging_fraction': 0.5, 'feature_fraction': 0.75}

model2 = lgb.train(params, lgb.Dataset(train2_x, label=train2_y))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 41373, number of used features: 29
[LightGBM] [Info] Start training from score 0.022648


In [10]:
pred_2 = model2.predict(val2_x)
r2_score(val2_y, pred_2)

0.10003083852819239

### 1 Vote

In [11]:
params={'num_iterations': 50, 'max_depth': 5, 'bagging_fraction': 0.25, 'feature_fraction': 0.5}

model1 = lgb.train(params, lgb.Dataset(train1_x, label=train1_y))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12243
[LightGBM] [Info] Number of data points in the train set: 41373, number of used features: 49
[LightGBM] [Info] Start training from score 0.932057


In [12]:
pred_1 = model1.predict(val1_x)
r2_score(val1_y, pred_1)

0.49328453200941724

## Export models

In [13]:
with open('../models/final_models/model3.pickle', 'wb') as f:
    pickle.dump(model3, f)

In [14]:
with open('../models/final_models/model2.pickle', 'wb') as f:
    pickle.dump(model2, f)

In [15]:
with open('../models/final_models/model1.pickle', 'wb') as f:
    pickle.dump(model1, f)

# Sample Inference

In [22]:
import sys
import os
import pickle
py_file_location = "../scripts"
sys.path.append(os.path.abspath(py_file_location))

import ZhongShan

In [23]:
with open(f'../models/AFL_pipeline_N.pickle', 'rb') as f:
    sanmin = pickle.load(f)
    
model3_COLS = sanmin.final_features['3']
model2_COLS = sanmin.final_features['2']
model1_COLS = sanmin.final_features['1']

model3_COLS = [x for x in model3_COLS if x not in ['3', '2', '1']]
model2_COLS = [x for x in model2_COLS if x not in ['3', '2', '1']]
model1_COLS = [x for x in model1_COLS if x not in ['3', '2', '1']]

In [24]:
import os
from collections import defaultdict as dd
manip_type = 'NormalisedData'

csv_list = os.listdir(f'../data/curated/{manip_type}')
csv_list.sort()

tally = dd(int)

data = pd.DataFrame()
for file in csv_list[1:]:
    if '2022' in file:
        data = pd.read_csv(f'../data/curated/{manip_type}/{file}')

        player = data['Player']
        pred3 = model3.predict(data[model3_COLS])
        pred2 = model2.predict(data[model2_COLS])
        pred1 = model1.predict(data[model1_COLS])
        pred = pd.DataFrame({'player': player, '3': pred3, '2': pred2, '1': pred1})

        three_votes = list(pred.sort_values('3', ascending = False)['player'])[0]

        two_votes = list(pred.sort_values('2', ascending = False)['player'])[0]
        if two_votes == three_votes:
            two_votes = list(pred.sort_values('2', ascending = False)['player'])[1]

        one_vote = list(pred.sort_values('1', ascending = False)['player'])[0]
        if one_vote in (three_votes, two_votes):
            one_vote = list(pred.sort_values('2', ascending = False)['player'])[1]
            
            if one_vote == two_votes:
                one_vote = list(pred.sort_values('2', ascending = False)['player'])[2]

        tally[three_votes] += 3
        tally[two_votes] += 2
        tally[one_vote] += 1
        

In [25]:
tally_list = list(tally.items())
tally_list.sort(key = lambda x:x[1], reverse=True)

In [26]:
tally_list

[('Clayton Oliver', 35),
 ('Patrick Cripps', 28),
 ('Lachie Neale', 22),
 ('Jeremy Cameron', 21),
 ('Touk Miller', 21),
 ('Christian Petracca', 20),
 ('Charlie Curnow', 20),
 ('Rory Laird', 20),
 ('Darcy Parish', 19),
 ('Sam Walsh', 17),
 ('Chad Warner', 17),
 ('Andrew Brayshaw', 16),
 ('Stephen Coniglio', 16),
 ('Taylor Walker', 16),
 ('Zachary Merrett', 16),
 ('Bailey Smith', 14),
 ('Thomas Liberatore', 14),
 ('Brad Crouch', 13),
 ('Hugh McCluggage', 13),
 ('Joshua Kelly', 12),
 ('Tom Hawkins', 12),
 ('Shai Bolton', 12),
 ('Tom Lynch', 12),
 ('Dion Prestia', 11),
 ('Max King', 11),
 ('Nick Larkey', 11),
 ('Callum Mills', 11),
 ('Josh Dunkley', 11),
 ('Luke Parker', 10),
 ('Tim Taranto', 10),
 ('Daniel Rich', 10),
 ('Karl Amon', 10),
 ('Will Brodie', 10),
 ('Todd Marshall', 9),
 ('Aaron Naughton', 9),
 ('Mitchell Lewis', 9),
 ('Peter Wright', 9),
 ('Jack Higgins', 9),
 ('Marcus Bontempelli', 9),
 ('Connor Rozee', 9),
 ('Oliver Wines', 9),
 ('Ben Keays', 8),
 ('George Hewett', 8),
 ('P

In [27]:
import os
from collections import defaultdict as dd
manip_type = 'NormalisedData'

csv_list = os.listdir(f'../data/curated/{manip_type}')
csv_list.sort()

tally = dd(int)

data = pd.DataFrame()
for file in csv_list[1:]:
    if '2022' in file:
        data = pd.read_csv(f'../data/curated/{manip_type}/{file}')

        player = data['Player']
        pred3 = model3.predict(data[model3_COLS])
        pred = pd.DataFrame({'player': player, '3': pred3})

        three_votes = list(pred.sort_values('3', ascending = False)['player'])[0]

        two_votes = list(pred.sort_values('3', ascending = False)['player'])[1]

        one_vote = list(pred.sort_values('3', ascending = False)['player'])[2]

        tally[three_votes] += 3
        tally[two_votes] += 2
        tally[one_vote] += 1
        

In [28]:
tally_list = list(tally.items())
tally_list.sort(key = lambda x:x[1], reverse=True)

In [29]:
tally_list

[('Clayton Oliver', 37),
 ('Patrick Cripps', 29),
 ('Rory Laird', 29),
 ('Christian Petracca', 27),
 ('Touk Miller', 25),
 ('Zachary Merrett', 24),
 ('Lachie Neale', 24),
 ('Jeremy Cameron', 21),
 ('Darcy Parish', 21),
 ('Oliver Wines', 20),
 ('Jackson Macrae', 19),
 ('Sam Walsh', 19),
 ('Charlie Curnow', 18),
 ('Andrew Brayshaw', 16),
 ('Luke Parker', 16),
 ('Will Brodie', 16),
 ('Noah Anderson', 16),
 ('Ben Keays', 15),
 ('Dion Prestia', 15),
 ('Brad Crouch', 14),
 ('Stephen Coniglio', 14),
 ('Tom Mitchell', 14),
 ('Bailey Smith', 14),
 ('Taylor Walker', 14),
 ('Thomas Liberatore', 14),
 ('Hugh McCluggage', 14),
 ('Cameron Guthrie', 13),
 ('Joshua Kelly', 13),
 ('Luke D-Uniacke', 13),
 ('Chad Warner', 12),
 ('Tom Hawkins', 12),
 ('Shai Bolton', 12),
 ('Travis Boak', 11),
 ('Connor Rozee', 11),
 ('Callum Mills', 11),
 ('Isaac Heeney', 10),
 ('Jy Simpkin', 10),
 ('Todd Marshall', 10),
 ('Peter Wright', 10),
 ('Jack Steele', 9),
 ('Max King', 9),
 ('Jade Gresham', 9),
 ('Lance Franklin'