# 7.3 Building Final Model: ML

In [1]:
import pandas as pd
import pickle
from sklearn.metrics import r2_score

In [30]:
from PyTorch2Sklearn.MLP import MLP
from PyTorch2Sklearn.Transformer import Transformer
import torch.nn as nn
from PyTorch2Sklearn.utils.data import TabularDataset, TabularDataFactory
from sklearn.ensemble import GradientBoostingRegressor

In [5]:
import warnings
warnings.simplefilter('ignore')

## Read in and Prepare Data

In [6]:
train_data = pd.read_csv('../data/curated/modelling/train.csv')
val_data = pd.read_csv('../data/curated/modelling/val.csv')
test_data = pd.read_csv('../data/curated/modelling/test.csv')

with open('../models/feature_importance_ordering.pickle', 'rb') as f:
    feature_importance_ordering = pickle.load(f)

## Train Models

In [25]:
gbr = GradientBoostingRegressor(learning_rate=0.01,
                                   n_estimators=800,
                                   subsample=0.25,
                                   max_features=0.7,
                                   min_samples_split=64,
                                   max_depth=96,
                                   random_state=42)

gbr.fit(train_data[list(list(feature_importance_ordering.keys())[
           36])], train_data['target'])

In [28]:
gbr_pred = gbr.predict(val_data[list(list(feature_importance_ordering.keys())[
    36])])
r2_score(val_data['target'], gbr_pred)

0.9035036492718063

### 2 Votes

In [31]:
params = {'hidden_dim': 16, 'num_transformer_layers': 1, 'num_mlp_layers': 1, 'dropout': 0, 'batch_size': 128, 'nhead': 2, 'share_embedding_mlp': False, 'use_cls': False, 'epochs': 10, 'lr': 0.0001, 'batchnorm': False, 'grad_clip': False, 'random_state': 42, 'loss': nn.MSELoss(), 'input_dim': 37, 'output_dim': 1, 'mode': 'Regression', 'verbose': True, 'TabularDataFactory': TabularDataFactory,
          'TabularDataset': TabularDataset, }

tf = Transformer(**params)
tf.fit(train_data[list(list(feature_importance_ordering.keys())[
           36])], train_data['target'])

100%|██████████| 10/10 [02:48<00:00, 16.86s/it]


In [34]:
tf_pred = tf.predict(val_data[list(list(feature_importance_ordering.keys())[
    36])])
r2_score(val_data['target'], tf_pred)

0.9019904467979647

### 1 Vote

In [35]:
params = {'hidden_dim': 256, 'hidden_layers': 2, 'dropout': 0, 'batch_size': 32, 'epochs': 15, 'lr': 0.0001, 'batchnorm': False, 'grad_clip': False, 'random_state': 42, 'loss': nn.MSELoss(), 'input_dim': 37, 'output_dim': 1, 'mode': 'Regression', 'verbose': True, 'TabularDataFactory': TabularDataFactory,
          'TabularDataset': TabularDataset, }

mlp = MLP(**params)
mlp.fit(train_data[list(list(feature_importance_ordering.keys())[
           36])], train_data['target'])

100%|██████████| 15/15 [01:14<00:00,  4.96s/it]


In [37]:
mlp_pred = mlp.predict(train_data[list(list(feature_importance_ordering.keys())[
    36])])
r2_score(train_data['target'], mlp_pred)

0.9092463304640118

## Export models

In [38]:
with open('../models/final_models/gbr.pickle', 'wb') as f:
    pickle.dump(gbr, f)

In [39]:
with open('../models/final_models/transformer.pickle', 'wb') as f:
    pickle.dump(tf, f)

In [40]:
with open('../models/final_models/mlp.pickle', 'wb') as f:
    pickle.dump(mlp, f)

# Sample Inference

In [87]:
import os
from collections import defaultdict as dd
manip_type = 'NormalisedData'

csv_list = os.listdir(f'../data/curated/{manip_type}')
csv_list.sort()

tally = dd(int)

data = pd.DataFrame()
for file in csv_list[1:]:
    if '2023' in file:
        data = pd.read_csv(f'../data/curated/{manip_type}/{file}')

        player = data['Player']
        pred = tf.predict(
            data[list(list(feature_importance_ordering.keys())[36])])
        pred = pd.DataFrame({'player': player, 'predicted_score': pred})

        three_votes = list(pred.sort_values(
            'predicted_score', ascending=False)['player'])[0]

        two_votes = list(pred.sort_values(
            'predicted_score', ascending=False)['player'])[1]

        one_vote = list(pred.sort_values(
            'predicted_score', ascending=False)['player'])[2]

        tally[three_votes] += 3
        tally[two_votes] += 2
        tally[one_vote] += 1

tally_list = list(tally.items())
tally_list.sort(key=lambda x: x[1], reverse=True)

tally_list

[('Marcus Bontempelli', 28),
 ('Nick Daicos', 27),
 ('Tim Taranto', 27),
 ('Rory Laird', 27),
 ('Rowan Marshall', 26),
 ('Andrew Brayshaw', 24),
 ('Zak Butters', 24),
 ('Christian Petracca', 24),
 ('Thomas Stewart', 22),
 ('Errol Gulden', 22),
 ('James Sicily', 21),
 ('Tom Green', 20),
 ('Jordan Dawson', 20),
 ('Noah Anderson', 19),
 ('Lachie Neale', 18),
 ('Connor Rozee', 18),
 ('Josh Dunkley', 17),
 ('Caleb Serong', 17),
 ('Timothy English', 16),
 ('Brad Crouch', 16),
 ('Thomas Liberatore', 16),
 ('Luke D-Uniacke', 15),
 ('Charlie Curnow', 15),
 ('Zachary Merrett', 15),
 ('Taylor Walker', 15),
 ('Jack Viney', 15),
 ('Stephen Coniglio', 14),
 ('Nick Blakey', 14),
 ('Clayton Oliver', 14),
 ('Darcy Parish', 13),
 ('Max Gawn', 13),
 ('Shai Bolton', 13),
 ('Patrick Cripps', 13),
 ('Nic Newman', 13),
 ('Toby Greene', 12),
 ('Luke Ryan', 12),
 ('Adam Cerra', 12),
 ('Jack Sinclair', 11),
 ('Chad Warner', 10),
 ('Dan Houston', 10),
 ('Dustin Martin', 10),
 ('Jordan De Goey', 9),
 ('Harry Shee