# PREDICTOR ##

Platform for running real predictions

**Author: `Lang (Ron) Chen` 2021.12-2022.1**

___

**0. Import Libraries**

In [1]:
import pandas as pd
import pickle
from collections import defaultdict as dd
import numpy as np
import os

In [2]:
# Short tag for each data variant; wholeseason looks the tag up by the `choice` folder name
choiceDict = {
    'NormalisedData': 'N',
    'StandardisedData': 'S',
    'RankStandardisedData': 'RS',
    'PercentageData': 'P',
}

**1. Select Year and Import Relevant Files**

In [3]:
# Number of the saved model to use: selects the pickle file 'M{MODEL}.pickle'
# and is also part of the output CSV file names.
# The commented lines are alternative settings - presumably one per saved model; keep exactly one active.
MODEL = 1
# MODEL = 2
# MODEL = 3

In [4]:
# Season to predict: only data files whose name contains this year are used,
# and the year is also part of the output CSV file names.
YEAR = 2021

In [5]:
# Load the saved bundle for the selected MODEL from the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while loading - only open pickle files from a trusted source.
with open(f'M{MODEL}.pickle', 'rb') as f:
    obj = pickle.load(f)

In [6]:
# Unpack the pickled bundle:
#   lm                - object whose .predict() scores the players of a game
#   selected_features - column labels fed to lm.predict
#   choice            - name of the folder under ../Data to read games from
lm, selected_features, choice = obj[0], obj[1], obj[2]

**2. Prepare Data**

In [7]:
# TODO: insert the function that scrapes and prepares the data here

In [8]:
# Collect the data files available for the chosen data variant.
# os.listdir returns entries in arbitrary order, so slicing off "the first entry" can silently
# drop a real game file. Hidden entries (names starting with '.') are skipped by name instead.
filelist = sorted(entry for entry in os.listdir(f'../Data/{choice}') if not entry.startswith('.'))

# Keep only the games of the season being predicted
final_test_games = [file for file in filelist if f'{YEAR}' in file]

**3. Run Predictions**

In [9]:
def wholeseason(final_test_games, lm, selected_features, choice):
    """ Helper function for running empirical test - returns tuple of (leaderboard of players for the season (with votes),
    game-by-game record of the predicted votes)

    For every game file the model scores each row of the game's data; the three highest-scoring
    rows receive 3, 2 and 1 votes respectively (ties keep file order).

    Parameters:
        final_test_games: iterable of file names located in ../Data/{choice}/
        lm: object whose predict(X) returns one score per row of X
        selected_features: column labels of the game data passed to lm.predict
        choice: data folder name under ../Data; must be a key of choiceDict

    Returns:
        leaderboard: list of (player, votes) tuples, most votes first
        roundByRound: DataFrame with columns 'Game', 'Three', 'Two', 'One', one row per game.
                      A game with fewer than three rows leaves the unfilled vote columns empty.
    """

    columns = ['Game', 'Three', 'Two', 'One']

    # Data-variant tag embedded in the file names, e.g. ' (N)'
    tag = f' ({choiceDict[choice]})'

    players = dd(int) # tally
    records = [] # one dict per game, turned into a DataFrame once at the end

    for file in final_test_games:

        # Open each final test season's game
        df = pd.read_csv(f'../Data/{choice}/{file}')

        # Run predictions (infinite and missing feature values are treated as 0)
        x_final = df[selected_features].replace([np.inf, -np.inf, np.nan], 0).reset_index(drop=True)
        scores = list(lm.predict(x_final))

        # Row positions ordered from highest to lowest score; sorted() is stable, so ties keep file order
        ranked = sorted(range(len(scores)), key = scores.__getitem__, reverse = True)

        # Names of the (up to) top 3 players, looked up by position rather than by index label
        top_names = [df['Player'].iloc[position] for position in ranked[:3]]

        # Insert them into the tally by adding 3 votes, 2 votes and 1 vote respectively
        for j, name in enumerate(top_names):
            players[name] += (3-j)

        # Also record the votes by game. str.strip() treats its argument as a set of characters,
        # so the tag is removed as a substring instead.
        padded = top_names + [None] * (3 - len(top_names))
        records.append({'Game': file.replace(tag, ''), 'Three': padded[0], 'Two': padded[1], 'One': padded[2]})

    # Build the table in one go (avoids repeated pd.concat and gives a clean 0..n-1 index)
    roundByRound = pd.DataFrame(records, columns = columns)

    # Sort the leaderboard so top pollers are ranked first
    leaderboard = sorted(players.items(), reverse = True, key = lambda x:x[1])

    return leaderboard, roundByRound

In [10]:
# Tally the predicted votes over every selected game of the season
leaderboard, roundByRound = wholeseason(final_test_games, lm, selected_features, choice)

**4. Output predictions to CSV**

In [11]:
# Preview the 15 highest-polling players
leaderboard[:15]

[('Jack Steele', 39),
 ('Oliver Wines', 34),
 ('Clayton Oliver', 29),
 ('Marcus Bontempelli', 29),
 ('Christian Petracca', 28),
 ('Darcy Parish', 28),
 ('Jarryd Lyons', 26),
 ('Luke Parker', 24),
 ('Jackson Macrae', 21),
 ('Rory Laird', 21),
 ('Tom Mitchell', 20),
 ('Travis Boak', 20),
 ('Touk Miller', 19),
 ('Sam Walsh', 18),
 ('Jake Stringer', 18)]

In [12]:
# Split the (player, votes) pairs into the two columns of the output table
names = [x[0] for x in leaderboard]
votes = [x[1] for x in leaderboard]
leaderboarddf = pd.DataFrame({'Player': names, 'Votes': votes})

# to_csv does not create missing folders, so make sure the output folder exists first
os.makedirs('./Predictions', exist_ok = True)
leaderboarddf.to_csv(f'./Predictions/M{MODEL} {YEAR} leaderboard.csv', index = False)

In [13]:
# Save the game-by-game vote allocation into the ./Predictions folder (row index is not written)
roundByRound.to_csv(f'./Predictions/M{MODEL} {YEAR} roundByRound.csv', index = False)