# Packages

In [2]:
import sys, os
import pandas as pd
import numpy as np

# Scikit-learn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# Explicitly require this experimental feature
from sklearn.experimental import enable_halving_search_cv # noqa
# Now you can import normally from model_selection
from sklearn.model_selection import HalvingGridSearchCV

# XGBoost
import xgboost as xgb

# Warnings
import warnings
warnings.filterwarnings("ignore")
    
# Timings
%load_ext autotime
# %unload_ext autotime

# Progress bar
from tqdm import tqdm

time: 8.94 ms (started: 2021-09-17 13:49:03 +01:00)


## Read in cleaned data from local directory

(Temporary dataset *Will be updated soon)

In [6]:
# Redefine Index
df = pd.read_csv("/Users/samharrison/Documents/data_sci/fpl_points_predictor/data/cleaned_data_2020_21_season.csv")
df = df.set_index(['player_name','position','team_title','event','opponent_team_title'])

# TEMPORARY LINEs OF CODE
# REMOVES OBSERVED POINTS FOR GW38 - SO THAT WE HAVE SOME "UNOBSERVED GW'S"
df.loc[df.index.get_level_values('event').isin([37, 38]), 'total_points'] = np.nan
# SETS FINISHED COL TO "FALSE"
df.loc[df.index.get_level_values('event').isin([37, 38]), 'finished'] = False

print(df.shape)
df.head(4)

(14326, 47)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,finished,chance_of_playing_this_round,chance_of_playing_next_round,home_flag,goalkeeper_flag,defender_flag,midfielder_flag,forward_flag,goals_WMA,shots_WMA,...,xGBuildup_pgw,team_xG_pgw,team_goals_pgw,team_xGA_pgw,team_goals_against_pgw,opponent_xG_pgw,opponent_goals_pgw,opponent_xGA_pgw,opponent_goals_against_pgw,total_points
player_name,position,team_title,event,opponent_team_title,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
David Luiz Moreira Marinho,defender,Arsenal,3,Liverpool,True,50.0,100.0,0,0,1,0,0,0.0,0.0,...,0.0,1.745945,2.5,1.095049,0.5,2.703115,3.0,0.586998,1.5,1.0
David Luiz Moreira Marinho,defender,Arsenal,4,Sheffield United,True,50.0,100.0,1,0,1,0,0,0.0,0.0,...,0.015401,1.556687,2.0,1.642812,1.333333,1.110602,0.0,1.294917,1.333333,2.0
David Luiz Moreira Marinho,defender,Arsenal,5,Manchester City,True,50.0,100.0,0,0,1,0,0,0.0,0.0,...,0.126772,1.334298,2.0,1.268358,1.25,1.234542,1.5,1.540377,1.75,2.0
David Luiz Moreira Marinho,defender,Arsenal,6,Leicester,True,50.0,100.0,1,0,1,0,0,0.0,0.8,...,0.110798,1.235103,1.6,1.300732,1.2,1.5876,2.4,1.245883,1.6,1.0


time: 185 ms (started: 2021-09-17 13:49:37 +01:00)


## Define:
* The upcoming **gameweek number**.

In [7]:
# Find the upcoming gameweek
gameweek_num = df[df['finished']==False].index.get_level_values('event').min()
print(gameweek_num)

37
time: 19.9 ms (started: 2021-09-17 13:49:39 +01:00)


## Define:
* The **parameter grid** we will search across for the best possible settings.

*[Note: This grid was found during the **3_research** section.]*

In [8]:
# Create the parameter grid based on the results of random search (from the research part of this project)
param_grid = {
    'eta': np.linspace(0.01, 0.1, num = 5),
    'gamma': np.logspace(-10, -8, 5), 
    'max_depth': [3, 4, 6],
    'min_child_weight': [10, 12, 15], 
    'colsample_bytree': [0.45, 0.5, 0.55]}

time: 989 µs (started: 2021-09-17 13:49:40 +01:00)


## Run model

In [9]:
def get_preds(gameweek_num):

    """Returns the predictions for the next upcoming gameweeks.
    
    :param: int64 gameweek_num: The upcoming gameweek number, and 
            the first gameweek in the test range. E.g. if we're at 
            the gameweek 5 is next, we predict gameweeks 5,6,7,8 and 9. 

    :rtype: DataFrame train: Training dataset containing predictions.
            DataFrame test: Test dataset containing predictions.
    """    

    # Initialise gameweek ranges
    prev_gw = gameweek_num-1
    all_gameweeks = list(range(0,prev_gw+6))
    train_gameweeks = list(range(0,prev_gw+1))
    test_gameweeks = list(range(prev_gw+1,prev_gw+6))

    # Get all gameweeks in both sets of ranges
    df_all_gameweeks = df[(df.index.get_level_values('event').isin(all_gameweeks))]

    # Rename target variable
    df_all_gameweeks = df_all_gameweeks.rename(columns={'total_points':'total_points_actual'})
    
    # Drop "chance variables". 
    # Note: These will be reintroduced later
    df_all_gameweeks = df_all_gameweeks.drop(columns={'finished','chance_of_playing_this_round','chance_of_playing_next_round'})
    
    # Standardise the independent variables
    df_all_gameweeks.loc[:, df_all_gameweeks.columns != 'total_points_actual'] = StandardScaler().fit_transform(
                                            df_all_gameweeks.loc[:, df_all_gameweeks.columns != 'total_points_actual'])

    # Train-Test split the data
    df_train = df_all_gameweeks[(df_all_gameweeks.index.get_level_values('event').isin(train_gameweeks))]
    df_test = df_all_gameweeks[(df_all_gameweeks.index.get_level_values('event').isin(test_gameweeks))]

    # Define independent and dependent variables
    X_train = df_train.loc[:, df_train.columns != 'total_points_actual']
    y_train_actual = df_train['total_points_actual']     
    X_test = df_test.loc[:, df_test.columns != 'total_points_actual']
    y_test_actual = df_test['total_points_actual'] 
        
    # Initialise XGBoost
    xbgr = xgb.XGBRegressor()

    # Setup search heuristic using parameter grid from earlier
    sh = HalvingGridSearchCV(xbgr, param_grid, cv = 5, factor = 5, 
                min_resources ='exhaust', n_jobs = -1, verbose = 2, random_state = 42).fit(X_train, y_train_actual) 

    # Fit XGBoost best estimator to training data
    model = sh.best_estimator_.fit(X_train, y_train_actual) 

    # Make predictions    
    y_train_pred = cross_val_predict(model, X_train, y_train_actual, cv=10)
    y_test_pred = model.predict(X_test)

    # Create prediction column for train/test DataFrames
    df_train['total_points_predicted'] = y_train_pred
    df_test['total_points_predicted'] = y_test_pred

    return df_train, df_test

time: 1.13 ms (started: 2021-09-17 13:49:45 +01:00)


Call function. 

In [10]:
# Get predictions
df_train, df_test = get_preds(gameweek_num)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 21
max_resources_: 13502
aggressive_elimination: False
factor: 5
----------
iter: 0
n_candidates: 675
n_resources: 21
Fitting 5 folds for each of 675 candidates, totalling 3375 fits
----------
iter: 1
n_candidates: 135
n_resources: 105
Fitting 5 folds for each of 135 candidates, totalling 675 fits
----------
iter: 2
n_candidates: 27
n_resources: 525
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 3
n_candidates: 6
n_resources: 2625
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 4
n_candidates: 2
n_resources: 13125
Fitting 5 folds for each of 2 candidates, totalling 10 fits
time: 4min 32s (started: 2021-09-17 13:49:46 +01:00)


## Reintroduce "chance_of_playing_this/next_round" variables, and multiply the predictions made by that chance

**Recall:** In the 3_research section, we dropped the variables **chance_of_playing_this_round** and **chance_of_playing_next_round** since they were not reflective of the gameweek defined. However, now, we can use these variables to our advantage. In particular, we can **multiply the predictions made by these features(/probabilities)**. 

**Example:** Let's say: 

<center> 'Jamie Vardy is 75% likely to play next week (due to injury) but if does play we expect him to score 4 points. <br> Instead, we predict Jamie Vardy should get: 0.75 x 4 = 3 points.'</center>

I believe using the features this way should only improve the final predictions we make. However, I also believe it is important to give the users **visibility** of this process. If a FPL Manager (i.e. a User) wants to take the risk to play Jamie Vardy in the above example, I would like them to be able to explicitly see the above calculation. This way, the FPL Manager is more informed about whether to play Jamie Vardy!


In [11]:
# Reset indexes and join chance features
df_test_chance_feat = pd.merge(df_test.reset_index(), 
                               df.reset_index()[['player_name','position','team_title','event','opponent_team_title',
                                                   'chance_of_playing_this_round','chance_of_playing_next_round']], 
                               how='left', 
                               on=['player_name','position','team_title','event','opponent_team_title'])

# Before preceding further, we round our predictions to 2 d.p. for clarity with the user
df_test_chance_feat['total_points_predicted'] = round(df_test_chance_feat['total_points_predicted'], 2)

# Initialise new cols
adj_total_points_predicted_col = []
points_calculation_col = []

# Create new adjusted total points predicted column (created row by row)
for i in range(0, len(df_test_chance_feat)):
    game = df_test_chance_feat.iloc[i]
    
    # If the game is 'this gameweek' (i.e. upcoming gameweek)
    if (game['event']==gameweek_num)&(game['chance_of_playing_this_round'] != 100):
        # Multiply the prev. points predicted by chance
        adj_total_points_predicted = round(game['total_points_predicted']*game['chance_of_playing_this_round']/100, 2)
        adj_total_points_predicted_col.append(adj_total_points_predicted)
        
        # Explain calcuation to user
        points_calculation_col.append(''.join((
                                      "Player has "+str(game['chance_of_playing_this_round'])+"% chance of playing. ",
                                      "The model's prediction of "+str(game['total_points_predicted'])+" points has been adjusted to ",
                                      str(game['chance_of_playing_this_round']/100)+"*"+str(game['total_points_predicted'])+"=",
                                      str(adj_total_points_predicted))))
        
    # If the game is 'next gameweek' (i.e. upcoming gameweek+1) - multiply the prev. points predicted by chance
    elif game['event']==(gameweek_num+1)&(game['chance_of_playing_next_round'] != 100):
        adj_total_points_predicted_col.append(game['total_points_predicted']*game['chance_of_playing_next_round']/100)
        
        # Explain calcuation to user
        points_calculation_col.append(''.join((
                                      "Player has "+str(game['chance_of_playing_next_round'])+"% chance of playing. ",
                                      "The model's prediction of "+str(game['total_points_predicted'])+" points has been adjusted to ",
                                      str(game['chance_of_playing_next_round']/100)+"*"+str(game['total_points_predicted'])+"=",
                                      str(adj_total_points_predicted))))
        
    # Otherwise, we just append the prev. points predicted
    else:
        adj_total_points_predicted_col.append(game['total_points_predicted']) 
        points_calculation_col.append('NaN')
df_test_chance_feat['adj_total_points_predicted'] = adj_total_points_predicted_col
df_test_chance_feat['points_calculation'] = points_calculation_col
df_test_chance_feat


Unnamed: 0,player_name,position,team_title,event,opponent_team_title,home_flag,goalkeeper_flag,defender_flag,midfielder_flag,forward_flag,...,opponent_xG_pgw,opponent_goals_pgw,opponent_xGA_pgw,opponent_goals_against_pgw,total_points_actual,total_points_predicted,chance_of_playing_this_round,chance_of_playing_next_round,adj_total_points_predicted,points_calculation
0,David Luiz Moreira Marinho,defender,Arsenal,37,Crystal Palace,-0.998605,-0.270535,1.344179,-0.884218,-0.398002,...,-1.016711,-0.517638,0.783546,0.662212,,1.10,50.0,100.0,0.55,Player has 50.0% chance of playing. The model'...
1,David Luiz Moreira Marinho,defender,Arsenal,38,Brighton,1.001397,-0.270535,1.344179,-0.884218,-0.398002,...,0.273254,-0.571806,-0.855171,-0.498690,,1.09,50.0,100.0,1.09,
2,Pierre-Emerick Aubameyang,midfielder,Arsenal,37,Crystal Palace,-0.998605,-0.270535,-0.743948,1.130943,-0.398002,...,-1.016711,-0.517638,0.783546,0.662212,,2.10,100.0,100.0,2.10,
3,Pierre-Emerick Aubameyang,midfielder,Arsenal,38,Brighton,1.001397,-0.270535,-0.743948,1.130943,-0.398002,...,0.273254,-0.571806,-0.855171,-0.498690,,2.20,100.0,100.0,2.20,
4,Cédric Soares,defender,Arsenal,37,Crystal Palace,-0.998605,-0.270535,1.344179,-0.884218,-0.398002,...,-1.016711,-0.517638,0.783546,0.662212,,0.87,100.0,100.0,0.87,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819,Owen Otasowie,midfielder,Wolverhampton Wanderers,38,Manchester United,1.001397,-0.270535,-0.743948,1.130943,-0.398002,...,0.777376,0.939491,-0.687690,-0.560789,,0.83,25.0,100.0,0.83,
820,Theo Corbeanu,midfielder,Wolverhampton Wanderers,37,Everton,-0.998605,-0.270535,-0.743948,1.130943,-0.398002,...,-0.050429,-0.217003,-0.116464,-0.486615,,0.74,0.0,100.0,0.00,Player has 0.0% chance of playing. The model's...
821,Theo Corbeanu,midfielder,Wolverhampton Wanderers,38,Manchester United,1.001397,-0.270535,-0.743948,1.130943,-0.398002,...,0.777376,0.939491,-0.687690,-0.560789,,0.74,0.0,100.0,0.74,
822,Willian José Da Silva,forward,Wolverhampton Wanderers,37,Everton,-0.998605,-0.270535,-0.743948,-0.884218,2.512551,...,-0.050429,-0.217003,-0.116464,-0.486615,,1.56,100.0,100.0,1.56,


time: 271 ms (started: 2021-09-17 13:54:18 +01:00)


# Examine Predictions

In [19]:
df.index.get_level_values('player_name
                          ')

KeyError: 'Level Mo Salah not found'

time: 8.4 ms (started: 2021-09-17 14:01:55 +01:00)


In [16]:
df['total_points'].describe()

count    13502.000000
mean         2.047919
std          2.864131
min         -7.000000
25%          0.000000
50%          1.000000
75%          2.000000
max         24.000000
Name: total_points, dtype: float64

time: 5.74 ms (started: 2021-09-17 13:57:50 +01:00)


In [14]:
df_test_chance_feat['adj_total_points_predicted'].describe()

count    824.000000
mean       1.394551
std        0.584374
min        0.000000
25%        1.010000
50%        1.400000
75%        1.790000
max        3.360000
Name: adj_total_points_predicted, dtype: float64

time: 9.85 ms (started: 2021-09-17 13:57:20 +01:00)


# <u>Next Steps<u/>

## 1. Investigate narrow range of predictions
* Is this normal?
* What are the possible causes?

## 2. Look @ Front End