# Packages

In [1]:
import sys, os
import pandas as pd
import numpy as np

# Scikit-learn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

# Explicitly require this experimental feature
from sklearn.experimental import enable_halving_search_cv # noqa
# Now you can import normally from model_selection
from sklearn.model_selection import HalvingGridSearchCV

# XGBoost
import xgboost as xgb

# Warnings
import warnings
warnings.filterwarnings("ignore")
    
# Timings
%load_ext autotime
# %unload_ext autotime

# Progress bar
from tqdm import tqdm

time: 24.6 ms (started: 2021-09-21 16:38:23 +01:00)


## Read in cleaned data from local directory

(Temporary dataset *Will be updated soon)

In [2]:
# Redefine Index
df = pd.read_csv("/Users/samharrison/Documents/data_sci/fpl_points_predictor/data/cleaned_data.csv")
df = df.set_index(['player_name','position','team_title','event','opponent_team_title'])

print(df.shape)
df.head(4)

(4686, 47)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,finished,value,chance_of_playing_next_round,home_flag,goalkeeper_flag,defender_flag,midfielder_flag,forward_flag,goals_WMA,shots_WMA,...,xGBuildup_pgw,team_xG_pgw,team_goals_pgw,team_xGA_pgw,team_goals_against_pgw,opponent_xG_pgw,opponent_goals_pgw,opponent_xGA_pgw,opponent_goals_against_pgw,total_points
player_name,position,team_title,event,opponent_team_title,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
Bernd Leno,goalkeeper,Arsenal,2,Chelsea,True,50.0,100.0,1,1,0,0,0,0.0,0.0,...,0.097028,1.02385,0.0,1.88818,2.0,1.18709,3.0,0.321701,0.0,2.0
Bernd Leno,goalkeeper,Arsenal,3,Manchester City,True,49.0,100.0,0,1,0,0,0,0.0,0.0,...,0.048514,0.826975,0.0,2.76475,2.0,2.272565,2.5,0.542574,0.5,1.0
Bernd Leno,goalkeeper,Arsenal,4,Norwich,True,49.0,100.0,1,1,0,0,0,0.0,0.0,...,0.032343,0.590251,0.0,3.334077,3.0,0.93305,0.333333,1.833423,3.333333,0.0
Bernd Leno,goalkeeper,Arsenal,5,Burnley,True,49.0,100.0,0,1,0,0,0,0.0,0.0,...,0.024257,1.113976,0.25,2.644881,2.25,1.504393,0.75,1.912348,2.0,0.0


time: 76.6 ms (started: 2021-09-21 16:38:23 +01:00)


## Define:
* The upcoming **gameweek number**.

In [3]:
# Find the upcoming gameweek
gameweek_num = df[df['finished']==False].index.get_level_values('event').min()
print(gameweek_num)

6
time: 11.1 ms (started: 2021-09-21 16:38:23 +01:00)


## Define:
* The **parameter grid** we will search across for the best possible settings.

*[Note: This grid was found during the **3_research** section.]*

In [4]:
# Create the parameter grid based on the results of random search (from the research part of this project)
param_grid = {
    'eta': np.linspace(0.01, 0.1, num = 5),
    'gamma': np.logspace(-10, -8, 5), 
    'max_depth': [3, 4, 6],
    'min_child_weight': [10, 12, 15], 
    'colsample_bytree': [0.45, 0.5, 0.55]}

time: 491 µs (started: 2021-09-21 16:38:23 +01:00)


## Run model

In [5]:
def get_preds(gameweek_num):

    """Returns the predictions for the upcoming gameweeks.
    
    :param: int64 gameweek_num: The upcoming gameweek number, and 
            the first gameweek in the test range. E.g. if we're at 
            the gameweek 5 is next, we predict gameweeks 5,6,7,8 and 9. 

    :rtype: DataFrame train: Training dataset containing predictions.
            DataFrame test: Test dataset containing predictions.
    """    

    # Initialise gameweek ranges
    prev_gw = gameweek_num-1
    all_gameweeks = list(range(0,prev_gw+6))
    train_gameweeks = list(range(0,prev_gw+1))
    test_gameweeks = list(range(prev_gw+1,prev_gw+6))

    # Get all gameweeks in both sets of ranges
    df_all_gameweeks = df[(df.index.get_level_values('event').isin(all_gameweeks))]

    # Rename target variable
    df_all_gameweeks = df_all_gameweeks.rename(columns={'total_points':'total_points_actual'})

    # Drop "chance variables". 
    # Note: These will be reintroduced later
    df_all_gameweeks = df_all_gameweeks.drop(columns={'finished','value','chance_of_playing_next_round'})

    # Standardise the independent/predictor variables
    feat_to_not_scale = ['home_flag','goalkeeper_flag','defender_flag','midfielder_flag','total_points_actual']
    df_all_gameweeks.loc[:, ~df_all_gameweeks.columns.isin(feat_to_not_scale)] = StandardScaler().fit_transform(
                                            df_all_gameweeks.loc[:, ~df_all_gameweeks.columns.isin(feat_to_not_scale)])
    # Train-Test split the data
    df_train = df_all_gameweeks[(df_all_gameweeks.index.get_level_values('event').isin(train_gameweeks))]
    df_test = df_all_gameweeks[(df_all_gameweeks.index.get_level_values('event').isin(test_gameweeks))]

    # Standardise the dependent/target variable (i.e. we will inverse this later)
    target_scaler = MinMaxScaler()
    target_scaler.fit(df_train[['total_points_actual']])
    y_train = target_scaler.transform(df_train[['total_points_actual']])
    y_test = target_scaler.transform(df_test[['total_points_actual']])

    # Define independent/predictor variables
    X_train = df_train.loc[:, df_train.columns != 'total_points_actual']
    X_test = df_test.loc[:, df_test.columns != 'total_points_actual']

    # Initialise XGBoost
    xbgr = xgb.XGBRegressor()

    # Setup search heuristic using parameter grid from earlier
    sh = HalvingGridSearchCV(xbgr, param_grid, cv = 5, factor = 5, 
                min_resources ='exhaust', n_jobs = -1, verbose = 2, random_state = 42).fit(X_train, y_train) 

    # Fit XGBoost best estimator to training data
    model = sh.best_estimator_.fit(X_train, y_train) 

    # Obtain predictions on training data via. Cross Validation 
    y_train_pred = cross_val_predict(model, X_train, y_train, cv=10)

    # Invert transform on predictions for both datasets
    y_test_pred = model.predict(X_test)
    y_test_pred = target_scaler.inverse_transform([y_test_pred])
    y_train_pred = target_scaler.inverse_transform([y_train_pred])

    # Create prediction column for train/test DataFrames
    df_train['total_points_predicted'] = y_train_pred[0]
    df_test['total_points_predicted'] = y_test_pred[0]

    return df_train, df_test

time: 1.19 ms (started: 2021-09-21 16:38:23 +01:00)


Call function. 

In [6]:
# Get predictions
df_train, df_test = get_preds(gameweek_num)

n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 10
max_resources_: 2041
aggressive_elimination: False
factor: 5
----------
iter: 0
n_candidates: 675
n_resources: 10
Fitting 5 folds for each of 675 candidates, totalling 3375 fits
----------
iter: 1
n_candidates: 135
n_resources: 50
Fitting 5 folds for each of 135 candidates, totalling 675 fits
----------
iter: 2
n_candidates: 27
n_resources: 250
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 3
n_candidates: 6
n_resources: 1250
Fitting 5 folds for each of 6 candidates, totalling 30 fits
time: 4min 12s (started: 2021-09-21 16:38:23 +01:00)


## Reintroduce "chance_of_playing_next_round" and multiply the predictions made by this 'chance'. Also reintroduce player values
**Recall:** In the 3_research section, we dropped the variable **chance_of_playing_next_round** since it did not reflect the gameweek defined. However, now, we can use these variables to our advantage. In particular, we can **multiply the predictions made by these features(/probabilities)**. 

**Example:** Let's say: 

<center> 'Jamie Vardy is 75% likely to play next week (due to injury) but if does play we expect him to score 4 points. <br> Instead, we predict Jamie Vardy should get: 0.75 x 4 = 3 points.'</center>

I believe using the features this way should only improve the final predictions we make. However, I also believe it is important to give the users **visibility** of this process. If a FPL Manager (i.e. a User) wants to take the risk to play Jamie Vardy in the above example, I would like them to be able to explicitly see the above calculation. This way, the FPL Manager is more informed about whether to play Jamie Vardy!


In [7]:
def adjust_test_preds(df_test):
    
    """Adjusts the test predictions for the upcoming gameweek; namely,
    "chance_of_playing_next_round" is used to modify predictions so that 
    they better reflect things like injuries etc.
    
    :param: DataFrame df_test: The test data pre-adjustment.

    :rtype: DataFrame df_test: The test data, now adjusted.
    """    

    # Reset indexes and join chance & value features
    df_test = pd.merge(df_test.reset_index(), 
                       df.reset_index()[['player_name','position','team_title','event',
                       'opponent_team_title','value','chance_of_playing_next_round']], 
                       how='left', 
                       on=['player_name','position','team_title','event','opponent_team_title'])

    # Before preceding further, we round our predictions to 2 d.p. for clarity with the user
    df_test['total_points_predicted'] = round(df_test['total_points_predicted'], 2)

    # Initialise new cols
    adj_total_points_predicted_col = []
    points_calculation_col = []

    # Create new adjusted total points predicted column (created row by row)
    for i in range(0, len(df_test)):
        game = df_test.iloc[i]

        # If the game is 'next gameweek' (i.e. upcoming gameweek+1) - multiply the prev. points predicted by chance
        if (game['event']==gameweek_num)&(game['chance_of_playing_next_round'] != 100):
            adj_total_points_predicted = round(game['total_points_predicted']*game['chance_of_playing_next_round']/100, 2)
            adj_total_points_predicted_col.append(game['total_points_predicted']*game['chance_of_playing_next_round']/100)

            # Explain calcuation to user
            points_calculation_col.append(''.join((
                                          "Player has "+str(game['chance_of_playing_next_round'])+"% chance of playing. ",
                                          "The model's prediction of "+str(game['total_points_predicted'])+" points has been adjusted to ",
                                          str(game['chance_of_playing_next_round']/100)+"*"+str(game['total_points_predicted'])+"=",
                                          str(adj_total_points_predicted))))

        # Otherwise, we just append the prev. points predicted
        else:
            adj_total_points_predicted_col.append(game['total_points_predicted']) 
            points_calculation_col.append('NaN')
    df_test['adj_total_points_predicted'] = adj_total_points_predicted_col
    df_test['points_calculation'] = points_calculation_col
    
    return df_test


time: 1.97 ms (started: 2021-09-21 16:42:35 +01:00)


Call function.

In [8]:
# Get (final) predictions
df_test = adjust_test_preds(df_test)
df_test.head(4)

Unnamed: 0,player_name,position,team_title,event,opponent_team_title,home_flag,goalkeeper_flag,defender_flag,midfielder_flag,forward_flag,...,opponent_xG_pgw,opponent_goals_pgw,opponent_xGA_pgw,opponent_goals_against_pgw,total_points_actual,total_points_predicted,value,chance_of_playing_next_round,adj_total_points_predicted,points_calculation
0,Bernd Leno,goalkeeper,Arsenal,6,Tottenham,1,1,0,0,-0.372134,...,-0.5566,-0.68604,1.241248,0.000999,,2.11,,100.0,2.11,
1,Bernd Leno,goalkeeper,Arsenal,7,Brighton,0,1,0,0,-0.372134,...,-0.398349,-0.039406,-0.438823,-0.594968,,1.64,,100.0,1.64,
2,Bernd Leno,goalkeeper,Arsenal,8,Crystal Palace,1,1,0,0,-0.372134,...,-0.592941,-0.555626,-0.560475,-0.062855,,1.66,,100.0,1.66,
3,Bernd Leno,goalkeeper,Arsenal,9,Aston Villa,1,1,0,0,-0.372134,...,-1.102585,-0.229592,-0.984681,-0.362168,,1.63,,100.0,1.63,


time: 608 ms (started: 2021-09-21 16:42:35 +01:00)


## Combine the predictions made on both datasets and prepare for export

All we're doing here is combining the two datasets now containing predictions. We join them together for ease when we export next. 

In [9]:
def prepare_data_for_export(df_train, df_test):

    # Reset indexes and join value feature
    df_train = pd.merge(df_train.reset_index(), 
                       df.reset_index()[['player_name','position','team_title','event',
                       'opponent_team_title','value']], 
                       how='left', 
                       on=['player_name','position','team_title','event','opponent_team_title'])

    # Redefine training DataFrame (and test DataFrame) with only variables we want to output
    df_train = df_train.reset_index()[['player_name','position','team_title','event','opponent_team_title','value',
                        'total_points_actual','total_points_predicted']]
    df_test = df_test[['player_name','position','team_title','event','opponent_team_title','value',
            'total_points_actual','adj_total_points_predicted','points_calculation']]

    # Rename 'adj_total_points_predicted' column, and create new (empty) 'points_calculation' col - since we're about to concat
    df_test = df_test.rename(columns={'adj_total_points_predicted':'total_points_predicted'})
    df_train['points_calculation'] = np.nan

    # Create 'train_test' column for filtering later on
    df_test['train_test'] = 'test'
    df_train['train_test'] = 'train'

    # Concatenate DataFrames and sort by player and gameweek
    df_predictions = pd.concat([df_test, df_train]).sort_values(by=['player_name','event']).reset_index(drop=True)
    df_predictions.insert(0, 'train_test', df_predictions.pop('train_test'))
    df_predictions.head(6)

    # Update market value 'value' col to historic valuations
    value_col = []
    for i in range(0,len(df_predictions)):
        row = df_predictions.iloc[i]

        # If training data append historic market value
        if row['train_test']=='train':
            value_col.append(row['value'])
            # Temporarily set value for future assignment
            value_temp = row['value']

        # If test data - assign value to most recent 
        elif row['train_test']=='test':
            value_col.append(value_temp)
        else:
            pass
    df_predictions['value'] = value_col

    # Create 'more_info_flag' col (helps to format injury info in Tableau)
    more_info_flag_col = []
    for i in range(0,len(df_predictions)):
        row = df_predictions.iloc[i]

        # If that player has any sort of injury:
        if len(df_predictions[df_predictions['player_name']==row['player_name']]['points_calculation'].unique())>2:
    #         print(df_predictions[df_predictions['player_name']==row['player_name']]['points_calculation'].unique())
            more_info_flag_col.append(1)
        else:
            more_info_flag_col.append(0)
            pass
    df_predictions['more_info_flag'] = more_info_flag_col
    
    # Divide 'value' by 10 for correct scale
    df_predictions['value'] = df_predictions['value']/10
    
    # Replace long-team names with abbreviations (for better display)
    for col in ['team_title','opponent_team_title']:
        df_predictions[col] = df_predictions[col].replace('Arsenal','ARS')
        df_predictions[col] = df_predictions[col].replace('Brighton','BRI')
        df_predictions[col] = df_predictions[col].replace('Burnley','BUR')
        df_predictions[col] = df_predictions[col].replace('Chelsea','CHE')
        df_predictions[col] = df_predictions[col].replace('Crystal Palace','CRY')
        df_predictions[col] = df_predictions[col].replace('Everton','EVE')
        df_predictions[col] = df_predictions[col].replace('Norwich','NOR')
        df_predictions[col] = df_predictions[col].replace('Brentford','BRE')
        df_predictions[col] = df_predictions[col].replace('Leicester','LEI')
        df_predictions[col] = df_predictions[col].replace('Liverpool','LIV')
        df_predictions[col] = df_predictions[col].replace('Wolverhampton Wanderers','WOV')
        df_predictions[col] = df_predictions[col].replace('Leeds','LEE')
        df_predictions[col] = df_predictions[col].replace('Watford','WAT')
        df_predictions[col] = df_predictions[col].replace('Tottenham','TOT')
        df_predictions[col] = df_predictions[col].replace('West Ham','WHU')
        df_predictions[col] = df_predictions[col].replace('Manchester United','MUN')
        df_predictions[col] = df_predictions[col].replace('Manchester City','MCI')
        df_predictions[col] = df_predictions[col].replace('Newcastle United','NEW')
        df_predictions[col] = df_predictions[col].replace('Aston Villa','AVL')
        df_predictions[col] = df_predictions[col].replace('Southampton','SOU')
    
    return df_predictions

    

time: 2.75 ms (started: 2021-09-21 16:42:36 +01:00)


Call function.

In [10]:
# Prepare data for export to Tableau
df_predictions = prepare_data_for_export(df_train, df_test)
df_predictions.head(4)

Unnamed: 0,train_test,player_name,position,team_title,event,opponent_team_title,value,total_points_actual,total_points_predicted,points_calculation,more_info_flag
0,train,Aaron Connolly,forward,BRI,2,WAT,5.5,1.0,0.519296,,0
1,train,Aaron Connolly,forward,BRI,3,EVE,5.5,0.0,1.935048,,0
2,train,Aaron Connolly,forward,BRI,4,BRE,5.4,0.0,1.467636,,0
3,train,Aaron Connolly,forward,BRI,5,LEI,5.4,0.0,1.715438,,0


time: 5.96 s (started: 2021-09-21 16:42:36 +01:00)


## Overwrite prediction data in local directory

In [11]:
df_predictions.to_csv(index=True, path_or_buf="/Users/samharrison/Documents/data_sci/fpl_points_predictor/data/predictions.csv") 

time: 49.1 ms (started: 2021-09-21 16:42:42 +01:00)


We also store a historic version of the prediction data. 

In [12]:
df_predictions.to_csv(index=True, path_or_buf="/Users/samharrison/Documents/data_sci/fpl_points_predictor/data/data_archive/predictions_gw"+str(gameweek_num)+".csv")

time: 42.9 ms (started: 2021-09-21 16:42:42 +01:00)


Finally, we also overwrite the .csv in Google Drive (this will update the Google Sheet).

In [13]:
df_predictions.to_csv(index=True, path_or_buf="/Users/samharrison/My Drive/fpl_points_predictor/predictions.csv")

time: 37.3 ms (started: 2021-09-21 16:45:45 +01:00)
