# Packages

In [1]:
# pandas supplies the DataFrame operations used throughout this notebook
import pandas as pd

# Load the autotime IPython extension so that every cell reports its run time
# (the "time: ..." line printed beneath each cell's output)
%load_ext autotime
# To switch the timing output off again, run: %unload_ext autotime

# Warnings & display
import warnings
# Silence every warning raised for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning)
# that may point at real problems - consider narrowing the filter.
warnings.filterwarnings("ignore")
# Show up to 60 columns when a DataFrame is displayed, so the wide tables below are not truncated
pd.set_option("display.max_columns", 60)

time: 614 µs (started: 2021-09-14 16:58:54 +01:00)


# Data Cleaning

## Read in raw data from local directory

In [2]:
# Location of the raw data CSV on the local machine.
# NOTE(review): absolute, machine-specific path - the notebook only runs as-is where this file exists.
raw_data_path = "/Users/samharrison/Documents/data_sci/fpl_points_predictor/data/raw_data.csv"

# Load the raw data into a DataFrame
df_raw_data = pd.read_csv(raw_data_path)

# Preview the first two rows
df_raw_data.head(n=2)

Unnamed: 0,FPL_id,Understat_id,player_name,team,team_title,position,element_type,event,finished,opponent_team_title,opponent_team,home_or_away,value,status,chance_of_playing_this_round,chance_of_playing_next_round,total_points,goals_season,shots_season,xG_season,time_season,xA_season,assists_season,key_passes_season,npg_season,npxG_season,xGChain_season,xGBuildup_season,goals_WMA,shots_WMA,xG_WMA,time_WMA,xA_WMA,assists_WMA,key_passes_WMA,npg_WMA,npxG_WMA,xGChain_WMA,xGBuildup_WMA,team_xG_season,team_goals_season,team_xGA_season,team_goals_against_season,team_xG_WMA,team_goals_WMA,team_xGA_WMA,team_goals_against_WMA,opponent_xG_season,opponent_goals_season,opponent_xGA_season,opponent_goals_against_season,opponent_xG_WMA,opponent_goals_WMA,opponent_xGA_WMA,opponent_goals_against_WMA
0,1,181,Bernd Leno,1,Arsenal,goalkeeper,1,1,True,Brentford,3,A,50.0,a,,,1.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.097028,0.097028,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.097028,0.097028,,,,,,,,,,,,,,,,
1,1,181,Bernd Leno,1,Arsenal,goalkeeper,1,2,True,Chelsea,6,H,50.0,a,,,2.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.097028,0.097028,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.097028,0.097028,1.02385,0.0,1.88818,2.0,1.02385,0.0,1.88818,2.0,1.18709,3.0,0.321701,0.0,1.18709,3.0,0.321701,0.0


time: 147 ms (started: 2021-09-14 16:58:54 +01:00)


## Manipulate DataFrame

In [3]:
# Settings for this cell
LOOKAHEAD_GAMEWEEKS = 5  # number of gameweeks to keep beyond the last finished one
INDEX_COLUMNS = ['player_name', 'position', 'team_title', 'event', 'opponent_team_title']
POSITIONS = ['goalkeeper', 'defender', 'midfielder', 'forward']

# Drop unnecessary columns
df = df_raw_data.drop(columns=['FPL_id', 'Understat_id', 'team', 'opponent_team', 'element_type', 'value'])

# Make home_or_away feature a binary variable (1 when the value is 'H', 0 for anything else)
df['home_flag'] = df['home_or_away'].eq('H').astype('int64')
df = df.drop(columns=['home_or_away'])

# Make binary position variables: one 0/1 flag column per position
position_flags = [f'{position}_flag' for position in POSITIONS]
for position, flag in zip(POSITIONS, position_flags):
    df[flag] = df['position'].eq(position).astype('int64')

# Set index and reorder columns.
# Columns are ordered by name rather than by hard-coded positions, so the layout
# (status/availability and flag columns first, total_points last) does not depend
# on how many columns the raw data happens to contain.
df = df.set_index(INDEX_COLUMNS)
leading_columns = ['finished', 'status', 'chance_of_playing_this_round',
                   'chance_of_playing_next_round', 'home_flag'] + position_flags
target_column = 'total_points'
feature_columns = [col for col in df.columns
                   if col not in leading_columns and col != target_column]
df = df[leading_columns + feature_columns + [target_column]]

# Trim the dataset to look forward only 5 gameweeks into the future:
# keep every gameweek up to (latest finished gameweek + LOOKAHEAD_GAMEWEEKS)
prev_gw = df[df['finished'].eq(True)].index.get_level_values('event').max()
gameweek_range = list(range(0, prev_gw + LOOKAHEAD_GAMEWEEKS + 1))
df = df[df.index.get_level_values('event').isin(gameweek_range)]

# Spot-check the result on a single player
df.loc['Mohamed Salah'].head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,finished,status,chance_of_playing_this_round,chance_of_playing_next_round,home_flag,goalkeeper_flag,defender_flag,midfielder_flag,forward_flag,goals_season,shots_season,xG_season,time_season,xA_season,assists_season,key_passes_season,npg_season,npxG_season,xGChain_season,xGBuildup_season,goals_WMA,shots_WMA,xG_WMA,time_WMA,xA_WMA,assists_WMA,key_passes_WMA,npg_WMA,npxG_WMA,xGChain_WMA,xGBuildup_WMA,team_xG_season,team_goals_season,team_xGA_season,team_goals_against_season,team_xG_WMA,team_goals_WMA,team_xGA_WMA,team_goals_against_WMA,opponent_xG_season,opponent_goals_season,opponent_xGA_season,opponent_goals_against_season,opponent_xG_WMA,opponent_goals_WMA,opponent_xGA_WMA,opponent_goals_against_WMA,total_points
position,team_title,event,opponent_team_title,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
midfielder,Liverpool,1,Norwich,True,a,,,0,0,0,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,17.0
midfielder,Liverpool,2,Burnley,True,a,,,1,0,0,1,0,1.0,5.0,0.239134,90.0,0.868721,2.0,3.0,1.0,0.239134,1.069634,0.100011,1.0,5.0,0.239134,90.0,0.868721,2.0,3.0,1.0,0.239134,1.069634,0.100011,1.78728,3.0,1.3333,0.0,1.78728,3.0,1.3333,0.0,1.79548,1.0,1.6853,2.0,1.79548,1.0,1.6853,2.0,3.0


time: 128 ms (started: 2021-09-14 16:58:58 +01:00)


## Handle null values

In [4]:
PLAYING_CHANCE_COLUMNS = ['chance_of_playing_this_round', 'chance_of_playing_next_round']

# Row filters, all evaluated against the same incoming frame:
# - drop players whose status is u = "unavailable"
is_available = df['status'] != 'u'
# - drop rows where the player hasn't registered any minutes, for ease
#   (this will also remove GW1 for all players)
has_minutes = df['time_season'].notna()
# - drop rows where it was the opponent team's first game of the season
opponent_has_played = df['opponent_xG_season'].notna()

# Filter, drop the now-redundant status column and replace null values in the
# playing chance columns with 100%. Done as one chain that returns a new frame,
# rather than assigning columns on a filtered slice.
df = (
    df.loc[is_available & has_minutes & opponent_has_played]
    .drop(columns=['status'])
    .fillna({col: 100 for col in PLAYING_CHANCE_COLUMNS})
)

# Check there are no null values remaining (total_points is excluded from the check)
remaining_nulls = df.drop(columns=['total_points']).isna().sum().sum()
if remaining_nulls == 0:
    print('There are no more remaining null values\n')
else:
    print('There exists null values - Investigate\n')

There are no more remaining null values

time: 15.8 ms (started: 2021-09-14 16:59:01 +01:00)


## Transform the "_season" variables
Notice that, for a given player, the "_season" variables get larger every gameweek. This is deliberate: these variables were created to capture the season-long performance of our players. However, because of this design, we should transform them before modelling with them. Namely, we will transform every "_season" variable via the mapping,

<img src="img/mapping.png" /> 

defined by:

<img src="img/mapping_def.png" />    

Without this transformation, these variables are heavily time-dependent, which we do not want. In essence, we are calculating the "per gameweek" rate of each variable: each season total is divided by the number of gameweeks preceding the fixture (`event - 1`). We therefore label the transformed variables with the suffix **"_pgw"**.

In [5]:
# Define season variables
season_variables = ['goals_season','shots_season','xG_season','time_season','xA_season','assists_season',
                    'key_passes_season','npg_season','npxG_season','xGChain_season','xGBuildup_season', 'team_xG_season',
                    'team_goals_season','team_xGA_season','team_goals_against_season','opponent_xG_season',
                    'opponent_goals_season','opponent_xGA_season','opponent_goals_against_season']
SEASON_SUFFIX = '_season'

# Divisor for the per-gameweek rate: the event (i.e. gameweek) number minus one,
# read straight from the index instead of via a temporary column.
# NOTE(review): the divisor keeps growing for gameweeks that have not been played yet;
# confirm the "_season" totals on those rows are meant to be scaled this way.
gameweeks_before_event = df.index.get_level_values('event').to_numpy() - 1

# For every season variable, build its "_pgw" counterpart (season total / gameweeks before the event)
per_gameweek = {
    f'{var[:-len(SEASON_SUFFIX)]}_pgw': df[var] / gameweeks_before_event
    for var in season_variables
}

# Swap the old season variables for the new per-gameweek ones
df = df.drop(columns=season_variables).assign(**per_gameweek)

# Reorder columns: move total_points to the end, whatever the column count is
print(df.shape)
df['total_points'] = df.pop('total_points')
df.head(2)

(2761, 47)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,finished,chance_of_playing_this_round,chance_of_playing_next_round,home_flag,goalkeeper_flag,defender_flag,midfielder_flag,forward_flag,goals_WMA,shots_WMA,xG_WMA,time_WMA,xA_WMA,assists_WMA,key_passes_WMA,npg_WMA,npxG_WMA,xGChain_WMA,xGBuildup_WMA,team_xG_WMA,team_goals_WMA,team_xGA_WMA,team_goals_against_WMA,opponent_xG_WMA,opponent_goals_WMA,opponent_xGA_WMA,opponent_goals_against_WMA,goals_pgw,shots_pgw,xG_pgw,time_pgw,xA_pgw,assists_pgw,key_passes_pgw,npg_pgw,npxG_pgw,xGChain_pgw,xGBuildup_pgw,team_xG_pgw,team_goals_pgw,team_xGA_pgw,team_goals_against_pgw,opponent_xG_pgw,opponent_goals_pgw,opponent_xGA_pgw,opponent_goals_against_pgw,total_points
player_name,position,team_title,event,opponent_team_title,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
Bernd Leno,goalkeeper,Arsenal,2,Chelsea,True,100.0,100.0,1,1,0,0,0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.097028,0.097028,1.02385,0.0,1.88818,2.0,1.18709,3.0,0.321701,0.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.097028,0.097028,1.02385,0.0,1.88818,2.0,1.18709,3.0,0.321701,0.0,2.0
Bernd Leno,goalkeeper,Arsenal,3,Manchester City,True,100.0,100.0,0,1,0,0,0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.032343,0.032343,0.761349,0.0,3.05694,2.0,2.328137,3.333333,0.370053,0.333333,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.048514,0.048514,0.826975,0.0,2.76475,2.0,2.272565,2.5,0.542574,0.5,1.0


time: 49.7 ms (started: 2021-09-14 16:59:05 +01:00)


## Overwrite cleaned data in local directory

In [6]:
# Destination of the cleaned data on the local machine; an existing file is overwritten.
# NOTE(review): absolute, machine-specific path.
cleaned_data_path = "/Users/samharrison/Documents/data_sci/fpl_points_predictor/data/cleaned_data.csv"

# Write the cleaned DataFrame to CSV, including its index
df.to_csv(cleaned_data_path, index=True)

time: 154 ms (started: 2021-09-14 16:59:09 +01:00)
