# Packages

In [None]:
import pandas as pd

# Time each cell as it runs via the `autotime` IPython extension
# (this is a notebook magic, so it only works inside IPython/Jupyter;
# presumably the extension must be installed separately - TODO confirm)
%load_ext autotime
# %unload_ext autotime

# Suppress all Python warnings for the rest of the notebook and let pandas
# display up to 60 columns before truncating a DataFrame's output
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 60)

# Data Cleaning

## Read in raw data from local directory

In [None]:
# Load the raw dataset from its (machine-specific, absolute) location on disk
# and preview the first two rows
raw_data_path = "/Users/samharrison/Documents/data_sci/fpl_points_predictor/data/raw_data.csv"
df_raw_data = pd.read_csv(filepath_or_buffer=raw_data_path)
df_raw_data.head(2)

## Manipulate DataFrame

In [None]:
# Drop unnecessary columns
unused_columns = ['FPL_id', 'Understat_id', 'team', 'opponent_team', 'element_type', 'value']
df = df_raw_data.drop(columns=unused_columns)

# Encode home_or_away as a binary flag (1 when the value is 'H', otherwise 0),
# then discard the original text column
df.insert(8, 'home_flag', df['home_or_away'].eq('H').astype('int64'))
df = df.drop(columns=['home_or_away'])

# One binary flag per playing position, inserted at consecutive column
# positions starting from 8 (the 'position' column itself is kept)
for offset, role in enumerate(['goalkeeper', 'defender', 'midfielder', 'forward']):
    df.insert(8 + offset, f'{role}_flag', df['position'].eq(role).astype('int64'))

# Identify each row by player / position / team / gameweek / opponent
df = df.set_index(['player_name', 'position', 'team_title', 'event', 'opponent_team_title'])

# Reorder columns: each listed column is popped and re-inserted at a fixed
# position. NOTE(review): the positions are hard-coded, so they depend on the
# exact column layout of the raw data.
column_moves = (
    (2, 'chance_of_playing_this_round'),
    (3, 'chance_of_playing_next_round'),
    (47, 'total_points'),
)
for target_position, column_name in column_moves:
    df.insert(target_position, column_name, df.pop(column_name))

# Trim the dataset to look forward only 5 gameweeks into the future:
# find the latest gameweek that has a finished row, then keep gameweeks
# 0 .. prev_gw + 5 inclusive
finished_rows = df[df['finished'] == True]
prev_gw = finished_rows.index.get_level_values('event').max()
gameweek_range = list(range(0, prev_gw + 6))
within_range = df.index.get_level_values('event').isin(gameweek_range)
df = df[within_range]

df.loc['Mohamed Salah'].head(2)

## Handle null values

In [None]:
# Exclude players whose status is "u" (unavailable); the status column is
# dropped afterwards
is_available = df['status'].ne('u')
df = df[is_available].drop(columns=['status'])

# Keep only rows that have a recorded time_season value, i.e. drop rows for
# players with no registered minutes (per the original author's note, this
# also removes GW1 for all players)
df = df[df['time_season'].notna()]

# A missing playing-chance value is replaced with 100%
for chance_column in ('chance_of_playing_this_round', 'chance_of_playing_next_round'):
    df[chance_column] = df[chance_column].fillna(100)

# Drop rows with no opponent season xG (the opponent's first game of the season)
df = df[df['opponent_xG_season'].notna()]

# Sanity check: count nulls in every column except total_points, which is
# left out of the check and so may still contain nulls
remaining_nulls = df.drop(columns=['total_points']).isna().sum().sum()
if remaining_nulls != 0:
    print('There exists null values - Investigate\n')
else:
    print('There are no more remaining null values\n')

## Transform the  "_season" variables
Notice that, for a given player, the "_season" variables grow larger every gameweek. This is deliberate: these variables were created to capture the season-long performance of our players. Because of this design, however, we should transform them before modelling with them. Namely, we will transform every "_season" variable via the mapping,

<img src="img/mapping.png" />    

defined by:

<img src="img/mapping_def.png" />    

Without this transformation, these variables are heavily time-dependent, which we do not want. In essence, we are calculating the "per gameweek" rate of each variable. We therefore label the transformed variables with the suffix **"_pgw"**.

<img src="img/mapping.png" />    

In [None]:
# Cumulative "_season" columns that are converted to per-gameweek rates
season_variables = [
    'goals_season', 'shots_season', 'xG_season', 'time_season', 'xA_season', 'assists_season',
    'key_passes_season', 'npg_season', 'npxG_season', 'xGChain_season', 'xGBuildup_season',
    'team_xG_season', 'team_goals_season', 'team_xGA_season', 'team_goals_against_season',
    'opponent_xG_season', 'opponent_goals_season', 'opponent_xGA_season',
    'opponent_goals_against_season',
]

# Divisor for every row: its gameweek number (the 'event' index level) minus one
gameweeks_elapsed = df.index.get_level_values('event').to_numpy() - 1

# For every season variable build its "<name>_pgw" counterpart;
# var[:-7] strips the trailing "_season" from the column name
per_gameweek_rates = {
    f'{var[:-7]}_pgw': df[var] / gameweeks_elapsed
    for var in season_variables
}

# Swap the old season columns for the new per-gameweek columns (appended at the end)
df = df.drop(columns=season_variables).assign(**per_gameweek_rates)

# Reorder columns: move total_points to column position 46
# NOTE(review): hard-coded position - check it against the shape printed here
print(df.shape)
df.insert(46, 'total_points', df.pop('total_points'))
df.head(2)

In [None]:
# Write the cleaned DataFrame (including its index levels) to a
# machine-specific, absolute path on disk
cleaned_data_path = "/Users/samharrison/Documents/data_sci/fpl_points_predictor/data/cleaned_data.csv"
df.to_csv(cleaned_data_path, index=True)