# Preprocessing

The aim is to create a model for predicting points for players in FPL. Since each position scores points based on different parameters, one regression model is implemented per position (GK, DEF, MID, FWD). As a result, data must be fetched from several .csv files. Furthermore, this allows unnecessary columns to be removed (xG is not relevant for GKs, in the same way that the number of saves is not relevant for FWDs).

In [34]:
import pandas as pd
import os
from utils import *
import csv

### Test set
The test set consists of the final 15 gameweeks of the 2023-2024 season, about 20% of the dataset.

In [35]:
# Number of gameweeks at the end of the 2023-24 season held out as the test set.
TEST_SET_GWS = 15
# Number of gameweeks per season; used together with TEST_SET_GWS to locate
# the first held-out gameweek.
NUM_GWS = 38

### FWD
Creating the dataframe for forwards. For every player at every gameweek, this dataframe contains the player's data from the previous 5 games (as lagged features), as well as opponent-team strength data.

In [62]:
def _add_strength_differences(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add team-vs-opponent strength difference columns to player gameweek rows.

    ``data/<season>/teams.csv`` is read once per season present in ``df`` and
    two float columns are added to ``df`` in place:

    * ``attack_strenght_difference``: attack strength of the player's team minus
      defence strength of the opponent, using the home/away variants that match
      ``was_home``.
    * ``strength_difference``: overall strength of the player's team minus the
      overall strength of the opponent.

    :param df: rows with 'season', 'team' (team name), 'opponent_team' (team id)
        and 'was_home' columns; the index must be unique
    :return: ``df`` with the two columns added
    :raises IndexError: if a team name or opponent id is not found in teams.csv
    """
    team_info_cols = [
        'id', 'name', 'strength', 'strength_attack_home', 'strength_attack_away',
        'strength_defence_home', 'strength_defence_away',
    ]

    attack_diff = pd.Series(float('nan'), index=df.index, dtype='float64')
    strength_diff = pd.Series(float('nan'), index=df.index, dtype='float64')

    for season, rows in df.groupby('season', sort=False):
        # Read each season's team table once instead of once per player row.
        teams = pd.read_csv(f"data/{season}/teams.csv", usecols=team_info_cols)
        # First match wins if a name/id appears more than once.
        by_name = teams.drop_duplicates('name').set_index('name')
        by_id = teams.drop_duplicates('id').set_index('id')

        team = rows['team']
        opponent = rows['opponent_team']
        if not (team.isin(by_name.index).all() and opponent.isin(by_id.index).all()):
            raise IndexError(
                f"Unknown team name or opponent id for season {season} "
                f"in data/{season}/teams.csv"
            )

        home = rows['was_home'].astype(bool)
        # Home rows: own home attack vs. opponent away defence; away rows: the reverse.
        own_attack = team.map(by_name['strength_attack_home']).where(
            home, team.map(by_name['strength_attack_away'])
        )
        opponent_defence = opponent.map(by_id['strength_defence_away']).where(
            home, opponent.map(by_id['strength_defence_home'])
        )

        attack_diff.loc[rows.index] = own_attack - opponent_defence
        strength_diff.loc[rows.index] = (
            team.map(by_name['strength']) - opponent.map(by_id['strength'])
        )

    # The column name spelling is kept as-is: it is part of the returned schema.
    df['attack_strenght_difference'] = attack_diff
    df['strength_difference'] = strength_diff
    return df


def get_FWD_player_data() -> pd.DataFrame:
    """
    Build the training dataframe for forwards (FWD).

    Loads the per-gameweek player data of the 2022-23 season and of the
    2023-24 season up to the held-out gameweeks (see the module-level
    ``NUM_GWS`` and ``TEST_SET_GWS``), keeps only rows whose position is
    "FWD", adds team-vs-opponent strength differences, and adds lag 1..5
    versions of every per-game statistic for each player. The same-gameweek
    statistics are then dropped so that only 'was_home', 'total_points', the
    two strength-difference columns and the ``<feature>_lag<i>`` columns remain.

    The two seasons are concatenated with a fresh, unique index. Previously both
    seasons kept their own row labels, so label-based writes of the strength
    differences could overwrite rows of the other season sharing a label.

    Side effect: prints ``df.info()`` for the resulting frame.

    :return: data of all the FWD players, sorted by name, season and GW
    :raises IndexError: if a team name or opponent id is not found in teams.csv
    """
    FWD_cols = [
        'name', 'position', 'team', 'xP', 'assists', 'bonus', 'bps',
        'creativity', 'expected_assists', 'expected_goal_involvements',
        'expected_goals', 'goals_scored', 'ict_index', 'influence', 'minutes',
        'opponent_team', 'own_goals', 'penalties_missed', 'red_cards',
        'selected', 'team_a_score', 'team_h_score', 'threat', 'total_points',
        'transfers_balance', 'transfers_in', 'transfers_out', 'value', 'was_home',
        'yellow_cards', 'GW',
    ]

    # Getting data from 22/23 season
    df_22_23 = pd.read_csv("data/2022-23/gws/merged_gw.csv", usecols=FWD_cols)
    df_22_23['season'] = '2022-23'

    # Getting data from 23/24 season
    df_23_24 = pd.read_csv("data/2023-24/gws/merged_gw.csv", usecols=FWD_cols)
    df_23_24['season'] = '2023-24'
    # Excluding the test set.
    # NOTE(review): with NUM_GWS=38 and TEST_SET_GWS=15 this removes GW 23..38,
    # i.e. 16 gameweeks rather than 15. Left unchanged so the split stays
    # consistent with wherever the test set is built -- confirm against that code.
    held_out = df_23_24['GW'] >= NUM_GWS - TEST_SET_GWS
    df_23_24 = df_23_24[~held_out]

    # Merging the data; ignore_index gives every row a unique label.
    df = pd.concat([df_22_23, df_23_24], ignore_index=True)

    # Filtering only the FWD players (copy so later column writes hit this frame)
    df = df[df['position'] == "FWD"].copy()

    # Adding information about the opponent team, looking at strength differences
    df = _add_strength_differences(df)

    # Adding lagged features
    NUM_LAGS = 5
    lagged_features = [
        'xP', 'assists', 'bonus', 'bps', 'creativity',
        'expected_assists', 'expected_goal_involvements',
        'expected_goals', 'goals_scored', 'ict_index', 'influence',
        'minutes', 'own_goals', 'penalties_missed', 'red_cards',
        'selected', 'team_a_score', 'team_h_score', 'threat',
        'total_points', 'transfers_balance', 'transfers_in',
        'transfers_out', 'value', 'was_home', 'yellow_cards',
    ]

    # Rows must be in chronological order per player before shifting. Grouping
    # is by name only, so lags carry over from 2022-23 into 2023-24.
    df = df.sort_values(by=['name', 'season', 'GW'], ascending=[True, True, True])
    by_player = df.groupby('name')

    lagged_columns = {}
    for feature in lagged_features:
        for lag in range(1, NUM_LAGS + 1):
            lagged_columns[f'{feature}_lag{lag}'] = by_player[feature].shift(lag)

    # Combine the original DataFrame with the new lagged features at once
    # (avoids fragmenting the frame with 130 single-column inserts).
    df = pd.concat([df, pd.DataFrame(lagged_columns)], axis=1)

    # Removing same-gameweek columns that would not be known before the game,
    # plus identifiers; 'was_home' and 'total_points' are kept.
    columns_to_remove = [
        'name', 'position', 'team', 'xP', 'assists', 'bonus', 'bps',
        'creativity', 'expected_assists', 'expected_goal_involvements',
        'expected_goals', 'goals_scored', 'ict_index', 'influence', 'minutes',
        'opponent_team', 'own_goals', 'penalties_missed', 'red_cards',
        'selected', 'team_a_score', 'team_h_score', 'threat',
        'transfers_balance', 'transfers_in', 'transfers_out', 'value',
        'yellow_cards', 'GW', 'season',
    ]
    df = df.drop(columns=columns_to_remove)

    df.info()

    return df

# Build the forwards dataframe; as the last expression of the cell, the
# returned frame is displayed in the cell output.
get_FWD_player_data()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5185 entries, 228 to 15712
Columns: 133 entries, was_home to yellow_cards_lag5
dtypes: bool(1), float64(127), object(5)
memory usage: 5.3+ MB


Unnamed: 0,was_home,attack_strenght_difference,strength_difference,xP_lag1,xP_lag2,xP_lag3,xP_lag4,xP_lag5,assists_lag1,assists_lag2,...,was_home_lag1,was_home_lag2,was_home_lag3,was_home_lag4,was_home_lag5,yellow_cards_lag1,yellow_cards_lag2,yellow_cards_lag3,yellow_cards_lag4,yellow_cards_lag5
228,True,-50.0,1.0,,,,,,,,...,,,,,,,,,,
866,False,65.0,0.0,0.0,,,,,0.0,,...,True,,,,,0.0,,,,
1487,True,-80.0,0.0,0.0,0.0,,,,0.0,0.0,...,False,True,,,,0.0,0.0,,,
2178,True,-285.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,True,False,True,,,0.0,0.0,0.0,,
2884,False,60.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,...,True,True,False,True,,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12626,False,-10.0,-1.0,3.3,3.3,3.7,4.2,1.5,0.0,0.0,...,True,False,False,True,True,0.0,0.0,0.0,0.0,0.0
13365,True,-340.0,-2.0,3.0,3.3,3.3,3.7,4.2,0.0,0.0,...,False,True,False,False,True,0.0,0.0,0.0,0.0,0.0
14138,False,-130.0,-2.0,2.5,3.0,3.3,3.3,3.7,0.0,0.0,...,True,False,True,False,False,0.0,0.0,0.0,0.0,0.0
14919,True,-70.0,0.0,2.3,2.5,3.0,3.3,3.3,0.0,0.0,...,False,True,False,True,False,0.0,0.0,0.0,0.0,0.0
