In [26]:
# manipulate data
import pandas as pd
import numpy as np 

# utilitaires
from tqdm import tqdm 
import ast
import warnings
warnings.filterwarnings('ignore')

# modeling 
from sklearn.model_selection import train_test_split
#import xgboost as xgb

# evaluate the model
from sklearn.metrics import mean_squared_error

## Import and prepare data

In [27]:
df = pd.read_csv(r'C:\Users\Aqsone\Desktop\football-data\data\raw\freekick_pass_shot.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,50_50,bad_behaviour_card,ball_receipt_outcome,ball_recovery_recovery_failure,block_deflection,block_save_block,carry_end_location,clearance_aerial_won,clearance_body_part,...,goalkeeper_lost_out,shot_follows_dribble,half_start_late_video_start,player_off_permanent,pass_backheel,goalkeeper_lost_in_play,half_end_early_video_end,goalkeeper_penalty_saved_to_post,goalkeeper_saved_to_post,shot_kick_off
0,0,,,,,,,,,,...,,,,,,,,,,
1,1,,,,,,,,,,...,,,,,,,,,,
2,2,,,,,,,,,,...,,,,,,,,,,
3,3,,,,,,,,,,...,,,,,,,,,,
4,4,,,,,,,,,,...,,,,,,,,,,


In [28]:
# drop columns filled with nans
df.dropna(axis=1, how='all', inplace = True)

In [29]:
# recover shots and free kick data
shots_from_fk = []
fk_assist_shot = []

for i in tqdm(range(len(df))):
    if (df['pass_type'][i] == 'Free Kick'):
        rel_shot = df['pass_assisted_shot_id'][i] 
        if pd.notna(rel_shot):
            shots_from_fk.append(rel_shot)
            fk_assist_shot.append(df['id'][i])
            
print("Number of Shots taken from a Pass Free Kick :", len(shots_from_fk))

fk_data = df[df['id'].isin(fk_assist_shot)].drop('Unnamed: 0', axis = 1)
shot_after_fk = df[df['id'].isin(shots_from_fk)].drop('Unnamed: 0', axis = 1)

  0%|          | 0/106908 [00:00<?, ?it/s]

100%|██████████| 106908/106908 [00:00<00:00, 118505.18it/s]

Number of Shots taken from a Pass Free Kick : 2986





In [30]:
# select usefull shot columns
used_cols = ['id', 'shot_key_pass_id', 'duration', 'location', 'minute', 'period', 'position', 'shot_body_part', 'shot_freeze_frame', 'shot_technique', 
             'shot_open_goal', 'shot_statsbomb_xg']

shot_after_fk = shot_after_fk[used_cols]
shot_after_fk.rename(columns={'id':'shot_id', 
                              'shot_key_pass_id':'fk_id', 
                              'duration':'shot_duration', 
                              'location':'shot_location', 
                              'position':'shooter_position'}, 
                    inplace=True)

print("Number of shot columns: ", len(shot_after_fk.columns))
shot_after_fk.head()

Number of shot columns:  12


Unnamed: 0,shot_id,fk_id,shot_duration,shot_location,minute,period,shooter_position,shot_body_part,shot_freeze_frame,shot_technique,shot_open_goal,shot_statsbomb_xg
42,935e22db-fc36-4f7d-80de-91ae9a498112,5347d385-7e79-4fca-bb96-3b5acc98132f,0.043629,"[109.3, 36.4]",37,1,Left Center Back,Head,"[{'location': [109.2, 33.6], 'player': {'id': ...",Normal,,0.04779
43,0b766590-dff1-4670-baae-b960fab33f6d,8a0629a4-fa79-40f3-b262-b992f030d1d4,0.495152,"[101.5, 44.6]",48,1,Right Attacking Midfield,Right Foot,"[{'location': [115.2, 41.6], 'player': {'id': ...",Normal,,0.058214
45,db21527d-2460-46c2-a63f-1a08b143f752,d4bb8cc2-bc4f-4689-b24f-ec9bb2af09dd,0.80247,"[104.3, 55.1]",74,2,Right Center Midfield,Head,"[{'location': [106.6, 47.8], 'player': {'id': ...",Normal,,0.014924
116,9f87daa3-46c6-4c28-989d-3118a5696f2e,1d412622-9200-48a2-8b85-2db51069293f,0.857323,"[106.2, 36.9]",51,2,Left Center Back,Head,"[{'location': [108.2, 39.1], 'player': {'id': ...",Normal,,0.057205
146,1bcb16a6-d77b-4121-bdb7-98169432365c,a72f0216-330d-4391-88d5-4b0b88ace69b,0.104718,"[112.1, 41.4]",49,2,Center Back,Head,"[{'location': [114.8, 32.8], 'player': {'id': ...",Normal,,0.085958


In [31]:
# select usefull free kick columns
used_cols = ['id', 'duration', 'location', 'pass_angle', 'pass_height', 'pass_length', 'pass_switch', 'position']

fk_data = fk_data[used_cols]
fk_data.rename(columns={'id':'fk_id', 
                        'duration':'fk_duration', 
                        'location':'fk_location', 
                        'position':'fk_taker_position'}, 
                inplace=True)

print("Number of free kick columns: ", len(fk_data.columns))
fk_data.head()

Number of free kick columns:  8


Unnamed: 0,fk_id,fk_duration,fk_location,pass_angle,pass_height,pass_length,pass_switch,fk_taker_position
28,5347d385-7e79-4fca-bb96-3b5acc98132f,1.418802,"[91.8, 5.0]",1.062347,High Pass,35.947323,,Left Wing Back
30,8a0629a4-fa79-40f3-b262-b992f030d1d4,1.03145,"[114.6, 61.5]",-2.230199,Ground Pass,21.382704,,Left Wing Back
37,d4bb8cc2-bc4f-4689-b24f-ec9bb2af09dd,2.715643,"[70.5, 13.2]",0.891994,High Pass,53.833538,True,Right Back
109,1d412622-9200-48a2-8b85-2db51069293f,1.154653,"[89.8, 12.2]",0.984665,High Pass,29.648777,,Left Wing Back
133,a72f0216-330d-4391-88d5-4b0b88ace69b,1.555444,"[115.9, 9.6]",1.689729,High Pass,32.026237,,Left Wing Back


In [32]:
# merge the two dataframes
action_data = shot_after_fk.merge(fk_data, how='left', on='fk_id')

print('Number of columns :', len(action_data.columns))

Number of columns : 19


### Preprocessing

In [33]:
# drop id columns
action_data.drop(['shot_id', 'fk_id'], axis=1, inplace=True) 

# convert locations to new columns
def clean_location(string_loc):
    split = string_loc[1:-1].split(',')
    return [float(split[0]), float(split[1])]

action_data['shot_location'] = action_data['shot_location'].apply(clean_location)
action_data['fk_location'] = action_data['fk_location'].apply(clean_location)

# shot_x = []
# shot_y = []
# for loc in action_data['shot_location']:
#     shot_x.append(loc[0])
#     shot_y.append(loc[1])
# action_data['shot_x'] = shot_x
# action_data['shot_y'] = shot_y

# fk_x = []
# fk_y = []
# for loc in action_data['fk_location']:
#     fk_x.append(loc[0])
#     fk_y.append(loc[1])
# action_data['fk_x'] = fk_x
# action_data['fk_y'] = fk_y

# action_data.drop(['shot_location', 'fk_location'], axis=1, inplace=True)

# encode boolean variables in 0/1
action_data['shot_open_goal'] = np.where(action_data['shot_open_goal'], 1, 0)
action_data['pass_switch'] = np.where(action_data['pass_switch'], 1, 0)

# drop freeze frame, store it and get dummies 
f_frame = action_data['shot_freeze_frame']
action_data_encoded = pd.get_dummies(action_data.drop(['shot_location', 'fk_location', 'shot_freeze_frame'], axis=1), dtype=int)
action_data_encoded['shot_freeze_frame'] = f_frame
action_data_encoded['shot_location'] = action_data.shot_location
action_data_encoded['fk_location'] = action_data.fk_location

In [34]:
action_data_encoded.dtypes

shot_duration                        float64
minute                                 int64
period                                 int64
shot_open_goal                         int32
shot_statsbomb_xg                    float64
                                      ...   
fk_taker_position_Right Wing           int32
fk_taker_position_Right Wing Back      int32
shot_freeze_frame                     object
shot_location                         object
fk_location                           object
Length: 74, dtype: object

In [35]:
action_data_encoded.to_csv(r'C:\Users\Aqsone\Desktop\football-data\data\raw\encoded_action_data.csv')

In [38]:
action_data_encoded.columns[40:]

Index(['shot_technique_Lob', 'shot_technique_Normal',
       'shot_technique_Overhead Kick', 'shot_technique_Volley',
       'pass_height_Ground Pass', 'pass_height_High Pass',
       'pass_height_Low Pass', 'fk_taker_position_Center Attacking Midfield',
       'fk_taker_position_Center Back',
       'fk_taker_position_Center Defensive Midfield',
       'fk_taker_position_Center Forward', 'fk_taker_position_Center Midfield',
       'fk_taker_position_Goalkeeper',
       'fk_taker_position_Left Attacking Midfield',
       'fk_taker_position_Left Back', 'fk_taker_position_Left Center Back',
       'fk_taker_position_Left Center Forward',
       'fk_taker_position_Left Center Midfield',
       'fk_taker_position_Left Defensive Midfield',
       'fk_taker_position_Left Midfield', 'fk_taker_position_Left Wing',
       'fk_taker_position_Left Wing Back',
       'fk_taker_position_Right Attacking Midfield',
       'fk_taker_position_Right Back', 'fk_taker_position_Right Center Back',
       '

In [39]:
action_data_encoded.columns[:40]

Index(['shot_duration', 'minute', 'period', 'shot_open_goal',
       'shot_statsbomb_xg', 'fk_duration', 'pass_angle', 'pass_length',
       'pass_switch', 'shooter_position_Center Attacking Midfield',
       'shooter_position_Center Back',
       'shooter_position_Center Defensive Midfield',
       'shooter_position_Center Forward', 'shooter_position_Center Midfield',
       'shooter_position_Goalkeeper',
       'shooter_position_Left Attacking Midfield',
       'shooter_position_Left Back', 'shooter_position_Left Center Back',
       'shooter_position_Left Center Forward',
       'shooter_position_Left Center Midfield',
       'shooter_position_Left Defensive Midfield',
       'shooter_position_Left Midfield', 'shooter_position_Left Wing',
       'shooter_position_Left Wing Back',
       'shooter_position_Right Attacking Midfield',
       'shooter_position_Right Back', 'shooter_position_Right Center Back',
       'shooter_position_Right Center Forward',
       'shooter_position_Right

In [46]:
from src.dataset import read_data

graphs = read_data(data_path=r'C:\Users\Aqsone\Desktop\football-data\data\processed\processed_actions_data.pkl')

In [58]:
len(graphs)

2986

In [57]:
graphs[0].pos

tensor([[109.3000,  36.4000],
        [ 91.8000,   5.0000],
        [109.2000,  33.6000],
        [112.1000,  36.4000],
        [110.1000,  36.5000],
        [100.5000,  20.3000],
        [110.9000,  40.9000],
        [110.8000,  38.7000],
        [112.0000,  37.0000],
        [112.4000,  33.1000],
        [112.4000,  41.6000],
        [110.0000,  41.6000],
        [112.1000,  43.9000],
        [104.0000,  32.1000],
        [102.3000,  17.7000],
        [112.2000,  49.4000],
        [110.0000,  30.5000],
        [118.9000,  39.1000]])