In [204]:
# manipulate data
import pandas as pd
import numpy as np 

# utilitaires
from tqdm import tqdm 
import ast
import warnings
warnings.filterwarnings('ignore')

# modeling 
from sklearn.model_selection import train_test_split
#import xgboost as xgb

# evaluate the model
from sklearn.metrics import mean_squared_error

## Import and prepare data

In [206]:
df = pd.read_csv('../data/freekick_pass_shot.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,50_50,bad_behaviour_card,ball_receipt_outcome,ball_recovery_recovery_failure,block_deflection,block_save_block,carry_end_location,clearance_aerial_won,clearance_body_part,...,goalkeeper_lost_out,shot_follows_dribble,half_start_late_video_start,player_off_permanent,pass_backheel,goalkeeper_lost_in_play,half_end_early_video_end,goalkeeper_penalty_saved_to_post,goalkeeper_saved_to_post,shot_kick_off
0,0,,,,,,,,,,...,,,,,,,,,,
1,1,,,,,,,,,,...,,,,,,,,,,
2,2,,,,,,,,,,...,,,,,,,,,,
3,3,,,,,,,,,,...,,,,,,,,,,
4,4,,,,,,,,,,...,,,,,,,,,,


In [207]:
# drop columns filled with nans
df.dropna(axis=1, how='all', inplace = True)

In [208]:
# recover shots and free kick data
shots_from_fk = []
fk_assist_shot = []

for i in tqdm(range(len(df))):
    if (df['pass_type'][i] == 'Free Kick'):
        rel_shot = df['pass_assisted_shot_id'][i] 
        if pd.notna(rel_shot):
            shots_from_fk.append(rel_shot)
            fk_assist_shot.append(df['id'][i])
            
print("Number of Shots taken from a Pass Free Kick :", len(shots_from_fk))

fk_data = df[df['id'].isin(fk_assist_shot)].drop('Unnamed: 0', axis = 1)
shot_after_fk = df[df['id'].isin(shots_from_fk)].drop('Unnamed: 0', axis = 1)

100%|██████████| 106908/106908 [00:03<00:00, 33053.65it/s]


Number of Shots taken from a Pass Free Kick : 2986


In [209]:
# select usefull shot columns
used_cols = ['id', 'shot_key_pass_id', 'duration', 'location', 'minute', 'period', 'position', 'shot_body_part', 'shot_freeze_frame', 'shot_technique', 
             'shot_open_goal', 'shot_statsbomb_xg']

shot_after_fk = shot_after_fk[used_cols]
shot_after_fk.rename(columns={'id':'shot_id', 
                              'shot_key_pass_id':'fk_id', 
                              'duration':'shot_duration', 
                              'location':'shot_location', 
                              'position':'shooter_position'}, 
                    inplace=True)

print("Number of shot columns: ", len(shot_after_fk.columns))
shot_after_fk.head()

Number of shot columns:  12


Unnamed: 0,shot_id,fk_id,shot_duration,shot_location,minute,period,shooter_position,shot_body_part,shot_freeze_frame,shot_technique,shot_open_goal,shot_statsbomb_xg
42,935e22db-fc36-4f7d-80de-91ae9a498112,5347d385-7e79-4fca-bb96-3b5acc98132f,0.043629,"[109.3, 36.4]",37,1,Left Center Back,Head,"[{'location': [109.2, 33.6], 'player': {'id': ...",Normal,,0.04779
43,0b766590-dff1-4670-baae-b960fab33f6d,8a0629a4-fa79-40f3-b262-b992f030d1d4,0.495152,"[101.5, 44.6]",48,1,Right Attacking Midfield,Right Foot,"[{'location': [115.2, 41.6], 'player': {'id': ...",Normal,,0.058214
45,db21527d-2460-46c2-a63f-1a08b143f752,d4bb8cc2-bc4f-4689-b24f-ec9bb2af09dd,0.80247,"[104.3, 55.1]",74,2,Right Center Midfield,Head,"[{'location': [106.6, 47.8], 'player': {'id': ...",Normal,,0.014924
116,9f87daa3-46c6-4c28-989d-3118a5696f2e,1d412622-9200-48a2-8b85-2db51069293f,0.857323,"[106.2, 36.9]",51,2,Left Center Back,Head,"[{'location': [108.2, 39.1], 'player': {'id': ...",Normal,,0.057205
146,1bcb16a6-d77b-4121-bdb7-98169432365c,a72f0216-330d-4391-88d5-4b0b88ace69b,0.104718,"[112.1, 41.4]",49,2,Center Back,Head,"[{'location': [114.8, 32.8], 'player': {'id': ...",Normal,,0.085958


In [210]:
# select usefull free kick columns
used_cols = ['id', 'duration', 'location', 'pass_angle', 'pass_height', 'pass_length', 'pass_switch', 'position']

fk_data = fk_data[used_cols]
fk_data.rename(columns={'id':'fk_id', 
                        'duration':'fk_duration', 
                        'location':'fk_location', 
                        'position':'fk_taker_position'}, 
                inplace=True)

print("Number of free kick columns: ", len(fk_data.columns))
fk_data.head()

Number of free kick columns:  8


Unnamed: 0,fk_id,fk_duration,fk_location,pass_angle,pass_height,pass_length,pass_switch,fk_taker_position
28,5347d385-7e79-4fca-bb96-3b5acc98132f,1.418802,"[91.8, 5.0]",1.062347,High Pass,35.947323,,Left Wing Back
30,8a0629a4-fa79-40f3-b262-b992f030d1d4,1.03145,"[114.6, 61.5]",-2.230199,Ground Pass,21.382704,,Left Wing Back
37,d4bb8cc2-bc4f-4689-b24f-ec9bb2af09dd,2.715643,"[70.5, 13.2]",0.891994,High Pass,53.833538,True,Right Back
109,1d412622-9200-48a2-8b85-2db51069293f,1.154653,"[89.8, 12.2]",0.984665,High Pass,29.648777,,Left Wing Back
133,a72f0216-330d-4391-88d5-4b0b88ace69b,1.555444,"[115.9, 9.6]",1.689729,High Pass,32.026237,,Left Wing Back


In [211]:
# merge the two dataframes
action_data = shot_after_fk.merge(fk_data, how='left', on='fk_id')

print('Number of columns :', len(action_data.columns))

Number of columns : 19


### Preprocessing

In [213]:
# drop id columns
action_data.drop(['shot_id', 'fk_id'], axis=1, inplace=True) 

# convert locations to new columns
def clean_location(string_loc):
    split = string_loc[1:-1].split(',')
    return [float(split[0]), float(split[1])]

action_data['shot_location'] = action_data['shot_location'].apply(clean_location)
action_data['fk_location'] = action_data['fk_location'].apply(clean_location)

shot_x = []
shot_y = []
for loc in action_data['shot_location']:
    shot_x.append(loc[0])
    shot_y.append(loc[1])
action_data['shot_x'] = shot_x
action_data['shot_y'] = shot_y

fk_x = []
fk_y = []
for loc in action_data['fk_location']:
    fk_x.append(loc[0])
    fk_y.append(loc[1])
action_data['fk_x'] = fk_x
action_data['fk_y'] = fk_y

action_data.drop(['shot_location', 'fk_location'], axis=1, inplace=True)

# encode boolean variables in 0/1
action_data['shot_open_goal'] = np.where(action_data['shot_open_goal'], 1, 0)
action_data['pass_switch'] = np.where(action_data['pass_switch'], 1, 0)

# drop freeze frame, store it and get dummies 
f_frame = action_data['shot_freeze_frame']
action_data = pd.get_dummies(action_data.drop('shot_freeze_frame', axis=1), dtype=int)
action_data['shot_freeze_frame'] = f_frame

In [214]:
action_data

Unnamed: 0,shot_duration,minute,period,shot_open_goal,shot_statsbomb_xg,fk_duration,pass_angle,pass_length,pass_switch,shot_x,...,fk_taker_position_Right Attacking Midfield,fk_taker_position_Right Back,fk_taker_position_Right Center Back,fk_taker_position_Right Center Forward,fk_taker_position_Right Center Midfield,fk_taker_position_Right Defensive Midfield,fk_taker_position_Right Midfield,fk_taker_position_Right Wing,fk_taker_position_Right Wing Back,shot_freeze_frame
0,0.043629,37,1,1,0.047790,1.418802,1.062347,35.947323,1,109.3,...,0,0,0,0,0,0,0,0,0,"[{'location': [109.2, 33.6], 'player': {'id': ..."
1,0.495152,48,1,1,0.058214,1.031450,-2.230199,21.382704,1,101.5,...,0,0,0,0,0,0,0,0,0,"[{'location': [115.2, 41.6], 'player': {'id': ..."
2,0.802470,74,2,1,0.014924,2.715643,0.891994,53.833538,1,104.3,...,0,1,0,0,0,0,0,0,0,"[{'location': [106.6, 47.8], 'player': {'id': ..."
3,0.857323,51,2,1,0.057205,1.154653,0.984665,29.648777,1,106.2,...,0,0,0,0,0,0,0,0,0,"[{'location': [108.2, 39.1], 'player': {'id': ..."
4,0.104718,49,2,1,0.085958,1.555444,1.689729,32.026237,1,112.1,...,0,0,0,0,0,0,0,0,0,"[{'location': [114.8, 32.8], 'player': {'id': ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2981,1.562781,40,1,1,0.111835,2.240200,-0.597682,45.848118,1,111.0,...,0,0,1,0,0,0,0,0,0,"[{'location': [111.5, 44.5], 'player': {'id': ..."
2982,1.292559,2,1,1,0.026938,0.981424,-0.919099,12.200410,1,98.4,...,0,0,0,0,0,0,0,0,0,"[{'location': [102.4, 15.4], 'player': {'id': ..."
2983,0.812665,111,4,1,0.095215,2.015200,-1.628336,43.471947,1,113.3,...,0,0,0,0,0,0,0,0,0,"[{'location': [118.8, 38.3], 'player': {'id': ..."
2984,0.930874,69,2,1,0.103957,1.689300,0.895606,34.717430,1,107.5,...,0,0,0,0,0,0,0,0,0,"[{'location': [90.5, 52.0], 'player': {'id': 1..."


In [217]:
action_data.to_csv('../data/actions_data.csv')