In [26]:
# manipulate data
import pandas as pd
import numpy as np 

# utilitaires
from tqdm import tqdm 
import ast
import math
import warnings
warnings.filterwarnings('ignore')

# modeling 
from sklearn.model_selection import train_test_split
#import xgboost as xgb

# evaluate the model
from sklearn.metrics import mean_squared_error

## Import and prepare data

In [2]:
df = pd.read_csv(r'C:\Users\Aqsone\Desktop\football-data\data\raw\freekick_pass_shot.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,50_50,bad_behaviour_card,ball_receipt_outcome,ball_recovery_recovery_failure,block_deflection,block_save_block,carry_end_location,clearance_aerial_won,clearance_body_part,...,goalkeeper_lost_out,shot_follows_dribble,half_start_late_video_start,player_off_permanent,pass_backheel,goalkeeper_lost_in_play,half_end_early_video_end,goalkeeper_penalty_saved_to_post,goalkeeper_saved_to_post,shot_kick_off
0,0,,,,,,,,,,...,,,,,,,,,,
1,1,,,,,,,,,,...,,,,,,,,,,
2,2,,,,,,,,,,...,,,,,,,,,,
3,3,,,,,,,,,,...,,,,,,,,,,
4,4,,,,,,,,,,...,,,,,,,,,,


In [3]:
# drop columns filled with nans
df.dropna(axis=1, how='all', inplace = True)

In [4]:
# recover shots and free kick data
shots_from_fk = []
fk_assist_shot = []

for i in tqdm(range(len(df))):
    if (df['pass_type'][i] == 'Free Kick'):
        rel_shot = df['pass_assisted_shot_id'][i] 
        if pd.notna(rel_shot):
            shots_from_fk.append(rel_shot)
            fk_assist_shot.append(df['id'][i])
            
print("Number of Shots taken from a Pass Free Kick :", len(shots_from_fk))

fk_data = df[df['id'].isin(fk_assist_shot)].drop('Unnamed: 0', axis = 1)
shot_after_fk = df[df['id'].isin(shots_from_fk)].drop('Unnamed: 0', axis = 1)

100%|██████████| 106908/106908 [00:00<00:00, 172912.23it/s]

Number of Shots taken from a Pass Free Kick : 2986





In [5]:
# select usefull shot columns
used_cols = ['id', 'shot_key_pass_id', 'duration', 'location', 'minute', 'period', 'position', 'shot_body_part', 'shot_freeze_frame', 'shot_technique', 
             'shot_open_goal', 'shot_statsbomb_xg']

shot_after_fk = shot_after_fk[used_cols]
shot_after_fk.rename(columns={'id':'shot_id', 
                              'shot_key_pass_id':'fk_id', 
                              'duration':'shot_duration', 
                              'location':'shot_location', 
                              'position':'shooter_position'}, 
                    inplace=True)

print("Number of shot columns: ", len(shot_after_fk.columns))
shot_after_fk.head()

Number of shot columns:  12


Unnamed: 0,shot_id,fk_id,shot_duration,shot_location,minute,period,shooter_position,shot_body_part,shot_freeze_frame,shot_technique,shot_open_goal,shot_statsbomb_xg
42,935e22db-fc36-4f7d-80de-91ae9a498112,5347d385-7e79-4fca-bb96-3b5acc98132f,0.043629,"[109.3, 36.4]",37,1,Left Center Back,Head,"[{'location': [109.2, 33.6], 'player': {'id': ...",Normal,,0.04779
43,0b766590-dff1-4670-baae-b960fab33f6d,8a0629a4-fa79-40f3-b262-b992f030d1d4,0.495152,"[101.5, 44.6]",48,1,Right Attacking Midfield,Right Foot,"[{'location': [115.2, 41.6], 'player': {'id': ...",Normal,,0.058214
45,db21527d-2460-46c2-a63f-1a08b143f752,d4bb8cc2-bc4f-4689-b24f-ec9bb2af09dd,0.80247,"[104.3, 55.1]",74,2,Right Center Midfield,Head,"[{'location': [106.6, 47.8], 'player': {'id': ...",Normal,,0.014924
116,9f87daa3-46c6-4c28-989d-3118a5696f2e,1d412622-9200-48a2-8b85-2db51069293f,0.857323,"[106.2, 36.9]",51,2,Left Center Back,Head,"[{'location': [108.2, 39.1], 'player': {'id': ...",Normal,,0.057205
146,1bcb16a6-d77b-4121-bdb7-98169432365c,a72f0216-330d-4391-88d5-4b0b88ace69b,0.104718,"[112.1, 41.4]",49,2,Center Back,Head,"[{'location': [114.8, 32.8], 'player': {'id': ...",Normal,,0.085958


In [6]:
# select usefull free kick columns
used_cols = ['id', 'duration', 'location', 'pass_angle', 'pass_height', 'pass_length', 'pass_switch', 'position']

fk_data = fk_data[used_cols]
fk_data.rename(columns={'id':'fk_id', 
                        'duration':'fk_duration', 
                        'location':'fk_location', 
                        'position':'fk_taker_position'}, 
                inplace=True)

print("Number of free kick columns: ", len(fk_data.columns))
fk_data.head()

Number of free kick columns:  8


Unnamed: 0,fk_id,fk_duration,fk_location,pass_angle,pass_height,pass_length,pass_switch,fk_taker_position
28,5347d385-7e79-4fca-bb96-3b5acc98132f,1.418802,"[91.8, 5.0]",1.062347,High Pass,35.947323,,Left Wing Back
30,8a0629a4-fa79-40f3-b262-b992f030d1d4,1.03145,"[114.6, 61.5]",-2.230199,Ground Pass,21.382704,,Left Wing Back
37,d4bb8cc2-bc4f-4689-b24f-ec9bb2af09dd,2.715643,"[70.5, 13.2]",0.891994,High Pass,53.833538,True,Right Back
109,1d412622-9200-48a2-8b85-2db51069293f,1.154653,"[89.8, 12.2]",0.984665,High Pass,29.648777,,Left Wing Back
133,a72f0216-330d-4391-88d5-4b0b88ace69b,1.555444,"[115.9, 9.6]",1.689729,High Pass,32.026237,,Left Wing Back


In [7]:
# merge the two dataframes
action_data = shot_after_fk.merge(fk_data, how='left', on='fk_id')

print('Number of columns :', len(action_data.columns))

Number of columns : 19


### Preprocessing

In [8]:
# drop id columns
action_data.drop(['shot_id', 'fk_id'], axis=1, inplace=True) 

# convert locations to new columns
def clean_location(string_loc):
    split = string_loc[1:-1].split(',')
    return [float(split[0]), float(split[1])]

action_data['shot_location'] = action_data['shot_location'].apply(clean_location)
action_data['fk_location'] = action_data['fk_location'].apply(clean_location)

# shot_x = []
# shot_y = []
# for loc in action_data['shot_location']:
#     shot_x.append(loc[0])
#     shot_y.append(loc[1])
# action_data['shot_x'] = shot_x
# action_data['shot_y'] = shot_y

# fk_x = []
# fk_y = []
# for loc in action_data['fk_location']:
#     fk_x.append(loc[0])
#     fk_y.append(loc[1])
# action_data['fk_x'] = fk_x
# action_data['fk_y'] = fk_y

# action_data.drop(['shot_location', 'fk_location'], axis=1, inplace=True)

# encode boolean variables in 0/1
action_data['shot_open_goal'] = np.where(action_data['shot_open_goal'], 1, 0)
action_data['pass_switch'] = np.where(action_data['pass_switch'], 1, 0)

# drop freeze frame, store it and get dummies 
f_frame = action_data['shot_freeze_frame']
action_data_encoded = pd.get_dummies(action_data.drop(['shot_location', 'fk_location', 'shot_freeze_frame'], axis=1), dtype=int)
action_data_encoded['shot_freeze_frame'] = f_frame
action_data_encoded['shot_location'] = action_data.shot_location
action_data_encoded['fk_location'] = action_data.fk_location

In [9]:
action_data_encoded.dtypes

shot_duration                        float64
minute                                 int64
period                                 int64
shot_open_goal                         int32
shot_statsbomb_xg                    float64
                                      ...   
fk_taker_position_Right Wing           int32
fk_taker_position_Right Wing Back      int32
shot_freeze_frame                     object
shot_location                         object
fk_location                           object
Length: 74, dtype: object

In [35]:
action_data_encoded.to_csv(r'C:\Users\Aqsone\Desktop\football-data\data\raw\encoded_action_data.csv')

In [38]:
action_data_encoded.columns[40:]

Index(['shot_technique_Lob', 'shot_technique_Normal',
       'shot_technique_Overhead Kick', 'shot_technique_Volley',
       'pass_height_Ground Pass', 'pass_height_High Pass',
       'pass_height_Low Pass', 'fk_taker_position_Center Attacking Midfield',
       'fk_taker_position_Center Back',
       'fk_taker_position_Center Defensive Midfield',
       'fk_taker_position_Center Forward', 'fk_taker_position_Center Midfield',
       'fk_taker_position_Goalkeeper',
       'fk_taker_position_Left Attacking Midfield',
       'fk_taker_position_Left Back', 'fk_taker_position_Left Center Back',
       'fk_taker_position_Left Center Forward',
       'fk_taker_position_Left Center Midfield',
       'fk_taker_position_Left Defensive Midfield',
       'fk_taker_position_Left Midfield', 'fk_taker_position_Left Wing',
       'fk_taker_position_Left Wing Back',
       'fk_taker_position_Right Attacking Midfield',
       'fk_taker_position_Right Back', 'fk_taker_position_Right Center Back',
       '

In [39]:
action_data_encoded.columns[:40]

Index(['shot_duration', 'minute', 'period', 'shot_open_goal',
       'shot_statsbomb_xg', 'fk_duration', 'pass_angle', 'pass_length',
       'pass_switch', 'shooter_position_Center Attacking Midfield',
       'shooter_position_Center Back',
       'shooter_position_Center Defensive Midfield',
       'shooter_position_Center Forward', 'shooter_position_Center Midfield',
       'shooter_position_Goalkeeper',
       'shooter_position_Left Attacking Midfield',
       'shooter_position_Left Back', 'shooter_position_Left Center Back',
       'shooter_position_Left Center Forward',
       'shooter_position_Left Center Midfield',
       'shooter_position_Left Defensive Midfield',
       'shooter_position_Left Midfield', 'shooter_position_Left Wing',
       'shooter_position_Left Wing Back',
       'shooter_position_Right Attacking Midfield',
       'shooter_position_Right Back', 'shooter_position_Right Center Back',
       'shooter_position_Right Center Forward',
       'shooter_position_Right

In [63]:
data = pd.read_csv(r'C:\Users\Aqsone\Desktop\football-data\data\raw\encoded_action_data_xgb.csv')
data['freeze_frame'] = f_frame

In [64]:
def euclidean_distance(loc1, loc2):
    return math.sqrt((loc1[0] - loc2[0]) ** 2 + (loc1[1] - loc2[1]) ** 2)

def vector_from_points(p1, p2):
    return [p2[0] - p1[0], p2[1] - p1[1]]

def dot_product(v1, v2):
    return v1[0] * v2[0] + v1[1] * v2[1]

def magnitude(v):
    return math.sqrt(v[0]**2 + v[1]**2)

def goal_player_angle(loc1, loc2):
    player_player = vector_from_points(loc1, loc2)
    player_goal = vector_from_points(loc2, [120, 40])
    dot = dot_product(player_player, player_goal)
    mag1 = magnitude(player_player)
    mag2 = magnitude(player_goal)
    cos_theta = dot / (mag1 * mag2)
    cos_theta = max(-1, min(1, cos_theta))
    theta = math.acos(cos_theta)
    return math.degrees(theta)

players_close_dist = []
players_goal_angle = []
players_teammates = []

for i in range(len(data)):
    dist = []
    angle = []
    teammate = []

    target_location = [data['shot_x'][i], data['shot_y'][i]]
    players = ast.literal_eval(data.freeze_frame[i])
    for player in players:
        player['distance'] = euclidean_distance(player['location'], target_location)
        player['angle'] = goal_player_angle(player['location'], target_location)

    sorted_players = sorted(players, key=lambda x: x['distance'])

    closest_4_players = sorted_players[:4]

    for player in closest_4_players:
        dist.append(player['distance'])
        angle.append(player['angle'])
        teammate.append(1 if player['teammate'] else 0)

    players_close_dist.append(dist)
    players_goal_angle.append(angle)
    players_teammates.append(teammate)

data['close_players_distance'] = players_close_dist
data['close_players_angle'] = players_goal_angle
data['teammates'] = players_teammates

In [65]:
distance_expanded = pd.DataFrame(data['close_players_distance'].tolist(), columns=[f'distance_player_{i+1}' for i in range(data['close_players_distance'].apply(len).max())])
angle_expanded = pd.DataFrame(data['close_players_angle'].tolist(), columns=[f'angle_player_{i+1}' for i in range(data['close_players_angle'].apply(len).max())])
teammate_expanded = pd.DataFrame(data['teammates'].tolist(), columns=[f'teammates_player_{i+1}' for i in range(data['teammates'].apply(len).max())])

df = pd.concat([data.drop(columns=['close_players_distance', 'close_players_angle', 'teammates']), distance_expanded, angle_expanded, teammate_expanded], axis=1)

In [70]:
df.dropna(inplace = True)

In [73]:
for col in df.columns[-3:]:
    df[col] = df[col].astype(int)

In [74]:
df

Unnamed: 0.1,Unnamed: 0,shot_duration,minute,period,shot_open_goal,shot_statsbomb_xg,fk_duration,pass_angle,pass_length,pass_switch,...,distance_player_3,distance_player_4,angle_player_1,angle_player_2,angle_player_3,angle_player_4,teammates_player_1,teammates_player_2,teammates_player_3,teammates_player_4
0,0,0.043629,37,1,1,0.047790,1.418802,1.062347,35.947323,1,...,2.765863,2.800000,168.529576,141.706783,173.933367,161.404559,0,0,0,1
1,1,0.495152,48,1,1,0.058214,1.031450,-2.230199,21.382704,1,...,11.886968,13.685394,177.376010,169.948224,146.917940,167.960002,0,1,1,0
2,2,0.802470,74,2,1,0.014924,2.715643,0.891994,53.833538,1,...,1.529706,4.204759,69.314601,96.009006,122.574057,131.157679,0,1,0,1
3,3,0.857323,51,2,1,0.057205,1.154653,0.984665,29.648777,1,...,2.816026,2.973214,130.419164,67.473570,150.835016,144.934312,1,0,1,0
4,4,0.104718,49,2,1,0.085958,1.555444,1.689729,32.026237,1,...,3.505710,3.546830,169.950651,126.614400,96.778861,11.452086,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2981,2981,1.562781,40,1,1,0.111835,2.240200,-0.597682,45.848118,1,...,2.109502,4.052160,84.610688,120.323607,2.260502,107.938894,0,0,0,0
2982,2982,1.292559,2,1,1,0.026938,0.981424,-0.919099,12.200410,1,...,3.780212,8.676405,178.443788,99.512851,110.901521,95.647906,0,0,0,0
2983,2983,0.812665,111,4,1,0.095215,2.015200,-1.628336,43.471947,1,...,5.474486,5.500909,99.009355,63.009216,30.927028,50.652276,0,0,0,1
2984,2984,0.930874,69,2,1,0.103957,1.689300,0.895606,34.717430,1,...,2.400000,4.301163,37.255289,52.744711,97.744711,64.662865,0,0,0,0


In [75]:
df.drop(['Unnamed: 0', 'freeze_frame', 'shot_duration'], axis = 1, inplace = True)

In [76]:
df.to_csv(r'C:\Users\Aqsone\Desktop\football-data\data\raw\encoded_action_data_xgb_v2.csv', index = False)

In [77]:
df

Unnamed: 0,minute,period,shot_open_goal,shot_statsbomb_xg,fk_duration,pass_angle,pass_length,pass_switch,shot_x,shot_y,...,distance_player_3,distance_player_4,angle_player_1,angle_player_2,angle_player_3,angle_player_4,teammates_player_1,teammates_player_2,teammates_player_3,teammates_player_4
0,37,1,1,0.047790,1.418802,1.062347,35.947323,1,109.3,36.4,...,2.765863,2.800000,168.529576,141.706783,173.933367,161.404559,0,0,0,1
1,48,1,1,0.058214,1.031450,-2.230199,21.382704,1,101.5,44.6,...,11.886968,13.685394,177.376010,169.948224,146.917940,167.960002,0,1,1,0
2,74,2,1,0.014924,2.715643,0.891994,53.833538,1,104.3,55.1,...,1.529706,4.204759,69.314601,96.009006,122.574057,131.157679,0,1,0,1
3,51,2,1,0.057205,1.154653,0.984665,29.648777,1,106.2,36.9,...,2.816026,2.973214,130.419164,67.473570,150.835016,144.934312,1,0,1,0
4,49,2,1,0.085958,1.555444,1.689729,32.026237,1,112.1,41.4,...,3.505710,3.546830,169.950651,126.614400,96.778861,11.452086,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2981,40,1,1,0.111835,2.240200,-0.597682,45.848118,1,111.0,34.0,...,2.109502,4.052160,84.610688,120.323607,2.260502,107.938894,0,0,0,0
2982,2,1,1,0.026938,0.981424,-0.919099,12.200410,1,98.4,26.7,...,3.780212,8.676405,178.443788,99.512851,110.901521,95.647906,0,0,0,0
2983,111,4,1,0.095215,2.015200,-1.628336,43.471947,1,113.3,34.3,...,5.474486,5.500909,99.009355,63.009216,30.927028,50.652276,0,0,0,1
2984,69,2,1,0.103957,1.689300,0.895606,34.717430,1,107.5,38.3,...,2.400000,4.301163,37.255289,52.744711,97.744711,64.662865,0,0,0,0
