In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from mplsoccer.pitch import Pitch

In [2]:
# Load the data
cwd = os.getcwd()
STATSBOMB = os.path.join(cwd, 'data', 'statsbomb')
df_statsbomb_event = pd.read_parquet(os.path.join(STATSBOMB, 'event.parquet'))
df_statsbomb_related = pd.read_parquet(os.path.join(STATSBOMB, 'related.parquet'))
df_statsbomb_match = pd.read_parquet(os.path.join(STATSBOMB, 'match.parquet'))

In [3]:
# Get goalkeeper for shots
df_gk = df_statsbomb_related.loc[(df_statsbomb_related.type_name == 'Shot') & 
                                 (df_statsbomb_related.type_name_related == 'Goal Keeper'), ['id', 'id_related']]
df_gk = df_gk.merge((df_statsbomb_event[['id', 'player_id', 'type_name', 'sub_type_name']]
                     .rename({'id': 'id_related'}, axis=1)), how='left', on='id_related')

In [4]:
# Pitches for coordinate conversion
pitch_statsbomb = Pitch(pitch_type='statsbomb', figsize=(16, 9))
pitch_statsperform = Pitch(pitch_type='uefa', figsize=(16, 9))

In [5]:
# Create a dataframe of the strongest foot
df_foot = df_statsbomb_event[df_statsbomb_event.body_part_name.isin(['Right Foot', 'Left Foot'])].copy()
df_foot = df_foot.groupby('player_id').body_part_name.value_counts()
df_foot = df_foot.unstack()
df_foot.reset_index(drop=False, inplace=True)
df_foot.index.name = ''
df_foot.rename({'Left Foot': 'left', 'Right Foot': 'right'}, axis=1, inplace=True)
df_foot.fillna(0, inplace=True)
n_total = df_foot[['left', 'right']].sum(axis=1)
df_foot['left'] = df_foot['left'] / n_total
df_foot['right'] = df_foot['right'] / n_total
df_foot.loc[(np.abs(df_foot.left - df_foot.right) < 0.1) & n_total > 20, 'foot'] = 'both'
df_foot.loc[(df_foot.foot.isnull()) & (df_foot.left > df_foot.right), 'foot'] = 'left'
df_foot.loc[(df_foot.foot.isnull()), 'foot'] = 'right'

In [6]:
# Fast attack, win ball in own third, shoot in last quarter in 7-25 seconds
df_statsbomb_event['eventSec'] = (df_statsbomb_event.timestamp_minute * 60 + df_statsbomb_event.timestamp_second +
                                  df_statsbomb_event.timestamp_millisecond/1000)
mask_gk = ((df_statsbomb_event.type_name == 'Goal Keeper') & 
           (~df_statsbomb_event.sub_type_name.isin(['Shot Faced', 'Goal Conceded', 'Penalty Conceded'])))
mask_50 =((df_statsbomb_event.type_name == '50/50') & 
          (df_statsbomb_event.outcome_name.isin(['Success To Team', 'Won','Success To Opposition'])))
mask_other = (df_statsbomb_event.type_name == 'Interception') | (df_statsbomb_event.sub_type_name == 'Tackle')
mask_defence_win = (mask_gk | mask_50 | mask_other) & (df_statsbomb_event.x < 40.08)
df_statsbomb_event.loc[mask_defence_win, 'defence_win'] = df_statsbomb_event.loc[mask_defence_win, 'team_id']
df_statsbomb_event.loc[mask_defence_win, 'defence_sec'] = df_statsbomb_event.loc[mask_defence_win, 'eventSec']
group_match = df_statsbomb_event.groupby(['match_id', 'period'])
df_statsbomb_event[['defence_win', 'defence_sec']] = group_match[['defence_win', 'defence_sec']].ffill()
mask_fast = (((df_statsbomb_event.eventSec - df_statsbomb_event.defence_sec) <= 25) &
             (df_statsbomb_event.x > 90) &
             (df_statsbomb_event.type_name == 'Shot') & (df_statsbomb_event.team_id == df_statsbomb_event.defence_win))
df_statsbomb_event['fast_break'] = mask_fast

In [7]:
# Add on previous info
# first filter out some events so the previous event is the correct assist type
on_ball = ['Ball Recovery','Block','Clearance','Dispossessed','Dribble','Referee Ball-Drop','Shield',
           'Error','Foul Won','Goal Keeper','Interception','Miscontrol','Own Goal Against','Pass','Shot']
# won 50/50s
mask_50_50_success = ((df_statsbomb_event.type_name=='50/50')&
                      (df_statsbomb_event.outcome_name.isin(['Won','Success To Team','Success To Opposition'])))
# off ball goal keeper
mask_goalkeeper_off_ball = df_statsbomb_event.sub_type_name.isin(['Shot Faced', 'Goal Conceded',
                                                                  'Penalty Conceded', 'Smother'])
# all on the ball events
mask_on_ball = ((df_statsbomb_event.type_name.isin(on_ball))&(~mask_goalkeeper_off_ball))|(mask_50_50_success)
df_statsbomb_event = df_statsbomb_event[mask_on_ball].copy()
match_group = df_statsbomb_event.groupby(['match_id', 'period'])
for i in range(1, 4):
    df_statsbomb_event[f'prev_id_{i}'] = match_group.id.shift(i)
    df_statsbomb_event[f'prev_type_name_{i}'] = match_group.type_name.shift(i)
    df_statsbomb_event[f'prev_player_id_{i}'] = match_group.player_id.shift(i)
    df_statsbomb_event[f'prev_team_id_{i}'] = match_group.team_id.shift(i)

In [8]:
# Add set piece column
mask_set_piece = df_statsbomb_event.sub_type_name.isin(['Throw-in', 'Corner', 'Free Kick'])
df_statsbomb_event.loc[mask_set_piece, 'set_piece'] = df_statsbomb_event.loc[mask_set_piece, 'sub_type_name']

In [9]:
# Flag 10 seconds from a corner or freekick/ 20 seconds from a throw-in
group_match = df_statsbomb_event.groupby(['match_id', 'period'])
for set_piece in ['Corner', 'Throw-in', 'Free Kick']:
    mask = df_statsbomb_event.set_piece == set_piece
    name = set_piece.replace(' ', '_').replace('-', '_').lower()
    df_statsbomb_event.loc[mask, f'{name}_sec'] = df_statsbomb_event.loc[mask, 'eventSec']
    df_statsbomb_event.loc[mask, f'{name}_team'] = df_statsbomb_event.loc[mask, 'team_id']
    df_statsbomb_event[f'{name}_sec'] = group_match[f'{name}_sec'].ffill()
    df_statsbomb_event[f'{name}_team'] = group_match[f'{name}_team'].ffill()
    df_statsbomb_event[f'{name}_sec'] = df_statsbomb_event.eventSec - df_statsbomb_event[f'{name}_sec']
df_statsbomb_event.loc[df_statsbomb_event.throw_in_sec > 20, 'throw_in_sec'] = np.nan
df_statsbomb_event.loc[df_statsbomb_event.free_kick_sec > 10, 'free_kick_sec'] = np.nan
df_statsbomb_event.loc[df_statsbomb_event.corner_sec > 10, 'corner_sec'] = np.nan
df_statsbomb_event['play_type'] = df_statsbomb_event[['throw_in_sec', 'free_kick_sec', 'corner_sec']].idxmin(axis=1).str[:-4]
# if throw-in and defensive set to null
mask_defensive = ((df_statsbomb_event.play_type == 'throw_in') &
                  (df_statsbomb_event['throw_in_team'] != df_statsbomb_event.team_id))
df_statsbomb_event.loc[mask_defensive, 'play_type'] = np.nan

In [10]:
# Add player name
player_name_series = df_statsbomb_event.player_name.str.strip().str.replace(pat='"', repl="'").str.split(' ')
df_statsbomb_event['firstName'] = player_name_series.apply(lambda x: x[0] if isinstance(x, list) else None)
df_statsbomb_event['middleName'] = player_name_series.apply(lambda x: ' '.join(x[1:-1]) if isinstance(x, list) else None)
df_statsbomb_event['middleName'] = df_statsbomb_event['middleName'].str.strip()
df_statsbomb_event['lastName'] = player_name_series.apply(lambda x: x[-1] if isinstance(x, list) else None)
df_statsbomb_event['Name'] = ((df_statsbomb_event['firstName'] + ' ' + df_statsbomb_event['middleName']).str.strip()
                              + ' ' + df_statsbomb_event['lastName'])

In [11]:
# Filter Shots
df_statsbomb_shots = (df_statsbomb_event[(df_statsbomb_event.type_name=='Shot')&
                                        (~df_statsbomb_event.sub_type_name.isin(['Corner']))]
                      .dropna(how='all', axis=1)
                      .copy())
print('Number of shots:',len(df_statsbomb_shots))
print('Number of goals:', (df_statsbomb_shots.outcome_name == 'Goal').sum())

Number of shots: 22669
Number of goals: 2826


In [12]:
# Add on competition gender
df_statsbomb_shots = df_statsbomb_shots.merge(df_statsbomb_match[['match_id', 'competition_gender','competition_name',
                                                                  'match_week','home_team_id','away_team_id']], 
                                                                  on='match_id', how='left')
# fill in when the match dataset is missing
women = [7298, 22536, 265905, 266234, 266466, 266574, 266933, 267161, 267405, 267609, 267679]
men = [18235, 18236, 18237, 18240, 18241, 18242, 18245, 18236, 18237, 18240, 18241, 18242, 18243, 18244, 18245]
df_statsbomb_shots.loc[df_statsbomb_shots.match_id.isin(women), 'competition_gender'] = 'female'
df_statsbomb_shots.loc[df_statsbomb_shots.match_id.isin(men), 'competition_gender'] = 'male'
df_statsbomb_shots

Unnamed: 0,match_id,id,index,period,timestamp_minute,timestamp_second,timestamp_millisecond,minute,second,type_id,...,play_type,firstName,middleName,lastName,Name,competition_gender,competition_name,match_week,home_team_id,away_team_id
0,15946,65f16e50-7c5d-4293-b2fc-d20887a772f9,148,1,2,29,94,2,29,16,...,,Lionel,Andrés Messi,Cuccittini,Lionel Andrés Messi Cuccittini,male,La Liga,1.0,217.0,206.0
1,15946,b0f73423-3990-45ae-9dda-3512c2d1aff3,283,1,5,39,239,5,39,16,...,,Jordi,Alba,Ramos,Jordi Alba Ramos,male,La Liga,1.0,217.0,206.0
2,15946,13b1ddab-d22e-43d9-bfe4-12632fea1a27,755,1,15,28,625,15,28,16,...,,Lionel,Andrés Messi,Cuccittini,Lionel Andrés Messi Cuccittini,male,La Liga,1.0,217.0,206.0
3,15946,391bfb74-07a6-4afe-9568-02a9b23f5bd4,788,1,16,19,616,16,19,16,...,,Rubén,Sobrino,Pozuelo,Rubén Sobrino Pozuelo,male,La Liga,1.0,217.0,206.0
4,15946,5e55f5a5-954f-4cc4-ba6e-a9cf6d6e249e,842,1,18,15,914,18,15,16,...,,Luis,Alberto Suárez,Díaz,Luis Alberto Suárez Díaz,male,La Liga,1.0,217.0,206.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22664,7558,82c088ab-199e-425f-a473-7ba420875014,3129,2,44,18,675,89,18,16,...,,Cristian,Gabriel Rodríguez,Barrotti,Cristian Gabriel Rodríguez Barrotti,male,FIFA World Cup,3.0,783.0,796.0
22665,7558,96e16373-a680-4b07-88b9-f69219692837,3134,2,44,47,115,89,47,16,...,corner,Diego,Roberto Godín,Leal,Diego Roberto Godín Leal,male,FIFA World Cup,3.0,783.0,796.0
22666,7558,ee862796-108e-4dbb-9020-fca7dca701bc,3138,2,44,48,195,89,48,16,...,corner,Edinson,Roberto Cavani,Gómez,Edinson Roberto Cavani Gómez,male,FIFA World Cup,3.0,783.0,796.0
22667,7558,9627b537-af10-4437-88de-bb9b17068768,3253,2,49,15,955,94,15,16,...,,Giorgian,Daniel De Arrascaeta,Benedetti,Giorgian Daniel De Arrascaeta Benedetti,male,FIFA World Cup,3.0,783.0,796.0


In [13]:
# StatsBomb: Add on shot fidelity
additional_higher_fidelity_shots = [22536, 265905, 266234, 266466, 266574, 266933, 267161, 267405,
                                    267609, 267679]
df_statsbomb_shots = df_statsbomb_shots.merge(df_statsbomb_match[['match_id', 'metadata_shot_fidelity_version']],
                                              on='match_id', how='left', validate='m:1')
df_statsbomb_shots.loc[df_statsbomb_shots.match_id.isin(additional_higher_fidelity_shots),
                       'metadata_shot_fidelity_version'] = '2'
df_statsbomb_shots

Unnamed: 0,match_id,id,index,period,timestamp_minute,timestamp_second,timestamp_millisecond,minute,second,type_id,...,firstName,middleName,lastName,Name,competition_gender,competition_name,match_week,home_team_id,away_team_id,metadata_shot_fidelity_version
0,15946,65f16e50-7c5d-4293-b2fc-d20887a772f9,148,1,2,29,94,2,29,16,...,Lionel,Andrés Messi,Cuccittini,Lionel Andrés Messi Cuccittini,male,La Liga,1.0,217.0,206.0,2
1,15946,b0f73423-3990-45ae-9dda-3512c2d1aff3,283,1,5,39,239,5,39,16,...,Jordi,Alba,Ramos,Jordi Alba Ramos,male,La Liga,1.0,217.0,206.0,2
2,15946,13b1ddab-d22e-43d9-bfe4-12632fea1a27,755,1,15,28,625,15,28,16,...,Lionel,Andrés Messi,Cuccittini,Lionel Andrés Messi Cuccittini,male,La Liga,1.0,217.0,206.0,2
3,15946,391bfb74-07a6-4afe-9568-02a9b23f5bd4,788,1,16,19,616,16,19,16,...,Rubén,Sobrino,Pozuelo,Rubén Sobrino Pozuelo,male,La Liga,1.0,217.0,206.0,2
4,15946,5e55f5a5-954f-4cc4-ba6e-a9cf6d6e249e,842,1,18,15,914,18,15,16,...,Luis,Alberto Suárez,Díaz,Luis Alberto Suárez Díaz,male,La Liga,1.0,217.0,206.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22664,7558,82c088ab-199e-425f-a473-7ba420875014,3129,2,44,18,675,89,18,16,...,Cristian,Gabriel Rodríguez,Barrotti,Cristian Gabriel Rodríguez Barrotti,male,FIFA World Cup,3.0,783.0,796.0,
22665,7558,96e16373-a680-4b07-88b9-f69219692837,3134,2,44,47,115,89,47,16,...,Diego,Roberto Godín,Leal,Diego Roberto Godín Leal,male,FIFA World Cup,3.0,783.0,796.0,
22666,7558,ee862796-108e-4dbb-9020-fca7dca701bc,3138,2,44,48,195,89,48,16,...,Edinson,Roberto Cavani,Gómez,Edinson Roberto Cavani Gómez,male,FIFA World Cup,3.0,783.0,796.0,
22667,7558,9627b537-af10-4437-88de-bb9b17068768,3253,2,49,15,955,94,15,16,...,Giorgian,Daniel De Arrascaeta,Benedetti,Giorgian Daniel De Arrascaeta Benedetti,male,FIFA World Cup,3.0,783.0,796.0,


In [14]:
# Add on pass assist information
pass_cols = ['pass_assisted_shot_id', 'end_x', 'end_y', 'pass_height_name',
             'pass_switch', 'technique_name', 'pass_cross', 'pass_cut_back']
df_pass = df_statsbomb_event.loc[df_statsbomb_event.pass_assisted_shot_id.notnull(), pass_cols]
df_pass.rename({'pass_assisted_shot_id': 'id', 'end_x': 'pass_end_x', 'end_y': 'pass_end_y'}, axis=1, inplace=True)
df_pass['pass_switch'].fillna(False, inplace=True)
df_pass['pass_cross'].fillna(False, inplace=True)
df_pass['pass_cut_back'].fillna(False, inplace=True)
df_statsbomb_shots = df_statsbomb_shots.merge(df_pass, on='id', how='left', suffixes=['', '_pass'])
df_statsbomb_shots.rename({'technique_name_pass': 'pass_technique_name'}, axis=1, inplace=True)

In [15]:
# Simplify the pass height name column
df_statsbomb_shots.pass_height_name.replace({'Ground Pass': 'Ground/ Low Pass',
                                             'Low Pass': 'Ground/ Low Pass'}, inplace=True)

In [16]:
# Add carry length
df_statsbomb_shots['carry_length'] = (((df_statsbomb_shots.x - df_statsbomb_shots.pass_end_x) / 120 * 115) ** 2 +
                                      ((df_statsbomb_shots.y - df_statsbomb_shots.pass_end_y) / 80 * 74) ** 2) ** 0.5

In [17]:
# Simplify body part name. As wyscout just has either foot and other
df_statsbomb_shots['body_part_name'] = df_statsbomb_shots.body_part_name.replace({'Head': 'Other'})

In [18]:
# Merge on strongest foot
df_statsbomb_shots = df_statsbomb_shots.merge(df_foot, how='left', on='player_id')

In [19]:
# Flag for whether shot is on the strongest foot
df_statsbomb_shots['strong_foot'] = (((df_statsbomb_shots.foot.isin(['left', 'both'])) &
                                     (df_statsbomb_shots.body_part_name == 'Left Foot')) |
                                    ((df_statsbomb_shots.foot.isin(['right', 'both'])) & 
                                     (df_statsbomb_shots.body_part_name == 'Right Foot')))

In [20]:
# Convert coordinates to standard pitch size (105m * 68m)
x_cols = ['x', 'pass_end_x']
y_cols = ['y', 'pass_end_y']
df_statsbomb_shots[x_cols] = (df_statsbomb_shots[x_cols]) / float(pitch_statsbomb.dim.right) * pitch_statsperform.dim.right
df_statsbomb_shots[y_cols] = ((float(pitch_statsbomb.dim.bottom) - df_statsbomb_shots[y_cols]) /
                              float(pitch_statsbomb.dim.bottom)  * pitch_statsperform.dim.top)

In [21]:
# Angles/ distance to goals
left_post, right_post = pitch_statsperform.goal_right
goal_width = abs(right_post - left_post)[1]
dx = abs(pitch_statsperform.dim.right - df_statsbomb_shots.x)
dy = abs(pitch_statsperform.dim.center_width - df_statsbomb_shots.y)
df_statsbomb_shots['visible_angle'] = np.arctan2(goal_width * dx , (dx**2 + dy**2 - (goal_width / 2.) ** 2))
df_statsbomb_shots['middle_angle'] = np.arctan2(dy, dx)
df_statsbomb_shots['distance_to_goal'] = round((dy**2 + dx**2)**0.5, 1)

In [22]:
# Interaction between angle and distance
df_statsbomb_shots['distance_visible_angle'] = df_statsbomb_shots.distance_to_goal * df_statsbomb_shots.visible_angle

In [23]:
# Log distance
df_statsbomb_shots['log_distance_to_goal'] = np.log(df_statsbomb_shots.distance_to_goal)

In [24]:
# Counter attack
df_statsbomb_shots['counter_attack'] = df_statsbomb_shots.play_pattern_name == 'From Counter'

In [25]:
# Assist type
df_statsbomb_shots.loc[df_statsbomb_shots.pass_end_x.notnull(), 'assist_type'] = 'pass'
df_statsbomb_shots.loc[df_statsbomb_shots.sub_type_name.isin(['Free Kick', 'Corner', 'Kick Off']), 'assist_type'] = 'direct'
df_statsbomb_shots.loc[df_statsbomb_shots.assist_type.isnull() &
                       ((df_statsbomb_shots.prev_type_name_3.isin(['Shot', 'Goal Keeper'])) |
                        (df_statsbomb_shots.prev_type_name_2.isin(['Shot', 'Goal Keeper'])) |
                        (df_statsbomb_shots.prev_type_name_1.isin(['Shot', 'Goal Keeper']))), 'assist_type'] = 'rebound'
df_statsbomb_shots.loc[df_statsbomb_shots.assist_type.isnull() &
                       ((df_statsbomb_shots.prev_type_name_3.isin(['Clearance'])) |
                        (df_statsbomb_shots.prev_type_name_2.isin(['Clearance'])) |
                        (df_statsbomb_shots.prev_type_name_1.isin(['Clearance']))), 'assist_type'] = 'clearance'
df_statsbomb_shots.loc[df_statsbomb_shots.assist_type.isnull(), 'assist_type'] = 'recovery'

In [26]:
# Shot type name
df_statsbomb_shots.sub_type_name.replace({'Open Play': np.nan,
                                          'Free Kick': 'direct_set_piece',
                                          'Kick Off': 'direct_set_piece',
                                          'Penalty': 'penalty'}, inplace=True)
mask_null = df_statsbomb_shots.sub_type_name.isnull()
df_statsbomb_shots.loc[mask_null, 'sub_type_name'] = df_statsbomb_shots.loc[mask_null, 'play_type']
df_statsbomb_shots.loc[df_statsbomb_shots.sub_type_name.isnull(), 'sub_type_name'] = 'open_play'
df_statsbomb_shots.rename({'sub_type_name': 'shot_type_name'}, axis=1, inplace=True)
df_statsbomb_shots

Unnamed: 0,match_id,id,index,period,timestamp_minute,timestamp_second,timestamp_millisecond,minute,second,type_id,...,right,foot,strong_foot,visible_angle,middle_angle,distance_to_goal,distance_visible_angle,log_distance_to_goal,counter_attack,assist_type
0,15946,65f16e50-7c5d-4293-b2fc-d20887a772f9,148,1,2,29,94,2,29,16,...,0.116951,left,False,0.367098,0.940045,12.3,4.515301,2.509599,False,pass
1,15946,b0f73423-3990-45ae-9dda-3512c2d1aff3,283,1,5,39,239,5,39,16,...,0.139461,left,True,0.274876,1.127256,12.2,3.353489,2.501436,False,pass
2,15946,13b1ddab-d22e-43d9-bfe4-12632fea1a27,755,1,15,28,625,15,28,16,...,0.116951,left,True,0.286517,0.188550,24.9,7.134262,3.214868,False,recovery
3,15946,391bfb74-07a6-4afe-9568-02a9b23f5bd4,788,1,16,19,616,16,19,16,...,0.861111,right,False,0.725077,0.115344,9.6,6.960740,2.261763,False,pass
4,15946,5e55f5a5-954f-4cc4-ba6e-a9cf6d6e249e,842,1,18,15,914,18,15,16,...,0.802068,right,True,0.290461,0.842331,17.1,4.966883,2.839078,False,clearance
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22664,7558,82c088ab-199e-425f-a473-7ba420875014,3129,2,44,18,675,89,18,16,...,0.096386,left,True,0.240095,0.743918,22.6,5.426144,3.117950,False,pass
22665,7558,96e16373-a680-4b07-88b9-f69219692837,3134,2,44,47,115,89,47,16,...,0.795014,right,False,1.201162,0.160512,5.3,6.366161,1.667707,False,pass
22666,7558,ee862796-108e-4dbb-9020-fca7dca701bc,3138,2,44,48,195,89,48,16,...,0.838710,right,False,2.648089,0.770906,1.2,3.177706,0.182322,False,rebound
22667,7558,9627b537-af10-4437-88de-bb9b17068768,3253,2,49,15,955,94,15,16,...,0.704545,right,False,0.898617,0.506752,7.0,6.290320,1.945910,False,pass


In [27]:
# Add goal boolean
df_statsbomb_shots['goal'] = df_statsbomb_shots.outcome_name == 'Goal'

In [28]:
# Turn some of the StatsBomb columns to boolean columns
df_statsbomb_shots['shot_open_goal'].fillna(False, inplace=True)
df_statsbomb_shots['under_pressure'] = df_statsbomb_shots['under_pressure'].fillna(0).astype(bool)
df_statsbomb_shots['shot_one_on_one'].fillna(False, inplace=True)

In [29]:
# Reduce columns
cols_to_keep = ['match_id', 'id', 'eventSec', 'period', 'goal', 'team_id', 'team_name', 'player_id', 'firstName',
                'middleName', 'lastName', 'Name', 'shot_type_name', 'x', 'y', 'counter_attack',
                'fast_break', 'strong_foot', 'body_part_name', 'assist_type',
                'pass_end_y', 'pass_end_x', 'pass_switch', 'pass_cross',
                'pass_cut_back', 'pass_height_name', 'pass_technique_name',
                'carry_length', 'visible_angle', 'middle_angle', 'distance_to_goal', 
                'distance_visible_angle', 'log_distance_to_goal', 'competition_gender',
                'shot_one_on_one', 'shot_open_goal', 'under_pressure','competition_name','match_week','home_team_id',
                'away_team_id']
df_statsbomb_shots = df_statsbomb_shots[cols_to_keep].copy()

In [30]:
# Add on the goalkeeper player id
df_statsbomb_shots = df_statsbomb_shots.merge(df_gk[['id', 'player_id']], how='left', on='id', suffixes=['', '_goalkeeper'])
df_statsbomb_shots

Unnamed: 0,match_id,id,eventSec,period,goal,team_id,team_name,player_id,firstName,middleName,...,log_distance_to_goal,competition_gender,shot_one_on_one,shot_open_goal,under_pressure,competition_name,match_week,home_team_id,away_team_id,player_id_goalkeeper
0,15946,65f16e50-7c5d-4293-b2fc-d20887a772f9,149.094,1,False,217,Barcelona,5503.0,Lionel,Andrés Messi,...,2.509599,male,False,False,False,La Liga,1.0,217.0,206.0,6629.0
1,15946,b0f73423-3990-45ae-9dda-3512c2d1aff3,339.239,1,False,217,Barcelona,5211.0,Jordi,Alba,...,2.501436,male,False,False,False,La Liga,1.0,217.0,206.0,6629.0
2,15946,13b1ddab-d22e-43d9-bfe4-12632fea1a27,928.625,1,False,217,Barcelona,5503.0,Lionel,Andrés Messi,...,3.214868,male,False,False,False,La Liga,1.0,217.0,206.0,6629.0
3,15946,391bfb74-07a6-4afe-9568-02a9b23f5bd4,979.616,1,False,206,Deportivo Alavés,6613.0,Rubén,Sobrino,...,2.261763,male,False,False,True,La Liga,1.0,217.0,206.0,20055.0
4,15946,5e55f5a5-954f-4cc4-ba6e-a9cf6d6e249e,1095.914,1,False,217,Barcelona,5246.0,Luis,Alberto Suárez,...,2.839078,male,False,False,False,La Liga,1.0,217.0,206.0,6629.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22664,7558,82c088ab-199e-425f-a473-7ba420875014,2658.675,2,False,783,Uruguay,5256.0,Cristian,Gabriel Rodríguez,...,3.117950,male,False,False,True,FIFA World Cup,3.0,783.0,796.0,5172.0
22665,7558,96e16373-a680-4b07-88b9-f69219692837,2687.115,2,False,783,Uruguay,5249.0,Diego,Roberto Godín,...,1.667707,male,False,False,True,FIFA World Cup,3.0,783.0,796.0,5172.0
22666,7558,ee862796-108e-4dbb-9020-fca7dca701bc,2688.195,2,True,783,Uruguay,4319.0,Edinson,Roberto Cavani,...,0.182322,male,False,False,True,FIFA World Cup,3.0,783.0,796.0,5172.0
22667,7558,9627b537-af10-4437-88de-bb9b17068768,2955.955,2,False,783,Uruguay,5258.0,Giorgian,Daniel De Arrascaeta,...,1.945910,male,False,False,False,FIFA World Cup,3.0,783.0,796.0,5172.0


In [31]:
df_statsbomb_shots.loc[(df_statsbomb_shots.team_id == df_statsbomb_shots.home_team_id), 'H_A_column'] = 'Home Team'
df_statsbomb_shots.loc[(df_statsbomb_shots.team_id == df_statsbomb_shots.away_team_id), 'H_A_column'] = 'Away Team'
df_statsbomb_shots

Unnamed: 0,match_id,id,eventSec,period,goal,team_id,team_name,player_id,firstName,middleName,...,competition_gender,shot_one_on_one,shot_open_goal,under_pressure,competition_name,match_week,home_team_id,away_team_id,player_id_goalkeeper,H_A_column
0,15946,65f16e50-7c5d-4293-b2fc-d20887a772f9,149.094,1,False,217,Barcelona,5503.0,Lionel,Andrés Messi,...,male,False,False,False,La Liga,1.0,217.0,206.0,6629.0,Home Team
1,15946,b0f73423-3990-45ae-9dda-3512c2d1aff3,339.239,1,False,217,Barcelona,5211.0,Jordi,Alba,...,male,False,False,False,La Liga,1.0,217.0,206.0,6629.0,Home Team
2,15946,13b1ddab-d22e-43d9-bfe4-12632fea1a27,928.625,1,False,217,Barcelona,5503.0,Lionel,Andrés Messi,...,male,False,False,False,La Liga,1.0,217.0,206.0,6629.0,Home Team
3,15946,391bfb74-07a6-4afe-9568-02a9b23f5bd4,979.616,1,False,206,Deportivo Alavés,6613.0,Rubén,Sobrino,...,male,False,False,True,La Liga,1.0,217.0,206.0,20055.0,Away Team
4,15946,5e55f5a5-954f-4cc4-ba6e-a9cf6d6e249e,1095.914,1,False,217,Barcelona,5246.0,Luis,Alberto Suárez,...,male,False,False,False,La Liga,1.0,217.0,206.0,6629.0,Home Team
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22664,7558,82c088ab-199e-425f-a473-7ba420875014,2658.675,2,False,783,Uruguay,5256.0,Cristian,Gabriel Rodríguez,...,male,False,False,True,FIFA World Cup,3.0,783.0,796.0,5172.0,Home Team
22665,7558,96e16373-a680-4b07-88b9-f69219692837,2687.115,2,False,783,Uruguay,5249.0,Diego,Roberto Godín,...,male,False,False,True,FIFA World Cup,3.0,783.0,796.0,5172.0,Home Team
22666,7558,ee862796-108e-4dbb-9020-fca7dca701bc,2688.195,2,True,783,Uruguay,4319.0,Edinson,Roberto Cavani,...,male,False,False,True,FIFA World Cup,3.0,783.0,796.0,5172.0,Home Team
22667,7558,9627b537-af10-4437-88de-bb9b17068768,2955.955,2,False,783,Uruguay,5258.0,Giorgian,Daniel De Arrascaeta,...,male,False,False,False,FIFA World Cup,3.0,783.0,796.0,5172.0,Home Team


In [32]:
df_statsbomb_event2 = pd.read_parquet(os.path.join(STATSBOMB, 'event.parquet'))
df_statsbomb_event2

Unnamed: 0,match_id,id,index,period,timestamp_minute,timestamp_second,timestamp_millisecond,minute,second,type_id,...,block_save_block,out,shot_open_goal,injury_stoppage_in_chain,shot_follows_dribble,pass_no_touch,dribble_no_touch,half_start_late_video_start,player_off_permanent,half_end_early_video_end
0,15946,ce508a95-38d3-4248-a50e-dc8d7e23230c,1,1,0,0,0,0,0,35,...,,,,,,,,,,
1,15946,39abd1c2-2575-41be-ae79-3e936eead529,2,1,0,0,0,0,0,35,...,,,,,,,,,,
2,15946,1148e967-380f-421f-97f0-d0c820c72984,3,1,0,0,0,0,0,18,...,,,,,,,,,,
3,15946,7cf6876d-7c50-4d64-b999-e2a039641c81,4,1,0,0,0,0,0,18,...,,,,,,,,,,
4,15946,34208ade-2af4-45c3-970e-655937cad938,5,1,0,0,98,0,0,30,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3264,7558,13d33dae-b9c5-4985-81f6-3c8924422414,3265,2,49,29,195,94,29,6,...,,,,,,,,,,
3265,7558,7c9ae5c4-4f39-4eb1-a991-6a9e6a3aa930,3266,2,49,29,528,94,29,23,...,,,,,,,,,,
3266,7558,77ce3eaa-bc32-4736-970b-8bd9cebf7dac,3267,2,49,36,395,94,36,30,...,,,,,,,,,,
3267,7558,bd659952-9bde-4d94-9977-259f42a2c5a5,3268,2,49,37,155,94,37,34,...,,,,,,,,,,


In [33]:
mask_red = (df_statsbomb_event2.foul_committed_card_name == 'Red Card') | (df_statsbomb_event2.foul_committed_card_name == 'Second Yellow')
df_statsbomb_reds = df_statsbomb_event2[mask_red].copy()
df_statsbomb_reds

Unnamed: 0,match_id,id,index,period,timestamp_minute,timestamp_second,timestamp_millisecond,minute,second,type_id,...,block_save_block,out,shot_open_goal,injury_stoppage_in_chain,shot_follows_dribble,pass_no_touch,dribble_no_touch,half_start_late_video_start,player_off_permanent,half_end_early_video_end
1700,15986,4228971b-b34d-47c6-9920-acb70e4ec958,1700,1,32,36,585,32,36,22,...,,,,,,,,,,
3241,16056,28497024-d7ca-4857-af34-e4512e2d289e,3242,2,35,57,123,80,57,22,...,,,,,,,,,,
3188,16095,fad9729c-718f-48fb-a949-e4ccf5d2c1e7,3189,2,30,33,274,75,33,22,...,,,,,,,,,,
2151,16149,c5a8d2c8-3040-4e35-83be-48c0a0cedf72,2152,2,5,28,341,50,28,22,...,,,,,,,,,,
3768,16173,92eeb820-41a5-4ad7-bda8-18db17ef8aff,3769,2,45,32,853,90,32,22,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2557,2275139,f3da86bf-715b-4306-bf88-9312bb10fed7,2558,2,44,38,702,89,38,22,...,,,,,,,,,,
2206,303615,850d9713-221c-4a4a-8661-57eb6b691d15,2207,2,3,23,38,48,23,22,...,,,,,,,,,,
2252,303615,c93f3b66-6983-41a7-9753-0b6259ca0402,2253,2,6,34,128,51,34,22,...,,,,,,,,,,
2986,303634,f6067d32-ccf7-4f54-a3f5-ea358c60da39,2987,2,29,42,589,74,42,22,...,,,,,,,,,,


In [34]:
df_statsbomb_lineup = pd.read_parquet(os.path.join(STATSBOMB, 'lineup.parquet'))
df_statsbomb_lineup.drop(['player_jersey_number','player_country_id','player_country_name','player_nickname'], axis=1, inplace=True)
df_statsbomb_lineup

Unnamed: 0,team_id,team_name,match_id,player_id,player_name
0,217,Barcelona,15946,3109,Malcom Filipe Silva de Oliveira
1,217,Barcelona,15946,3501,Philippe Coutinho Correia
2,217,Barcelona,15946,5203,Sergio Busquets i Burgos
3,217,Barcelona,15946,5211,Jordi Alba Ramos
4,217,Barcelona,15946,5213,Gerard Piqué Bernabéu
...,...,...,...,...,...
31,210,Real Sociedad,9948,6906,Jon Bautista Orgilles
32,210,Real Sociedad,9948,7069,Miguel Ángel Moyà Rumbo
33,217,Barcelona,9948,8652,Jasper Cillessen
34,217,Barcelona,9948,10802,André Filipe Tavares Gomes


In [35]:
df_statsbomb_reds['player_id'] = df_statsbomb_reds['player_id'].astype('int64')
df_statsbomb_reds = df_statsbomb_reds.merge(df_statsbomb_lineup, how='inner')
df_statsbomb_reds = df_statsbomb_reds.merge(df_statsbomb_match[['match_id','home_team_id','away_team_id']], 
                                                                  on='match_id', how='left')
df_statsbomb_reds['players'] = 11 - df_statsbomb_reds.groupby(['match_id','team_id'])['minute'].rank(method="first", ascending=True)
df_statsbomb_reds

Unnamed: 0,match_id,id,index,period,timestamp_minute,timestamp_second,timestamp_millisecond,minute,second,type_id,...,injury_stoppage_in_chain,shot_follows_dribble,pass_no_touch,dribble_no_touch,half_start_late_video_start,player_off_permanent,half_end_early_video_end,home_team_id,away_team_id,players
0,15986,4228971b-b34d-47c6-9920-acb70e4ec958,1700,1,32,36,585,32,36,22,...,,,,,,,,217,211,10.0
1,16056,28497024-d7ca-4857-af34-e4512e2d289e,3242,2,35,57,123,80,57,22,...,,,,,,,,217,218,10.0
2,16095,fad9729c-718f-48fb-a949-e4ccf5d2c1e7,3189,2,30,33,274,75,33,22,...,,,,,,,,221,217,10.0
3,16149,c5a8d2c8-3040-4e35-83be-48c0a0cedf72,2152,2,5,28,341,50,28,22,...,,,,,,,,211,217,10.0
4,16173,92eeb820-41a5-4ad7-bda8-18db17ef8aff,3769,2,45,32,853,90,32,22,...,,,,,,,,215,217,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,2275139,f3da86bf-715b-4306-bf88-9312bb10fed7,2558,2,44,38,702,89,38,22,...,,,,,,,,749,1475,10.0
126,303615,850d9713-221c-4a4a-8661-57eb6b691d15,2207,2,3,23,38,48,23,22,...,,,,,,,,217,214,10.0
127,303615,c93f3b66-6983-41a7-9753-0b6259ca0402,2253,2,6,34,128,51,34,22,...,,,,,,,,217,214,10.0
128,303634,f6067d32-ccf7-4f54-a3f5-ea358c60da39,2987,2,29,42,589,74,42,22,...,,,,,,,,217,422,10.0


In [36]:
df_statsbomb_reds['eventSec'] = (df_statsbomb_reds.timestamp_minute * 60 + df_statsbomb_reds.timestamp_second +
                                  df_statsbomb_reds.timestamp_millisecond/1000)
cols_to_keep = ['match_id', 'id', 'eventSec', 'period', 'team_id','foul_committed_card_name','players','home_team_id','away_team_id']
df_statsbomb_reds = df_statsbomb_reds[cols_to_keep].copy()
df_statsbomb_reds['minute'] = df_statsbomb_reds.eventSec / 60
df_statsbomb_reds.loc[(df_statsbomb_reds.period == 1) & (df_statsbomb_reds.minute > 45), 'minute'] = 45
df_statsbomb_reds.loc[(df_statsbomb_reds.period == 2), 'minute'] = df_statsbomb_reds.minute + 45
df_statsbomb_reds.sort_values('minute', inplace=True)
df_statsbomb_reds

Unnamed: 0,match_id,id,eventSec,period,team_id,foul_committed_card_name,players,home_team_id,away_team_id,minute
116,7541,73653e61-5716-45f1-826f-1f23c18876e2,168.560,1,769,Red Card,10.0,769,778,2.809333
17,266149,aaff2778-09fa-4c16-ad5c-4070d9972275,182.023,1,215,Red Card,10.0,217,215,3.033717
62,68357,7f9b1011-ed47-48c8-b4d9-df989f40a111,807.366,3,1205,Red Card,10.0,852,1205,13.456100
6,18241,b3ba6e39-f232-4c25-86fc-c13f85f2e392,846.888,4,212,Second Yellow,10.0,220,212,14.114800
12,2275072,922d45c2-6944-4c75-8c58-bf01cdae0f10,916.763,1,974,Red Card,10.0,971,974,15.279383
...,...,...,...,...,...,...,...,...,...,...
77,69209,a1530800-ca11-42bc-a6d3-b826849a501c,2876.625,2,217,Second Yellow,9.0,1217,217,92.943750
21,266477,ab13a8ed-1bc7-4cc2-bf84-ebb6ba403882,2912.876,2,222,Second Yellow,10.0,222,217,93.547933
114,7492,24d70da3-2022-4999-ad33-f37f3bcc2757,3014.901,2,763,Red Card,10.0,764,763,95.248350
115,7524,b977ad9c-db48-456b-a1d5-f986cddfec12,3036.340,2,764,Red Card,10.0,762,764,95.605667


In [37]:
df_statsbomb_reds2 = df_statsbomb_reds.copy()
df_statsbomb_reds2.loc[(df_statsbomb_reds2.team_id == df_statsbomb_reds2.home_team_id), 'team_id2'] = df_statsbomb_reds2.away_team_id
df_statsbomb_reds2.loc[(df_statsbomb_reds2.team_id == df_statsbomb_reds2.away_team_id), 'team_id2'] = df_statsbomb_reds2.home_team_id
df_statsbomb_reds2.team_id = df_statsbomb_reds2.team_id2
df_statsbomb_reds2 = df_statsbomb_reds2.rename(columns={"players": "players_rival"})
df_statsbomb_reds2

Unnamed: 0,match_id,id,eventSec,period,team_id,foul_committed_card_name,players_rival,home_team_id,away_team_id,minute,team_id2
116,7541,73653e61-5716-45f1-826f-1f23c18876e2,168.560,1,778.0,Red Card,10.0,769,778,2.809333,778.0
17,266149,aaff2778-09fa-4c16-ad5c-4070d9972275,182.023,1,217.0,Red Card,10.0,217,215,3.033717,217.0
62,68357,7f9b1011-ed47-48c8-b4d9-df989f40a111,807.366,3,852.0,Red Card,10.0,852,1205,13.456100,852.0
6,18241,b3ba6e39-f232-4c25-86fc-c13f85f2e392,846.888,4,220.0,Second Yellow,10.0,220,212,14.114800,220.0
12,2275072,922d45c2-6944-4c75-8c58-bf01cdae0f10,916.763,1,971.0,Red Card,10.0,971,974,15.279383,971.0
...,...,...,...,...,...,...,...,...,...,...,...
77,69209,a1530800-ca11-42bc-a6d3-b826849a501c,2876.625,2,1217.0,Second Yellow,9.0,1217,217,92.943750,1217.0
21,266477,ab13a8ed-1bc7-4cc2-bf84-ebb6ba403882,2912.876,2,217.0,Second Yellow,10.0,222,217,93.547933,217.0
114,7492,24d70da3-2022-4999-ad33-f37f3bcc2757,3014.901,2,764.0,Red Card,10.0,764,763,95.248350,764.0
115,7524,b977ad9c-db48-456b-a1d5-f986cddfec12,3036.340,2,762.0,Red Card,10.0,762,764,95.605667,762.0


In [38]:
df_statsbomb_shots['minute'] = df_statsbomb_shots.eventSec / 60
df_statsbomb_shots.loc[(df_statsbomb_shots.period == 1) & (df_statsbomb_shots.minute > 45), 'minute'] = 45
df_statsbomb_shots.loc[(df_statsbomb_shots.period == 2), 'minute'] = df_statsbomb_shots.minute + 45
df_statsbomb_shots.sort_values('minute', inplace=True)
df_statsbomb_shots

Unnamed: 0,match_id,id,eventSec,period,goal,team_id,team_name,player_id,firstName,middleName,...,shot_one_on_one,shot_open_goal,under_pressure,competition_name,match_week,home_team_id,away_team_id,player_id_goalkeeper,H_A_column,minute
11800,68357,25875da2-cf9d-461f-a423-ef7806fa3603,0.400,5,True,852,Norway Women's,10386.0,Caroline,Graham,...,False,False,False,Women's World Cup,4.0,852.0,1205.0,5066.0,Home Team,0.006667
1050,18243,85011ca9-6869-4a82-8101-8db06eb88d21,0.777,5,True,220,Real Madrid,5200.0,Lucas,Vázquez,...,False,False,False,Champions League,7.0,220.0,212.0,6378.0,Home Team,0.012950
22348,2302764,2a8656c0-311f-4abc-b097-be10d19c3e0b,1.731,5,False,243,AC Milan,34385.0,Claudio,,...,True,False,False,Champions League,1.0,243.0,24.0,34390.0,Home Team,0.028850
19810,7582,d077fefd-b6be-4992-92c8-ae347b6ab2f5,5.627,5,True,772,Spain,5216.0,Andrés,Iniesta,...,False,False,False,FIFA World Cup,4.0,772.0,796.0,5172.0,Home Team,0.093783
19769,7581,539170ab-d9f1-47e6-b4c4-d9a5fbf9599f,7.413,5,False,776,Denmark,3043.0,Christian,Dannemann,...,False,False,False,FIFA World Cup,4.0,785.0,776.0,3444.0,Away Team,0.123550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19505,7567,cee57292-f34a-4595-bb06-f58bafb561ed,3187.680,2,False,770,Germany,5572.0,Mats,,...,False,False,False,FIFA World Cup,3.0,791.0,770.0,5619.0,Away Team,98.128000
19506,7567,60c04bc0-2d71-4bdd-9e54-a095419682a3,3221.093,2,False,770,Germany,5572.0,Mats,,...,False,False,False,FIFA World Cup,3.0,791.0,770.0,5619.0,Away Team,98.684883
12607,69163,322e37fb-a234-47e1-9bbf-1416b94160d5,3227.196,2,False,1206,Canada Women's,15290.0,Ashley,,...,False,False,False,Women's World Cup,4.0,858.0,1206.0,4640.0,Away Team,98.786600
12608,69163,be44b16b-63a0-446a-8517-00d51b93c403,3230.227,2,False,1206,Canada Women's,15180.0,Sophie,,...,False,False,False,Women's World Cup,4.0,858.0,1206.0,4640.0,Away Team,98.837117


In [39]:
df_statsbomb_shots = df_statsbomb_shots.append(df_statsbomb_reds, sort=False,ignore_index=True)
df_statsbomb_shots = df_statsbomb_shots.append(df_statsbomb_reds2, sort=False,ignore_index=True)

In [40]:
df_statsbomb_shots.drop(['team_id2','home_team_id','away_team_id','foul_committed_card_name'], axis=1, inplace=True)
df_statsbomb_shots

Unnamed: 0,match_id,id,eventSec,period,goal,team_id,team_name,player_id,firstName,middleName,...,shot_one_on_one,shot_open_goal,under_pressure,competition_name,match_week,player_id_goalkeeper,H_A_column,minute,players,players_rival
0,68357,25875da2-cf9d-461f-a423-ef7806fa3603,0.400,5,True,852.0,Norway Women's,10386.0,Caroline,Graham,...,False,False,False,Women's World Cup,4.0,5066.0,Home Team,0.006667,,
1,18243,85011ca9-6869-4a82-8101-8db06eb88d21,0.777,5,True,220.0,Real Madrid,5200.0,Lucas,Vázquez,...,False,False,False,Champions League,7.0,6378.0,Home Team,0.012950,,
2,2302764,2a8656c0-311f-4abc-b097-be10d19c3e0b,1.731,5,False,243.0,AC Milan,34385.0,Claudio,,...,True,False,False,Champions League,1.0,34390.0,Home Team,0.028850,,
3,7582,d077fefd-b6be-4992-92c8-ae347b6ab2f5,5.627,5,True,772.0,Spain,5216.0,Andrés,Iniesta,...,False,False,False,FIFA World Cup,4.0,5172.0,Home Team,0.093783,,
4,7581,539170ab-d9f1-47e6-b4c4-d9a5fbf9599f,7.413,5,False,776.0,Denmark,3043.0,Christian,Dannemann,...,False,False,False,FIFA World Cup,4.0,3444.0,Away Team,0.123550,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22924,69209,a1530800-ca11-42bc-a6d3-b826849a501c,2876.625,2,,1217.0,,,,,...,,,,,,,,92.943750,,9.0
22925,266477,ab13a8ed-1bc7-4cc2-bf84-ebb6ba403882,2912.876,2,,217.0,,,,,...,,,,,,,,93.547933,,10.0
22926,7492,24d70da3-2022-4999-ad33-f37f3bcc2757,3014.901,2,,764.0,,,,,...,,,,,,,,95.248350,,10.0
22927,7524,b977ad9c-db48-456b-a1d5-f986cddfec12,3036.340,2,,762.0,,,,,...,,,,,,,,95.605667,,10.0


In [41]:
df_statsbomb_shots.sort_values('minute', inplace=True)
df_statsbomb_shots['players'] = df_statsbomb_shots.groupby(['match_id','team_id'])['players'].fillna(method='ffill')
df_statsbomb_shots['players_rival'] = df_statsbomb_shots.groupby(['match_id','team_id'])['players_rival'].fillna(method='ffill')
df_statsbomb_shots

Unnamed: 0,match_id,id,eventSec,period,goal,team_id,team_name,player_id,firstName,middleName,...,shot_one_on_one,shot_open_goal,under_pressure,competition_name,match_week,player_id_goalkeeper,H_A_column,minute,players,players_rival
0,68357,25875da2-cf9d-461f-a423-ef7806fa3603,0.400,5,True,852.0,Norway Women's,10386.0,Caroline,Graham,...,False,False,False,Women's World Cup,4.0,5066.0,Home Team,0.006667,,
1,18243,85011ca9-6869-4a82-8101-8db06eb88d21,0.777,5,True,220.0,Real Madrid,5200.0,Lucas,Vázquez,...,False,False,False,Champions League,7.0,6378.0,Home Team,0.012950,,
2,2302764,2a8656c0-311f-4abc-b097-be10d19c3e0b,1.731,5,False,243.0,AC Milan,34385.0,Claudio,,...,True,False,False,Champions League,1.0,34390.0,Home Team,0.028850,,
3,7582,d077fefd-b6be-4992-92c8-ae347b6ab2f5,5.627,5,True,772.0,Spain,5216.0,Andrés,Iniesta,...,False,False,False,FIFA World Cup,4.0,5172.0,Home Team,0.093783,,
4,7581,539170ab-d9f1-47e6-b4c4-d9a5fbf9599f,7.413,5,False,776.0,Denmark,3043.0,Christian,Dannemann,...,False,False,False,FIFA World Cup,4.0,3444.0,Away Team,0.123550,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22664,7567,cee57292-f34a-4595-bb06-f58bafb561ed,3187.680,2,False,770.0,Germany,5572.0,Mats,,...,False,False,False,FIFA World Cup,3.0,5619.0,Away Team,98.128000,,
22665,7567,60c04bc0-2d71-4bdd-9e54-a095419682a3,3221.093,2,False,770.0,Germany,5572.0,Mats,,...,False,False,False,FIFA World Cup,3.0,5619.0,Away Team,98.684883,,
22666,69163,322e37fb-a234-47e1-9bbf-1416b94160d5,3227.196,2,False,1206.0,Canada Women's,15290.0,Ashley,,...,False,False,False,Women's World Cup,4.0,4640.0,Away Team,98.786600,,
22667,69163,be44b16b-63a0-446a-8517-00d51b93c403,3230.227,2,False,1206.0,Canada Women's,15180.0,Sophie,,...,False,False,False,Women's World Cup,4.0,4640.0,Away Team,98.837117,,


In [42]:
df_statsbomb_shots['players'].fillna(11, inplace=True)
df_statsbomb_shots['players_rival'].fillna(11, inplace=True)
df_statsbomb_shots.sort_values('minute', inplace=True)
df_statsbomb_shots = df_statsbomb_shots[df_statsbomb_shots['goal'].notna()]
df_statsbomb_shots

Unnamed: 0,match_id,id,eventSec,period,goal,team_id,team_name,player_id,firstName,middleName,...,shot_one_on_one,shot_open_goal,under_pressure,competition_name,match_week,player_id_goalkeeper,H_A_column,minute,players,players_rival
0,68357,25875da2-cf9d-461f-a423-ef7806fa3603,0.400,5,True,852.0,Norway Women's,10386.0,Caroline,Graham,...,False,False,False,Women's World Cup,4.0,5066.0,Home Team,0.006667,11.0,11.0
1,18243,85011ca9-6869-4a82-8101-8db06eb88d21,0.777,5,True,220.0,Real Madrid,5200.0,Lucas,Vázquez,...,False,False,False,Champions League,7.0,6378.0,Home Team,0.012950,11.0,11.0
2,2302764,2a8656c0-311f-4abc-b097-be10d19c3e0b,1.731,5,False,243.0,AC Milan,34385.0,Claudio,,...,True,False,False,Champions League,1.0,34390.0,Home Team,0.028850,11.0,11.0
3,7582,d077fefd-b6be-4992-92c8-ae347b6ab2f5,5.627,5,True,772.0,Spain,5216.0,Andrés,Iniesta,...,False,False,False,FIFA World Cup,4.0,5172.0,Home Team,0.093783,11.0,11.0
4,7581,539170ab-d9f1-47e6-b4c4-d9a5fbf9599f,7.413,5,False,776.0,Denmark,3043.0,Christian,Dannemann,...,False,False,False,FIFA World Cup,4.0,3444.0,Away Team,0.123550,11.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22664,7567,cee57292-f34a-4595-bb06-f58bafb561ed,3187.680,2,False,770.0,Germany,5572.0,Mats,,...,False,False,False,FIFA World Cup,3.0,5619.0,Away Team,98.128000,11.0,11.0
22665,7567,60c04bc0-2d71-4bdd-9e54-a095419682a3,3221.093,2,False,770.0,Germany,5572.0,Mats,,...,False,False,False,FIFA World Cup,3.0,5619.0,Away Team,98.684883,11.0,11.0
22666,69163,322e37fb-a234-47e1-9bbf-1416b94160d5,3227.196,2,False,1206.0,Canada Women's,15290.0,Ashley,,...,False,False,False,Women's World Cup,4.0,4640.0,Away Team,98.786600,11.0,11.0
22667,69163,be44b16b-63a0-446a-8517-00d51b93c403,3230.227,2,False,1206.0,Canada Women's,15180.0,Sophie,,...,False,False,False,Women's World Cup,4.0,4640.0,Away Team,98.837117,11.0,11.0


In [43]:
# Save dataset
df_statsbomb_shots.reset_index(drop=True, inplace=True)
df_statsbomb_shots.to_parquet(os.path.join(STATSBOMB, 'shots.parquet'))

In [44]:
# Show information
df_statsbomb_shots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22669 entries, 0 to 22668
Data columns (total 44 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   match_id                22669 non-null  int64  
 1   id                      22669 non-null  object 
 2   eventSec                22669 non-null  float64
 3   period                  22669 non-null  int64  
 4   goal                    22669 non-null  object 
 5   team_id                 22669 non-null  float64
 6   team_name               22669 non-null  object 
 7   player_id               22669 non-null  float64
 8   firstName               22669 non-null  object 
 9   middleName              22669 non-null  object 
 10  lastName                22669 non-null  object 
 11  Name                    22669 non-null  object 
 12  shot_type_name          22669 non-null  object 
 13  x                       22669 non-null  float64
 14  y                       22669 non-null