In [1]:
import pandas as pd
import numpy as np
import os
from mplsoccer.pitch import Pitch
from shapely.geometry import MultiPoint, Polygon, Point

In [2]:
# Read the StatsBomb event and freeze-frame tables from <working directory>/data/statsbomb
cwd = os.getcwd()
STATSBOMB = os.path.join(cwd, 'data', 'statsbomb')
event_path = os.path.join(STATSBOMB, 'event.parquet')
freeze_path = os.path.join(STATSBOMB, 'freeze.parquet')
df_statsbomb_event = pd.read_parquet(event_path)
df_statsbomb_freeze = pd.read_parquet(freeze_path)
# Display the freeze-frame table (one row per player per shot) as the cell output
df_statsbomb_freeze

Unnamed: 0,id,event_freeze_id,player_teammate,player_id,player_name,player_position_id,player_position_name,x,y,match_id
0,65f16e50-7c5d-4293-b2fc-d20887a772f9,1,True,6379,Sergi Roberto Carnicer,12,Right Midfield,96.4,53.9,15946
1,b0f73423-3990-45ae-9dda-3512c2d1aff3,1,False,6612,Rubén Duarte Sánchez,6,Left Back,106.0,54.5,15946
2,13b1ddab-d22e-43d9-bfe4-12632fea1a27,1,True,5477,Ousmane Dembélé,16,Left Midfield,100.3,37.2,15946
3,391bfb74-07a6-4afe-9568-02a9b23f5bd4,1,False,5211,Jordi Alba Ramos,6,Left Back,110.0,39.8,15946
4,5e55f5a5-954f-4cc4-ba6e-a9cf6d6e249e,1,False,6626,Mubarak Wakaso,13,Right Center Midfield,105.3,52.3,15946
...,...,...,...,...,...,...,...,...,...,...
244,69116a37-5b99-442c-8bba-906ada0d1de0,17,True,6300,Sebastián Coates Nión,3,Right Center Back,106.0,34.0,7558
245,9064c65e-c437-4519-933d-29401c98799f,17,False,5677,Fedor Kudryashov,6,Left Back,106.0,35.0,7558
246,440fd15e-6001-48a8-a42d-d7459a22bb42,17,True,5248,José Martín Cáceres Silva,2,Right Back,63.0,49.0,7558
247,69116a37-5b99-442c-8bba-906ada0d1de0,18,True,5255,Matías Vecino Falero,16,Left Midfield,114.0,33.0,7558


In [3]:
# Keep only the shot events; .copy() so later edits never touch the event table
is_shot = df_statsbomb_event['type_name'].eq('Shot')
df_statsbomb_shot = df_statsbomb_event.loc[is_shot].copy()
df_statsbomb_shot

Unnamed: 0,match_id,id,index,period,timestamp_minute,timestamp_second,timestamp_millisecond,minute,second,type_id,...,block_save_block,out,shot_open_goal,injury_stoppage_in_chain,shot_follows_dribble,pass_no_touch,dribble_no_touch,half_start_late_video_start,player_off_permanent,half_end_early_video_end
147,15946,65f16e50-7c5d-4293-b2fc-d20887a772f9,148,1,2,29,94,2,29,16,...,,,,,,,,,,
282,15946,b0f73423-3990-45ae-9dda-3512c2d1aff3,283,1,5,39,239,5,39,16,...,,,,,,,,,,
754,15946,13b1ddab-d22e-43d9-bfe4-12632fea1a27,755,1,15,28,625,15,28,16,...,,,,,,,,,,
787,15946,391bfb74-07a6-4afe-9568-02a9b23f5bd4,788,1,16,19,616,16,19,16,...,,,,,,,,,,
841,15946,5e55f5a5-954f-4cc4-ba6e-a9cf6d6e249e,842,1,18,15,914,18,15,16,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3128,7558,82c088ab-199e-425f-a473-7ba420875014,3129,2,44,18,675,89,18,16,...,,,,,,,,,,
3133,7558,96e16373-a680-4b07-88b9-f69219692837,3134,2,44,47,115,89,47,16,...,,,,,,,,,,
3137,7558,ee862796-108e-4dbb-9020-fca7dca701bc,3138,2,44,48,195,89,48,16,...,,,,,,,,,,
3252,7558,9627b537-af10-4437-88de-bb9b17068768,3253,2,49,15,955,94,15,16,...,,,,,,,,,,


In [4]:
# Drop the freeze frames of penalty shots (scored or not): keep only rows whose
# shot id belongs to a shot with a sub type other than 'Penalty'
non_penalty_id = df_statsbomb_shot.loc[df_statsbomb_shot['sub_type_name'].ne('Penalty'), 'id']
keep_freeze_row = df_statsbomb_freeze['id'].isin(non_penalty_id)
df_statsbomb_freeze = df_statsbomb_freeze.loc[keep_freeze_row].copy()

In [5]:
# Add the shot taker to the freeze frame.
# One row per shot is appended, flagged as a teammate and given event_freeze_id 0,
# which the feature calculation uses to locate the shooter within each frame.
cols_to_keep = ['id','player_id','player_name','position_id','position_name','x','y','match_id']
freeze_ids = df_statsbomb_freeze.id.unique()
df_shot_taker = (
    df_statsbomb_shot.loc[df_statsbomb_shot.id.isin(freeze_ids), cols_to_keep]
    # align the event column names with the freeze-frame column names
    .rename(columns={'position_id': 'player_position_id', 'position_name': 'player_position_name'})
    .assign(player_teammate=True, event_freeze_id=0)
)
# Remove shot-taker rows left over from a previous run of this cell before appending,
# so that re-running the cell does not add each shooter to the frame a second time.
# NOTE(review): assumes the raw freeze frames never use event_freeze_id 0 - confirm.
df_statsbomb_freeze = pd.concat([df_statsbomb_freeze[df_statsbomb_freeze.event_freeze_id != 0],
                                 df_shot_taker])

In [6]:
# Calculate features
# For every shot this computes:
#   area_shot - area of the shot taker's Voronoi cell
#   area_goal - area of the opposing goalkeeper's Voronoi cell (0 if no opposing goalkeeper in the frame)
#   n_angle   - number of other players inside the triangle between the shot taker and the two goal posts
statsbomb_pitch = Pitch()

# store the results in lists (one entry per shot, in the same order as shot_ids)
shot_ids = df_statsbomb_freeze.id.unique()
area_goal = []
area_shot = []
n_angle = []

# loop through the freeze frames create a voronoi and calculate the area around the goalkeeper/ shot taker
for shot_id in shot_ids:
    subset = df_statsbomb_freeze.loc[df_statsbomb_freeze.id == shot_id,
                                     ['x', 'y', 'player_teammate', 'event_freeze_id',
                                      'player_position_id','player_position_name']].copy()
    # team1 holds the polygons of the teammates (player_teammate True), team2 those of the opponents
    team1, team2 = statsbomb_pitch.voronoi(subset.x, subset.y, subset.player_teammate)
    # position of each player within their own team, used to index into team1/team2
    # (relies on voronoi returning the polygons of each team in input order)
    subset['rank'] = subset.groupby('player_teammate')['x'].cumcount()

    # goal keeper voronoi
    # Only the opposing goalkeeper counts: a teammate goalkeeper (or a goalkeeper taking the shot)
    # has a rank that refers to team1, so using it to index team2 would pick the wrong polygon.
    is_opposing_goalkeeper = ((subset.player_position_name == 'Goalkeeper') &
                              (subset.player_teammate == False))
    if is_opposing_goalkeeper.any():
        goalkeeper_voronoi = team2[subset.loc[is_opposing_goalkeeper, 'rank'].values[0]]
        area_goal.append(Polygon(goalkeeper_voronoi).area)
    else:
        area_goal.append(0)

    # shot voronoi (the shot taker is the row with event_freeze_id 0 and is always a teammate)
    is_shot_taker = subset.event_freeze_id == 0
    shot_taker_voronoi = team1[subset.loc[is_shot_taker, 'rank'].values[0]]
    area_shot.append(Polygon(shot_taker_voronoi).area)

    # calculate number of players in the angle to the goal
    shot_taker = subset.loc[is_shot_taker, ['x', 'y']]
    verts = np.zeros((3, 2))
    # take the scalar coordinates explicitly; assigning a one-element Series to an array element is deprecated
    verts[0, 0] = shot_taker.x.iloc[0]
    verts[0, 1] = shot_taker.y.iloc[0]
    verts[1:, :] = statsbomb_pitch.goal_right
    angle = Polygon(verts).buffer(0)  # the angle to the goal polygon, buffer added as sometimes shot is on the goal line
    players = MultiPoint(subset.loc[~is_shot_taker, ['x', 'y']].values.tolist())  # points for players
    intersection = players.intersection(angle)  # intersection between angle and players
    if intersection.is_empty:  # nobody between the shot taker and the goal
        n_players = 0
    elif isinstance(intersection, MultiPoint):  # calculate number of players
        # multi-part geometries are not sequences in Shapely 2, so count through .geoms
        n_players = len(intersection.geoms)
    elif isinstance(intersection, Point):
        n_players = 1
    else:
        n_players = 0
    n_angle.append(n_players)

# create a dataframe
df_freeze_features = pd.DataFrame({'id': shot_ids, 'area_shot': area_shot,
                                   'area_goal': area_goal, 'n_angle': n_angle})
df_freeze_features

Unnamed: 0,id,area_shot,area_goal,n_angle
0,65f16e50-7c5d-4293-b2fc-d20887a772f9,356.776792,112.696874,2
1,b0f73423-3990-45ae-9dda-3512c2d1aff3,412.906814,90.815372,1
2,13b1ddab-d22e-43d9-bfe4-12632fea1a27,49.781203,436.850984,3
3,391bfb74-07a6-4afe-9568-02a9b23f5bd4,83.147914,240.463241,1
4,5e55f5a5-954f-4cc4-ba6e-a9cf6d6e249e,580.715066,47.976154,1
...,...,...,...,...
22316,82c088ab-199e-425f-a473-7ba420875014,3031.035038,90.467650,2
22317,96e16373-a680-4b07-88b9-f69219692837,12.190909,16.617424,1
22318,ee862796-108e-4dbb-9020-fca7dca701bc,3.100000,1.775000,1
22319,9627b537-af10-4437-88de-bb9b17068768,42.758459,31.918768,1


In [7]:
# Add on goalkeeper position
# Location of the opposing goalkeeper in each freeze frame; shots without an opposing
# goalkeeper in the frame keep missing values after the left merge.
is_opposing_goalkeeper = ((df_statsbomb_freeze.player_position_name == 'Goalkeeper') &
                          (df_statsbomb_freeze.player_teammate == False))
gk_position = (df_statsbomb_freeze.loc[is_opposing_goalkeeper, ['id', 'x', 'y']]
               .rename(columns={'x': 'goalkeeper_x', 'y': 'goalkeeper_y'}))
# Drop goalkeeper columns left over from a previous run of this cell so that re-running it
# does not create suffixed duplicates (goalkeeper_x_x, goalkeeper_x_y, ...).
df_freeze_features = df_freeze_features.drop(columns=['goalkeeper_x', 'goalkeeper_y'], errors='ignore')
# validate='1:1' raises if any shot has more than one opposing goalkeeper row
df_freeze_features = df_freeze_features.merge(gk_position, how='left', on='id', validate='1:1')

In [8]:
# Write the per-shot freeze-frame features next to the source data, then summarise the columns
features_path = os.path.join(STATSBOMB, 'freeze_features.parquet')
df_freeze_features.to_parquet(features_path)
df_freeze_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22321 entries, 0 to 22320
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            22321 non-null  object 
 1   area_shot     22321 non-null  float64
 2   area_goal     22321 non-null  float64
 3   n_angle       22321 non-null  int64  
 4   goalkeeper_x  22262 non-null  float64
 5   goalkeeper_y  22262 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 1.2+ MB
