In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from mplsoccer.pitch import Pitch
import shapely.geometry as geom


Load the data

In [2]:
# Wyscout parquet files are expected under <current working directory>/data/wyscout.
cwd = os.getcwd()
WYSCOUT = os.path.join(cwd, 'data', 'wyscout')
# Read every table in one pass; the file name is '<table>.parquet' inside WYSCOUT.
(df_wyscout_event,
 df_wyscout_player,
 df_wyscout_match,
 df_wyscout_formation,
 df_wyscout_sub) = (pd.read_parquet(os.path.join(WYSCOUT, f'{table}.parquet'))
                    for table in ('event', 'player', 'match', 'formation', 'substitution'))

Add on team name

In [3]:
# Build a team_id -> team_name lookup by stacking the away and home columns of the
# match table under common column names, keeping the first name seen for each team_id.
df_team = pd.concat([
    df_wyscout_match[[f'{side}_team_id', f'{side}_team_name']]
    .rename(columns={f'{side}_team_id': 'team_id', f'{side}_team_name': 'team_name'})
    for side in ('away', 'home')
]).drop_duplicates('team_id')
# Left join so that every event row is kept even if its team is missing from the lookup.
df_wyscout_event = df_wyscout_event.merge(df_team, on='team_id', how='left')

Add on smart pass marker

In [4]:
# True where the sub-event is labelled exactly 'Smart pass' (missing labels give False).
df_wyscout_event['smart_pass'] = df_wyscout_event['subEventName'].eq('Smart pass')

Replace player_id = zero with null

In [5]:
# A player_id of 0 is treated as "no player": replace it with NaN.
# Assign the result back to the column rather than calling replace(inplace=True) on the
# attribute: the in-place form acts on a temporary Series, which raises a chained-assignment
# warning in recent pandas and leaves the frame untouched when Copy-on-Write is enabled.
df_wyscout_event['player_id'] = df_wyscout_event['player_id'].replace({0: np.nan})

Pitches for coordinate conversion

In [6]:
# Pitch in the Wyscout coordinate system; its ``dim`` attributes are used further down
# to build the cross / cut-back zones.
pitch_wyscout = Pitch(pitch_type='wyscout')
# Pitch of type 'uefa' - presumably the target system for coordinate conversion (TODO confirm).
pitch_statsperform = Pitch(figsize=(16, 9), pitch_type='uefa')

Rename Free Kick to Set Piece

In [7]:
# Relabel the 'Free Kick' event type as 'Set Piece' (only eventName is changed;
# subEventName values such as 'Free Kick' are left as they are).
# Assign the result back to the column rather than calling replace(inplace=True) on the
# attribute: the in-place form acts on a temporary Series, which raises a chained-assignment
# warning in recent pandas and leaves the frame untouched when Copy-on-Write is enabled.
df_wyscout_event['eventName'] = df_wyscout_event['eventName'].replace('Free Kick', 'Set Piece')

Add on a column for a pass attempt

In [8]:
# A pass attempt is any 'Pass' event, plus the set-piece sub-events that put the ball
# back into play with a pass-like action.
mask_pass = (
    df_wyscout_event['eventName'].eq('Pass')
    | df_wyscout_event['subEventName'].isin(['Throw in', 'Free Kick', 'Goal kick', 'Corner', 'Free kick cross'])
)
df_wyscout_event['pass_attempt'] = mask_pass
df_wyscout_event

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,through,fairplay,lost,neutral,won,accurate,not_accurate,team_name,smart_pass,pass_attempt
0,8,Simple pass,26010.0,1694390,Pass,4418,1H,1.255990,85.0,88178642,...,False,False,False,False,False,True,False,France,False,True
1,8,Simple pass,3682.0,1694390,Pass,4418,1H,2.351908,85.0,88178643,...,False,False,False,False,False,True,False,France,False,True
2,8,Simple pass,31528.0,1694390,Pass,4418,1H,3.241028,85.0,88178644,...,False,False,False,False,False,True,False,France,False,True
3,8,High pass,7855.0,1694390,Pass,4418,1H,6.033681,83.0,88178645,...,False,False,False,False,False,False,True,France,False,True
4,1,Ground defending duel,25437.0,1694390,Duel,4418,1H,13.143591,12.0,88178646,...,False,False,False,True,False,True,False,France,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087687,3,Free kick cross,70974.0,2576338,Set Piece,3193,2H,2870.982660,32.0,253567160,...,False,False,False,False,False,True,False,Genoa CFC,False,True
3087688,1,Ground loose ball duel,14745.0,2576338,Duel,3185,2H,2872.101142,13.0,253567161,...,False,False,False,True,False,True,False,Torino FC,False,False
3087689,1,Ground loose ball duel,413041.0,2576338,Duel,3193,2H,2872.990437,13.0,253567163,...,False,False,False,True,False,True,False,Genoa CFC,False,False
3087690,1,Air duel,20927.0,2576338,Duel,3185,2H,2874.049206,10.0,253567162,...,False,False,False,False,True,True,False,Torino FC,False,False


Add shot attempt boolean column

In [9]:
# Corners flagged as goals are counted as shots.
mask_corner_goal = df_wyscout_event['subEventName'].eq('Corner') & df_wyscout_event['goal'].eq(True)
# Shot attempt: a 'Shot' event, a free-kick shot, a penalty, or a corner flagged as a goal.
df_wyscout_event['shot'] = (
    df_wyscout_event['eventName'].eq('Shot')
    | df_wyscout_event['subEventName'].eq('Free kick shot')
    | df_wyscout_event['subEventName'].eq('Penalty')
    | mask_corner_goal
)
df_wyscout_event

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,fairplay,lost,neutral,won,accurate,not_accurate,team_name,smart_pass,pass_attempt,shot
0,8,Simple pass,26010.0,1694390,Pass,4418,1H,1.255990,85.0,88178642,...,False,False,False,False,True,False,France,False,True,False
1,8,Simple pass,3682.0,1694390,Pass,4418,1H,2.351908,85.0,88178643,...,False,False,False,False,True,False,France,False,True,False
2,8,Simple pass,31528.0,1694390,Pass,4418,1H,3.241028,85.0,88178644,...,False,False,False,False,True,False,France,False,True,False
3,8,High pass,7855.0,1694390,Pass,4418,1H,6.033681,83.0,88178645,...,False,False,False,False,False,True,France,False,True,False
4,1,Ground defending duel,25437.0,1694390,Duel,4418,1H,13.143591,12.0,88178646,...,False,False,True,False,True,False,France,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087687,3,Free kick cross,70974.0,2576338,Set Piece,3193,2H,2870.982660,32.0,253567160,...,False,False,False,False,True,False,Genoa CFC,False,True,False
3087688,1,Ground loose ball duel,14745.0,2576338,Duel,3185,2H,2872.101142,13.0,253567161,...,False,False,True,False,True,False,Torino FC,False,False,False
3087689,1,Ground loose ball duel,413041.0,2576338,Duel,3193,2H,2872.990437,13.0,253567163,...,False,False,True,False,True,False,Genoa CFC,False,False,False
3087690,1,Air duel,20927.0,2576338,Duel,3185,2H,2874.049206,10.0,253567162,...,False,False,False,True,True,False,Torino FC,False,False,False


Add on switch (StatsBomb definition = ball transitioned at least 50% of the pitch vertically). Note that I have already removed dodgy end locations near the corner flags so this works.

In [10]:
# A switch is a pass attempt whose y coordinate changes by at least 50 between start and end.
# Rows with a missing start or end y compare as False.
mask_switch = (
    (df_wyscout_event['end_y'] - df_wyscout_event['y']).abs().ge(50)
    & df_wyscout_event['pass_attempt']
)
df_wyscout_event['pass_switch'] = mask_switch

Add on cross (StatsBomb definition: See Appendix 6 of the Docs for Events)

In [11]:
# The cross zones are polygons in Wyscout coordinates, built from the mplsoccer pitch
# dimensions. All of them sit at the ``dim.right`` end of the pitch. The "right" zones are
# anchored on ``dim.bottom`` and the "left" zones on ``dim.top``; ``dim.bottom - <offset>``
# mirrors an offset that is measured from ``dim.top``.

# cross right side start: wide area within the last 30% of the pitch length, outside the
# penalty-area line, extending to the goal line beside the six-yard box
cross_right_start = np.array([[pitch_wyscout.dim.right,
                               pitch_wyscout.dim.bottom - pitch_wyscout.dim.six_yard_top],
                              [pitch_wyscout.dim.right,
                               pitch_wyscout.dim.bottom],
                              [pitch_wyscout.dim.right - pitch_wyscout.dim.pitch_length*0.3,
                               pitch_wyscout.dim.bottom],
                              [pitch_wyscout.dim.right - pitch_wyscout.dim.pitch_length*0.3,
                               pitch_wyscout.dim.bottom - pitch_wyscout.dim.penalty_area_top],
                              [pitch_wyscout.dim.right - pitch_wyscout.dim.penalty_area_length,
                               pitch_wyscout.dim.bottom - pitch_wyscout.dim.penalty_area_top]])
cross_right_start = geom.Polygon(cross_right_start)
# cross right side end: penalty-area depth, from the far (top) touchline to the mirrored
# six-yard line on the side the cross came from
cross_right_end = np.array([[pitch_wyscout.dim.right,
                             pitch_wyscout.dim.top],
                            [pitch_wyscout.dim.right - pitch_wyscout.dim.penalty_area_length,
                             pitch_wyscout.dim.top],
                            [pitch_wyscout.dim.right - pitch_wyscout.dim.penalty_area_length,
                             pitch_wyscout.dim.bottom - pitch_wyscout.dim.six_yard_top],
                            [pitch_wyscout.dim.right,
                             pitch_wyscout.dim.bottom - pitch_wyscout.dim.six_yard_top]])
cross_right_end = geom.Polygon(cross_right_end)
# cross left side start: mirror image of the right-side start zone
cross_left_start = np.array([[pitch_wyscout.dim.right,
                              pitch_wyscout.dim.top + pitch_wyscout.dim.six_yard_top],
                             [pitch_wyscout.dim.right,
                              pitch_wyscout.dim.top],
                             [pitch_wyscout.dim.right - pitch_wyscout.dim.pitch_length*0.3,
                              pitch_wyscout.dim.top],
                             [pitch_wyscout.dim.right - pitch_wyscout.dim.pitch_length*0.3,
                              pitch_wyscout.dim.top + pitch_wyscout.dim.penalty_area_top],
                             [pitch_wyscout.dim.right - pitch_wyscout.dim.penalty_area_length,
                              pitch_wyscout.dim.top + pitch_wyscout.dim.penalty_area_top]])
cross_left_start = geom.Polygon(cross_left_start)
# cross left side end: mirror image of the right-side end zone
cross_left_end = np.array([[pitch_wyscout.dim.right,
                            pitch_wyscout.dim.bottom],
                           [pitch_wyscout.dim.right - pitch_wyscout.dim.penalty_area_length,
                            pitch_wyscout.dim.bottom],
                           [pitch_wyscout.dim.right - pitch_wyscout.dim.penalty_area_length,
                            pitch_wyscout.dim.top + pitch_wyscout.dim.six_yard_top],
                           [pitch_wyscout.dim.right,
                            pitch_wyscout.dim.top + pitch_wyscout.dim.six_yard_top]])
cross_left_end = geom.Polygon(cross_left_end)
# find intersection of passes and cross polygons
df_pass = df_wyscout_event[df_wyscout_event.pass_attempt].copy()
# starting locations
# NOTE: iterate over ``.geoms`` - a MultiPoint is not directly iterable in Shapely 2.x
# (``.geoms`` is also available in Shapely 1.x, so this works on both).
pass_start = geom.MultiPoint(df_pass[['x', 'y']].values)
cross_start_left_intersects = [point.intersects(cross_left_start) for point in pass_start.geoms]
cross_start_right_intersects = [point.intersects(cross_right_start) for point in pass_start.geoms]
# end locations
pass_end = geom.MultiPoint(df_pass[['end_x', 'end_y']].values)
cross_end_left_intersects = [point.intersects(cross_left_end) for point in pass_end.geoms]
cross_end_right_intersects = [point.intersects(cross_right_end) for point in pass_end.geoms]
# add cross marker to event data: a pass is a cross when it starts in a flank's start zone
# and ends in the same flank's end zone
mask_cross = ((np.array(cross_start_left_intersects) & np.array(cross_end_left_intersects)) |
              (np.array(cross_start_right_intersects) & np.array(cross_end_right_intersects)))
# map back to the full event table through the event id
cross_ids = df_pass[mask_cross].id
df_wyscout_event['cross'] = df_wyscout_event.id.isin(cross_ids)
df_wyscout_event

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,neutral,won,accurate,not_accurate,team_name,smart_pass,pass_attempt,shot,pass_switch,cross
0,8,Simple pass,26010.0,1694390,Pass,4418,1H,1.255990,85.0,88178642,...,False,False,True,False,France,False,True,False,False,False
1,8,Simple pass,3682.0,1694390,Pass,4418,1H,2.351908,85.0,88178643,...,False,False,True,False,France,False,True,False,False,False
2,8,Simple pass,31528.0,1694390,Pass,4418,1H,3.241028,85.0,88178644,...,False,False,True,False,France,False,True,False,False,False
3,8,High pass,7855.0,1694390,Pass,4418,1H,6.033681,83.0,88178645,...,False,False,False,True,France,False,True,False,False,False
4,1,Ground defending duel,25437.0,1694390,Duel,4418,1H,13.143591,12.0,88178646,...,True,False,True,False,France,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087687,3,Free kick cross,70974.0,2576338,Set Piece,3193,2H,2870.982660,32.0,253567160,...,False,False,True,False,Genoa CFC,False,True,False,False,False
3087688,1,Ground loose ball duel,14745.0,2576338,Duel,3185,2H,2872.101142,13.0,253567161,...,True,False,True,False,Torino FC,False,False,False,False,False
3087689,1,Ground loose ball duel,413041.0,2576338,Duel,3193,2H,2872.990437,13.0,253567163,...,True,False,True,False,Genoa CFC,False,False,False,False,False
3087690,1,Air duel,20927.0,2576338,Duel,3185,2H,2874.049206,10.0,253567162,...,False,True,True,False,Torino FC,False,False,False,False,False


Add on cut-back (StatsBomb definition: See Appendix 5 of the Docs for Events)

In [12]:
# The cut-back zones are polygons in Wyscout coordinates, built from the mplsoccer pitch
# dimensions. The "right" zones are anchored on ``dim.bottom`` and the "left" zones on
# ``dim.top``; ``dim.bottom - <offset>`` mirrors an offset that is measured from ``dim.top``.

# right side start: six-yard-length strip along the goal line, from the bottom touchline
# to the mirrored six-yard line
cut_right_start = np.array([[pitch_wyscout.dim.right, pitch_wyscout.dim.bottom - pitch_wyscout.dim.six_yard_top],
                            [pitch_wyscout.dim.right, pitch_wyscout.dim.bottom],
                            [pitch_wyscout.dim.right - pitch_wyscout.dim.six_yard_length, pitch_wyscout.dim.bottom],
                            [pitch_wyscout.dim.right - pitch_wyscout.dim.six_yard_length,
                             pitch_wyscout.dim.bottom - pitch_wyscout.dim.six_yard_top]])
cut_right_start = geom.Polygon(cut_right_start)
# right side end: band between the six-yard-length line and the penalty-area-length line
cut_right_end = np.array([[pitch_wyscout.dim.right - pitch_wyscout.dim.six_yard_length,
                           pitch_wyscout.dim.bottom - pitch_wyscout.dim.penalty_area_top],
                          [pitch_wyscout.dim.right - pitch_wyscout.dim.penalty_area_length,
                           pitch_wyscout.dim.bottom - pitch_wyscout.dim.penalty_area_top],
                          [pitch_wyscout.dim.right - pitch_wyscout.dim.penalty_area_length,
                           pitch_wyscout.dim.top + pitch_wyscout.dim.six_yard_top],
                          [pitch_wyscout.dim.right - pitch_wyscout.dim.six_yard_length,
                           pitch_wyscout.dim.top + pitch_wyscout.dim.six_yard_top]])
cut_right_end = geom.Polygon(cut_right_end)
# left side start: mirror image of the right-side start zone
cut_left_start = np.array([[pitch_wyscout.dim.right, pitch_wyscout.dim.top + pitch_wyscout.dim.six_yard_top],
                           [pitch_wyscout.dim.right, pitch_wyscout.dim.top],
                           [pitch_wyscout.dim.right - pitch_wyscout.dim.six_yard_length, pitch_wyscout.dim.top],
                           [pitch_wyscout.dim.right - pitch_wyscout.dim.six_yard_length,
                            pitch_wyscout.dim.top + pitch_wyscout.dim.six_yard_top]])
cut_left_start = geom.Polygon(cut_left_start)
# left side end: mirror image of the right-side end zone
cut_left_end = np.array([[pitch_wyscout.dim.right - pitch_wyscout.dim.six_yard_length,
                          pitch_wyscout.dim.top + pitch_wyscout.dim.penalty_area_top],
                         [pitch_wyscout.dim.right - pitch_wyscout.dim.penalty_area_length,
                          pitch_wyscout.dim.top + pitch_wyscout.dim.penalty_area_top],
                         [pitch_wyscout.dim.right - pitch_wyscout.dim.penalty_area_length,
                          pitch_wyscout.dim.bottom - pitch_wyscout.dim.six_yard_top],
                         [pitch_wyscout.dim.right - pitch_wyscout.dim.six_yard_length,
                          pitch_wyscout.dim.bottom - pitch_wyscout.dim.six_yard_top]])
cut_left_end = geom.Polygon(cut_left_end)
# find intersection of passes and cut back polygons
# (pass_start / pass_end / df_pass come from the cross cell above)
# NOTE: iterate over ``.geoms`` - a MultiPoint is not directly iterable in Shapely 2.x
# (``.geoms`` is also available in Shapely 1.x, so this works on both).
cut_start_left_intersects = [point.intersects(cut_left_start) for point in pass_start.geoms]
cut_start_right_intersects = [point.intersects(cut_right_start) for point in pass_start.geoms]
# end locations
cut_end_left_intersects = [point.intersects(cut_left_end) for point in pass_end.geoms]
cut_end_right_intersects = [point.intersects(cut_right_end) for point in pass_end.geoms]
# add cut back marker to event data: start and end must be in zones of the same side
mask_cut = ((np.array(cut_start_left_intersects) & np.array(cut_end_left_intersects)) |
            (np.array(cut_start_right_intersects) & np.array(cut_end_right_intersects)))
# not high and comes from a normal pass (not corner etc.)
cut_ids = df_pass[((mask_cut) & (df_pass.high == False) & (df_pass.eventName == 'Pass'))].id
df_wyscout_event['cut_back'] = df_wyscout_event.id.isin(cut_ids)
df_wyscout_event

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,won,accurate,not_accurate,team_name,smart_pass,pass_attempt,shot,pass_switch,cross,cut_back
0,8,Simple pass,26010.0,1694390,Pass,4418,1H,1.255990,85.0,88178642,...,False,True,False,France,False,True,False,False,False,False
1,8,Simple pass,3682.0,1694390,Pass,4418,1H,2.351908,85.0,88178643,...,False,True,False,France,False,True,False,False,False,False
2,8,Simple pass,31528.0,1694390,Pass,4418,1H,3.241028,85.0,88178644,...,False,True,False,France,False,True,False,False,False,False
3,8,High pass,7855.0,1694390,Pass,4418,1H,6.033681,83.0,88178645,...,False,False,True,France,False,True,False,False,False,False
4,1,Ground defending duel,25437.0,1694390,Duel,4418,1H,13.143591,12.0,88178646,...,False,True,False,France,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087687,3,Free kick cross,70974.0,2576338,Set Piece,3193,2H,2870.982660,32.0,253567160,...,False,True,False,Genoa CFC,False,True,False,False,False,False
3087688,1,Ground loose ball duel,14745.0,2576338,Duel,3185,2H,2872.101142,13.0,253567161,...,False,True,False,Torino FC,False,False,False,False,False,False
3087689,1,Ground loose ball duel,413041.0,2576338,Duel,3193,2H,2872.990437,13.0,253567163,...,False,True,False,Genoa CFC,False,False,False,False,False,False
3087690,1,Air duel,20927.0,2576338,Duel,3185,2H,2874.049206,10.0,253567162,...,True,True,False,Torino FC,False,False,False,False,False,False


Add goal scored (excluding shootouts)

In [13]:
# A goal counts when it is either
#   * a shot flagged as a goal outside matchPeriod 'P' (the period excluded as the shootout), or
#   * an own goal (taken as-is, whatever the period).
df_wyscout_event['goal_scored_excl_shootout'] = (
    (df_wyscout_event['goal']
     & df_wyscout_event['matchPeriod'].ne('P')
     & df_wyscout_event['shot'])
    | df_wyscout_event['own_goal']
)
df_wyscout_event

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,accurate,not_accurate,team_name,smart_pass,pass_attempt,shot,pass_switch,cross,cut_back,goal_scored_excl_shootout
0,8,Simple pass,26010.0,1694390,Pass,4418,1H,1.255990,85.0,88178642,...,True,False,France,False,True,False,False,False,False,False
1,8,Simple pass,3682.0,1694390,Pass,4418,1H,2.351908,85.0,88178643,...,True,False,France,False,True,False,False,False,False,False
2,8,Simple pass,31528.0,1694390,Pass,4418,1H,3.241028,85.0,88178644,...,True,False,France,False,True,False,False,False,False,False
3,8,High pass,7855.0,1694390,Pass,4418,1H,6.033681,83.0,88178645,...,False,True,France,False,True,False,False,False,False,False
4,1,Ground defending duel,25437.0,1694390,Duel,4418,1H,13.143591,12.0,88178646,...,True,False,France,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087687,3,Free kick cross,70974.0,2576338,Set Piece,3193,2H,2870.982660,32.0,253567160,...,True,False,Genoa CFC,False,True,False,False,False,False,False
3087688,1,Ground loose ball duel,14745.0,2576338,Duel,3185,2H,2872.101142,13.0,253567161,...,True,False,Torino FC,False,False,False,False,False,False,False
3087689,1,Ground loose ball duel,413041.0,2576338,Duel,3193,2H,2872.990437,13.0,253567163,...,True,False,Genoa CFC,False,False,False,False,False,False,False
3087690,1,Air duel,20927.0,2576338,Duel,3185,2H,2874.049206,10.0,253567162,...,True,False,Torino FC,False,False,False,False,False,False,False


Check for missing Goals - there is one Kevin de Bruyne goal missing in the Wyscout event data

In [14]:
# Count the flagged goals per match, then right-join onto the match table so that matches
# without any flagged goal are kept (their missing values become 0 via fillna).
goals_per_game = (
    df_wyscout_event.loc[df_wyscout_event['goal_scored_excl_shootout']]
    .groupby('match_id')['match_id']
    .count()
    .rename('Goals')
    .reset_index()
    .merge(df_wyscout_match[['home_score', 'label', 'away_score', 'match_id', 'kick_off']],
           how='right', on='match_id')
    .fillna(0)
)
# only one game missing a goal it's a Kevin de Bruyne goal and it's not in the event data
goals_per_game.loc[goals_per_game['Goals'].ne(goals_per_game['home_score'] + goals_per_game['away_score'])]

Unnamed: 0,match_id,Goals,home_score,label,away_score,kick_off
313,2499781,0.0,0.0,"Chelsea - Manchester City, 0 - 1",1.0,2017-09-30 18:30:00


**Create a pass_height_name feature.**

Assumptions:
* 1) headed passes are high (roughly 60% are in the StatsBomb data)
* 2) smart passes with the through ball tag are Ground/ low. It says in the Wyscout docs that through ball is added to smart pass if the pass is on the ground or it’s over the heads of the opposite players, but it’s on short distance – 5-10 meters.
* 3) smart passes without through balls are high passes
* 4) hand passes are ground/low (hopefully launch catches high passes)
* 5) throw-in / goal kick are high
* 6) free-kick, crosses and corners are low unless high=True

In [15]:
# The rules below are applied in order and later rules overwrite earlier ones, so the
# sequence matters (e.g. a smart through ball ends up 'Ground/ Low Pass' even if an earlier
# rule marked it high). Events matched by no rule keep a missing pass_height_name.

# assumption made here that head passes are high passes (in StatsBomb roughly 60% are)
df_wyscout_event.loc[df_wyscout_event.subEventName.isin(['High pass', 'Launch', 'Head pass']),
                     'pass_height_name'] = 'High Pass'
# any 'Pass' event carrying the high tag
df_wyscout_event.loc[(df_wyscout_event.high == True) & (df_wyscout_event.eventName == 'Pass'),
                     'pass_height_name'] = 'High Pass'
# if smart pass and not a through ball assumed high
df_wyscout_event.loc[(df_wyscout_event.subEventName == 'Smart pass') & (df_wyscout_event.through == False),
                     'pass_height_name'] = 'High Pass'
df_wyscout_event.loc[df_wyscout_event.subEventName == 'Simple pass',
                     'pass_height_name'] = 'Ground/ Low Pass'
# corners, free kicks and crosses are low unless tagged high
df_wyscout_event.loc[(df_wyscout_event.subEventName.isin(['Corner', 'Free kick cross', 'Free Kick', 'Cross'])) &
                     (df_wyscout_event.high == False),
                     'pass_height_name'] = 'Ground/ Low Pass'
# throw-ins and goal kicks are assumed high
df_wyscout_event.loc[df_wyscout_event.subEventName.isin(['Throw in', 'Goal kick']),
                     'pass_height_name'] = 'High Pass'
# ... and high when tagged high. 'Free Kick' is included here so that the same sub-events
# are covered as in the low rule above; previously a high 'Free Kick' matched neither rule
# and was left without a pass height.
df_wyscout_event.loc[(df_wyscout_event.subEventName.isin(['Corner', 'Free kick cross', 'Free Kick', 'Cross'])) &
                     (df_wyscout_event.high),
                     'pass_height_name'] = 'High Pass'
# assumption made here that smart through balls are ground/ low
df_wyscout_event.loc[(df_wyscout_event.subEventName == 'Smart pass') & (df_wyscout_event.through),
                     'pass_height_name'] = 'Ground/ Low Pass'
# hand passes are assumed ground/ low
df_wyscout_event.loc[(df_wyscout_event.subEventName == 'Hand pass'),
                     'pass_height_name'] = 'Ground/ Low Pass'
df_wyscout_event

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,not_accurate,team_name,smart_pass,pass_attempt,shot,pass_switch,cross,cut_back,goal_scored_excl_shootout,pass_height_name
0,8,Simple pass,26010.0,1694390,Pass,4418,1H,1.255990,85.0,88178642,...,False,France,False,True,False,False,False,False,False,Ground/ Low Pass
1,8,Simple pass,3682.0,1694390,Pass,4418,1H,2.351908,85.0,88178643,...,False,France,False,True,False,False,False,False,False,Ground/ Low Pass
2,8,Simple pass,31528.0,1694390,Pass,4418,1H,3.241028,85.0,88178644,...,False,France,False,True,False,False,False,False,False,Ground/ Low Pass
3,8,High pass,7855.0,1694390,Pass,4418,1H,6.033681,83.0,88178645,...,True,France,False,True,False,False,False,False,False,High Pass
4,1,Ground defending duel,25437.0,1694390,Duel,4418,1H,13.143591,12.0,88178646,...,False,France,False,False,False,False,False,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087687,3,Free kick cross,70974.0,2576338,Set Piece,3193,2H,2870.982660,32.0,253567160,...,False,Genoa CFC,False,True,False,False,False,False,False,High Pass
3087688,1,Ground loose ball duel,14745.0,2576338,Duel,3185,2H,2872.101142,13.0,253567161,...,False,Torino FC,False,False,False,False,False,False,False,
3087689,1,Ground loose ball duel,413041.0,2576338,Duel,3193,2H,2872.990437,13.0,253567163,...,False,Genoa CFC,False,False,False,False,False,False,False,
3087690,1,Air duel,20927.0,2576338,Duel,3185,2H,2874.049206,10.0,253567162,...,False,Torino FC,False,False,False,False,False,False,False,


Separate out names in player dataset

In [16]:
# Build fullName from the raw first/last name columns only once. firstName and lastName are
# overwritten below, so rebuilding fullName on a re-run of this cell would lose the middle names.
if 'fullName' not in df_wyscout_player.columns:
    df_wyscout_player['fullName'] = (df_wyscout_player.firstName + ' ' + df_wyscout_player.lastName).str.strip()
# Split the full name on single spaces: first token -> firstName, last token -> lastName,
# everything in between -> middleName. Missing names (non-list values) become None.
player_name_series = df_wyscout_player.fullName.str.split(' ')
df_wyscout_player['firstName'] = player_name_series.apply(lambda x: x[0] if isinstance(x, list) else None)
df_wyscout_player['middleName'] = player_name_series.apply(lambda x: ' '.join(x[1:-1]) if isinstance(x, list) else None)
df_wyscout_player['lastName'] = player_name_series.apply(lambda x: x[-1] if isinstance(x, list) else None)
df_wyscout_player['middleName'] = df_wyscout_player['middleName'].str.strip()
# Name = first [+ middle] + last, without a double space when there is no middle name.
df_wyscout_player['Name'] = ((df_wyscout_player['firstName'] + ' ' + df_wyscout_player['middleName']).str.strip()
                             + ' ' + df_wyscout_player['lastName'])
# A single-token full name has the same token as both firstName and lastName (left unchanged
# here), which would otherwise give a doubled Name such as 'X X'; use the token once instead.
mask_single_token = player_name_series.str.len() == 1
df_wyscout_player.loc[mask_single_token, 'Name'] = df_wyscout_player.loc[mask_single_token, 'firstName']
df_wyscout_player

Unnamed: 0,weight,firstName,middleName,lastName,currentTeamId,birthDate,height,player_id,foot,shortName,...,passportArea_alpha2code,role_code2,role_code3,role_name,birthArea_name,birthArea_id,birthArea_alpha3code,birthArea_alpha2code,fullName,Name
0,78,Harun,,Tekin,4502.0,1989-06-17,187,32777,right,H. Tekin,...,TR,GK,GKP,Goalkeeper,Turkey,792.0,TUR,TR,Harun Tekin,Harun Tekin
1,73,Malang,,Sarr,3775.0,1999-01-23,182,393228,left,M. Sarr,...,SN,DF,DEF,Defender,France,250.0,FRA,FR,Malang Sarr,Malang Sarr
2,72,Over,,Mandanda,3772.0,1998-10-26,176,393230,,O. Mandanda,...,FR,GK,GKP,Goalkeeper,France,250.0,FRA,FR,Over Mandanda,Over Mandanda
3,82,Alfred,John Momar,N'Diaye,683.0,1990-03-06,187,32793,right,A. N'Diaye,...,SN,MD,MID,Midfielder,France,250.0,FRA,FR,Alfred John Momar N'Diaye,Alfred John Momar N'Diaye
4,84,Ibrahima,,Konaté,2975.0,1999-05-25,192,393247,right,I. Konaté,...,FR,DF,DEF,Defender,France,250.0,FRA,FR,Ibrahima Konaté,Ibrahima Konaté
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3598,72,Ali,,Maâloul,16041.0,1990-01-01,175,120839,left,A. Maâloul,...,TN,DF,DEF,Defender,Tunisia,788.0,TUN,TN,Ali Maâloul,Ali Maâloul
3599,76,Carlos,Alberto Cáceda,Oyaguez,15591.0,1991-09-27,183,114736,right,C. Cáceda,...,PE,GK,GKP,Goalkeeper,Peru,604.0,PER,PE,Carlos Alberto Cáceda Oyaguez,Carlos Alberto Cáceda Oyaguez
3600,78,Miguel,Gianpierre Araujo,Blanco,12072.0,1994-10-24,179,114908,right,M. Araujo,...,PE,DF,DEF,Defender,Peru,604.0,PER,PE,Miguel Gianpierre Araujo Blanco,Miguel Gianpierre Araujo Blanco
3601,70,Ahmed,Reda,Tagnaouti,16183.0,1996-04-05,182,285583,right,A. Tagnaouti,...,MA,GK,GKP,Goalkeeper,Morocco,504.0,MAR,MA,Ahmed Reda Tagnaouti,Ahmed Reda Tagnaouti


Add on the player name/ foot

In [17]:
# Treat the placeholder strings 'null' and '' in the foot column as missing.
# Assign the result back to the column rather than calling replace(inplace=True) on the
# attribute: the in-place form acts on a temporary Series, which raises a chained-assignment
# warning in recent pandas and leaves the frame untouched when Copy-on-Write is enabled.
df_wyscout_player['foot'] = df_wyscout_player['foot'].replace({'null': None, '': None})
# Left join the player name parts and foot onto every event; events whose player_id has
# no match in the player table keep missing values in these columns.
df_wyscout_event = df_wyscout_event.merge(df_wyscout_player[['player_id',
                                                             'firstName', 'middleName', 'lastName', 'Name', 'foot']],
                                          how='left', on='player_id')
df_wyscout_event

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,pass_switch,cross,cut_back,goal_scored_excl_shootout,pass_height_name,firstName,middleName,lastName,Name,foot
0,8,Simple pass,26010.0,1694390,Pass,4418,1H,1.255990,85.0,88178642,...,False,False,False,False,Ground/ Low Pass,Olivier,,Giroud,Olivier Giroud,left
1,8,Simple pass,3682.0,1694390,Pass,4418,1H,2.351908,85.0,88178643,...,False,False,False,False,Ground/ Low Pass,Antoine,,Griezmann,Antoine Griezmann,left
2,8,Simple pass,31528.0,1694390,Pass,4418,1H,3.241028,85.0,88178644,...,False,False,False,False,Ground/ Low Pass,N'Golo,,Kanté,N'Golo Kanté,right
3,8,High pass,7855.0,1694390,Pass,4418,1H,6.033681,83.0,88178645,...,False,False,False,False,High Pass,Laurent,,Koscielny,Laurent Koscielny,right
4,1,Ground defending duel,25437.0,1694390,Duel,4418,1H,13.143591,12.0,88178646,...,False,False,False,False,,Blaise,,Matuidi,Blaise Matuidi,left
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087687,3,Free kick cross,70974.0,2576338,Set Piece,3193,2H,2870.982660,32.0,253567160,...,False,False,False,False,High Pass,Iuri,José Picanço,Medeiros,Iuri José Picanço Medeiros,left
3087688,1,Ground loose ball duel,14745.0,2576338,Duel,3185,2H,2872.101142,13.0,253567161,...,False,False,False,False,,Cristian,,Molinaro,Cristian Molinaro,left
3087689,1,Ground loose ball duel,413041.0,2576338,Duel,3193,2H,2872.990437,13.0,253567163,...,False,False,False,False,,Jawad,El,Yamiq,Jawad El Yamiq,right
3087690,1,Air duel,20927.0,2576338,Duel,3185,2H,2874.049206,10.0,253567162,...,False,False,False,False,,Lorenzo,De,Silvestri,Lorenzo De Silvestri,right


Create a pass_technique name.

Assumptions:

* outswinging, different foot as the side of the pitch
* inswinging, same foot as the side of the pitch
* missing/ both foot = inswinging
* each player takes the kick with the foot in the player table.

This misses straight corner kicks and may not be 100% correct.

In [18]:
# Building blocks: corner events, the taker's foot from the player table, and which half
# of the pitch width (y relative to the centre line) the event starts in.
mask_corner = df_wyscout_event['subEventName'].eq('Corner')
mask_right = df_wyscout_event['foot'].eq('right')
mask_left = df_wyscout_event['foot'].eq('left')
mask_right_side = df_wyscout_event['y'].ge(pitch_wyscout.dim.center_width)
mask_left_side = df_wyscout_event['y'].lt(pitch_wyscout.dim.center_width)
mask_both = df_wyscout_event['foot'].eq('both')
mask_missing = df_wyscout_event['foot'].isna()
# Same foot as side (or foot 'both' / unknown) -> inswinging; opposite foot and side -> outswinging.
# NOTE(review): conventionally a right-footed corner from the right-hand side curls away from
# goal (an outswinger) - confirm this mapping against the y-axis orientation before relying on it.
mask_inswing = mask_corner & (
    (mask_left & mask_left_side)
    | (mask_right & mask_right_side)
    | mask_both
    | mask_missing
)
mask_outswing = mask_corner & (
    (mask_left & mask_right_side)
    | (mask_right & mask_left_side)
)
df_wyscout_event.loc[mask_inswing, 'pass_technique_name'] = 'Inswinging'
df_wyscout_event.loc[mask_outswing, 'pass_technique_name'] = 'Outswinging'
# Applied last, so the through tag overrides any corner label set above.
df_wyscout_event.loc[df_wyscout_event['through'], 'pass_technique_name'] = 'Through Ball'
df_wyscout_event

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,cross,cut_back,goal_scored_excl_shootout,pass_height_name,firstName,middleName,lastName,Name,foot,pass_technique_name
0,8,Simple pass,26010.0,1694390,Pass,4418,1H,1.255990,85.0,88178642,...,False,False,False,Ground/ Low Pass,Olivier,,Giroud,Olivier Giroud,left,
1,8,Simple pass,3682.0,1694390,Pass,4418,1H,2.351908,85.0,88178643,...,False,False,False,Ground/ Low Pass,Antoine,,Griezmann,Antoine Griezmann,left,
2,8,Simple pass,31528.0,1694390,Pass,4418,1H,3.241028,85.0,88178644,...,False,False,False,Ground/ Low Pass,N'Golo,,Kanté,N'Golo Kanté,right,
3,8,High pass,7855.0,1694390,Pass,4418,1H,6.033681,83.0,88178645,...,False,False,False,High Pass,Laurent,,Koscielny,Laurent Koscielny,right,
4,1,Ground defending duel,25437.0,1694390,Duel,4418,1H,13.143591,12.0,88178646,...,False,False,False,,Blaise,,Matuidi,Blaise Matuidi,left,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087687,3,Free kick cross,70974.0,2576338,Set Piece,3193,2H,2870.982660,32.0,253567160,...,False,False,False,High Pass,Iuri,José Picanço,Medeiros,Iuri José Picanço Medeiros,left,
3087688,1,Ground loose ball duel,14745.0,2576338,Duel,3185,2H,2872.101142,13.0,253567161,...,False,False,False,,Cristian,,Molinaro,Cristian Molinaro,left,
3087689,1,Ground loose ball duel,413041.0,2576338,Duel,3193,2H,2872.990437,13.0,253567163,...,False,False,False,,Jawad,El,Yamiq,Jawad El Yamiq,right,
3087690,1,Air duel,20927.0,2576338,Duel,3185,2H,2874.049206,10.0,253567162,...,False,False,False,,Lorenzo,De,Silvestri,Lorenzo De Silvestri,right,


Fix a corner at the wrong end

In [19]:
# Corners recorded at (x=0, y=100) are moved to x=100; y is left untouched.
df_wyscout_event.loc[
    df_wyscout_event['subEventName'].eq('Corner')
    & df_wyscout_event['x'].eq(0)
    & df_wyscout_event['y'].eq(100),
    'x'
] = 100

Correct a few shots that appear to be on the wrong side of the pitch. I haven't checked, but these seem to be too far away to be real shots. Especially as some are goals

In [20]:
# Shots recorded with x < 34 are mirrored along the pitch length (x -> 100 - x).
# Only x is changed; y is left as recorded.
mask_correct_shot = df_wyscout_event['shot'] & df_wyscout_event['x'].lt(34)
df_wyscout_event.loc[mask_correct_shot, 'x'] = df_wyscout_event.loc[mask_correct_shot, 'x'].rsub(100)

Fast attack, win ball in own third, shoot in last quarter in 7-25 seconds

In [21]:
# Candidate ball wins: defensive duels, air duels, save attempts or interceptions that
# happen in the first third of the pitch (x < 33.4). The event's own team_id is recorded;
# the duel outcome is not checked here.
mask_defence_win = ((df_wyscout_event.subEventName.isin(['Ground defending duel', 'Air duel', 'Save attempt'])) |
                    (df_wyscout_event.interception)) & (df_wyscout_event.x < 33.4)
df_wyscout_event.loc[mask_defence_win, 'defence_win'] = df_wyscout_event.loc[mask_defence_win, 'team_id']
df_wyscout_event.loc[mask_defence_win, 'defence_sec'] = df_wyscout_event.loc[mask_defence_win, 'eventSec']
# Carry the most recent ball win (team and time) forward to the following events, without
# crossing a match or period boundary.
group_match = df_wyscout_event.groupby(['match_id', 'matchPeriod'])
df_wyscout_event[['defence_win', 'defence_sec']] = group_match[['defence_win', 'defence_sec']].ffill()
# Fast break: a shot in the last quarter (x > 75) by the team that last won the ball in its
# own third, taken 7-25 seconds after that win. The 7-second lower bound comes from the
# definition in the heading above; previously only the 25-second upper bound was enforced.
# Events with no preceding ball win have a missing elapsed time and are never flagged.
mask_fast = (((df_wyscout_event.eventSec - df_wyscout_event.defence_sec).between(7, 25)) &
             (df_wyscout_event.x > 75) &
             (df_wyscout_event.shot) & (df_wyscout_event.team_id == df_wyscout_event.defence_win))
df_wyscout_event['fast_break'] = mask_fast
df_wyscout_event

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,pass_height_name,firstName,middleName,lastName,Name,foot,pass_technique_name,defence_win,defence_sec,fast_break
0,8,Simple pass,26010.0,1694390,Pass,4418,1H,1.255990,85.0,88178642,...,Ground/ Low Pass,Olivier,,Giroud,Olivier Giroud,left,,,,False
1,8,Simple pass,3682.0,1694390,Pass,4418,1H,2.351908,85.0,88178643,...,Ground/ Low Pass,Antoine,,Griezmann,Antoine Griezmann,left,,,,False
2,8,Simple pass,31528.0,1694390,Pass,4418,1H,3.241028,85.0,88178644,...,Ground/ Low Pass,N'Golo,,Kanté,N'Golo Kanté,right,,,,False
3,8,High pass,7855.0,1694390,Pass,4418,1H,6.033681,83.0,88178645,...,High Pass,Laurent,,Koscielny,Laurent Koscielny,right,,,,False
4,1,Ground defending duel,25437.0,1694390,Duel,4418,1H,13.143591,12.0,88178646,...,,Blaise,,Matuidi,Blaise Matuidi,left,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087687,3,Free kick cross,70974.0,2576338,Set Piece,3193,2H,2870.982660,32.0,253567160,...,High Pass,Iuri,José Picanço,Medeiros,Iuri José Picanço Medeiros,left,,3185.0,2823.371314,False
3087688,1,Ground loose ball duel,14745.0,2576338,Duel,3185,2H,2872.101142,13.0,253567161,...,,Cristian,,Molinaro,Cristian Molinaro,left,,3185.0,2823.371314,False
3087689,1,Ground loose ball duel,413041.0,2576338,Duel,3193,2H,2872.990437,13.0,253567163,...,,Jawad,El,Yamiq,Jawad El Yamiq,right,,3185.0,2823.371314,False
3087690,1,Air duel,20927.0,2576338,Duel,3185,2H,2874.049206,10.0,253567162,...,,Lorenzo,De,Silvestri,Lorenzo De Silvestri,right,,3185.0,2874.049206,False


Flag events that happen within 10 seconds of a corner or free kick, or within 20 seconds of a throw-in

In [22]:
# For every event, measure the seconds elapsed since the most recent corner, throw-in
# and free kick in the same match period, keep only values inside the window for that
# set piece, and label the event with the most recent set piece (play_type).
#
# The grouping keys are built here and every column is recomputed from eventSec/team_id,
# so the cell does not depend on a groupby object from an earlier cell and gives the
# same result when it is run more than once.
set_piece_groups = {'corner': ['Corner'],
                    'throw_in': ['Throw in'],
                    'free_kick': ['Free kick cross', 'Free kick shot']}
set_piece_window_sec = {'corner': 10, 'throw_in': 20, 'free_kick': 10}
period_keys = [df_wyscout_event['match_id'], df_wyscout_event['matchPeriod']]
for name, sub_events in set_piece_groups.items():
    mask = df_wyscout_event.subEventName.isin(sub_events)
    # time/team of the latest matching set piece, carried forward within the period
    last_sec = df_wyscout_event['eventSec'].where(mask).groupby(period_keys).ffill()
    last_team = df_wyscout_event['team_id'].where(mask).groupby(period_keys).ffill()
    elapsed = df_wyscout_event['eventSec'] - last_sec
    # outside the window (or no earlier set piece) -> NaN
    df_wyscout_event[f'{name}_sec'] = elapsed.where(elapsed <= set_piece_window_sec[name])
    df_wyscout_event[f'{name}_team'] = last_team

# play_type = set piece with the smallest elapsed time; ties resolve in the column order below.
# idxmin is only evaluated on rows that have at least one value, because newer pandas
# versions deprecate calling it on all-NaN rows; the remaining rows stay NaN on assignment.
sec_cols = ['throw_in_sec', 'free_kick_sec', 'corner_sec']
window_secs = df_wyscout_event[sec_cols].rename(columns=lambda col: col[:-4])
in_window = window_secs.notna().any(axis=1)
df_wyscout_event['play_type'] = window_secs[in_window].idxmin(axis=1)
# if throw-in and defensive set to null
mask_defensive = ((df_wyscout_event.play_type == 'throw_in') &
                  (df_wyscout_event['throw_in_team'] != df_wyscout_event.team_id))
df_wyscout_event.loc[mask_defensive, 'play_type'] = np.nan
df_wyscout_event

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,defence_win,defence_sec,fast_break,corner_sec,corner_team,throw_in_sec,throw_in_team,free_kick_sec,free_kick_team,play_type
0,8,Simple pass,26010.0,1694390,Pass,4418,1H,1.255990,85.0,88178642,...,,,False,,,,,,,
1,8,Simple pass,3682.0,1694390,Pass,4418,1H,2.351908,85.0,88178643,...,,,False,,,,,,,
2,8,Simple pass,31528.0,1694390,Pass,4418,1H,3.241028,85.0,88178644,...,,,False,,,,,,,
3,8,High pass,7855.0,1694390,Pass,4418,1H,6.033681,83.0,88178645,...,,,False,,,,,,,
4,1,Ground defending duel,25437.0,1694390,Duel,4418,1H,13.143591,12.0,88178646,...,,,False,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087687,3,Free kick cross,70974.0,2576338,Set Piece,3193,2H,2870.982660,32.0,253567160,...,3185.0,2823.371314,False,,3185.0,,3193.0,0.000000,3193.0,free_kick
3087688,1,Ground loose ball duel,14745.0,2576338,Duel,3185,2H,2872.101142,13.0,253567161,...,3185.0,2823.371314,False,,3185.0,,3193.0,1.118482,3193.0,free_kick
3087689,1,Ground loose ball duel,413041.0,2576338,Duel,3193,2H,2872.990437,13.0,253567163,...,3185.0,2823.371314,False,,3185.0,,3193.0,2.007777,3193.0,free_kick
3087690,1,Air duel,20927.0,2576338,Duel,3185,2H,2874.049206,10.0,253567162,...,3185.0,2874.049206,False,,3185.0,,3193.0,3.066546,3193.0,free_kick


Add on previous info

In [23]:
# Drop events that should not count as the "previous event" when working out
# the assist type (goalkeeper leaving line, interruptions, accelerations).
mask_exclude = (df_wyscout_event.eventName.isin(['Goalkeeper leaving line', 'Interruption']) |
                (df_wyscout_event.subEventName == 'Acceleration'))
df_wyscout_event = df_wyscout_event.loc[~mask_exclude].copy()

# For each event, attach selected fields of the 1st, 2nd and 3rd preceding event
# within the same match period.
match_group = df_wyscout_event.groupby(['match_id', 'matchPeriod'])
prev_columns = {'prev_id': 'id',
                'prevEventName': 'eventName',
                'prevSubEventName': 'subEventName',
                'prev_player_id': 'player_id',
                'prev_team_id': 'team_id',
                'prev_pass_attempt': 'pass_attempt',
                'prev_shot': 'shot'}
for lag in (1, 2, 3):
    for prefix, source_col in prev_columns.items():
        df_wyscout_event[f'{prefix}_{lag}'] = match_group[source_col].shift(lag)
df_wyscout_event

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,prev_team_id_2,prev_pass_attempt_2,prev_shot_2,prev_id_3,prevEventName_3,prevSubEventName_3,prev_player_id_3,prev_team_id_3,prev_pass_attempt_3,prev_shot_3
0,8,Simple pass,26010.0,1694390,Pass,4418,1H,1.255990,85.0,88178642,...,,,,,,,,,,
1,8,Simple pass,3682.0,1694390,Pass,4418,1H,2.351908,85.0,88178643,...,,,,,,,,,,
2,8,Simple pass,31528.0,1694390,Pass,4418,1H,3.241028,85.0,88178644,...,4418.0,True,False,,,,,,,
3,8,High pass,7855.0,1694390,Pass,4418,1H,6.033681,83.0,88178645,...,4418.0,True,False,88178642.0,Pass,Simple pass,26010.0,4418.0,True,False
4,1,Ground defending duel,25437.0,1694390,Duel,4418,1H,13.143591,12.0,88178646,...,4418.0,True,False,88178643.0,Pass,Simple pass,3682.0,4418.0,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087687,3,Free kick cross,70974.0,2576338,Set Piece,3193,2H,2870.982660,32.0,253567160,...,3185.0,False,False,253567155.0,Duel,Ground attacking duel,116269.0,3193.0,False,False
3087688,1,Ground loose ball duel,14745.0,2576338,Duel,3185,2H,2872.101142,13.0,253567161,...,3185.0,False,False,253567158.0,Duel,Ground defending duel,21234.0,3185.0,False,False
3087689,1,Ground loose ball duel,413041.0,2576338,Duel,3193,2H,2872.990437,13.0,253567163,...,3193.0,True,False,253567159.0,Foul,Foul,21234.0,3185.0,False,False
3087690,1,Air duel,20927.0,2576338,Duel,3185,2H,2874.049206,10.0,253567162,...,3185.0,False,False,253567160.0,Set Piece,Free kick cross,70974.0,3193.0,True,False


### Filter shots: exclude goals direct from corners and extra-time/penalty-period shots

In [24]:
# Shot-like events: 'Shot' events, free-kick shots and penalties ...
is_shot_event = ((df_wyscout_event.eventName == 'Shot') |
                 df_wyscout_event.subEventName.isin(['Free kick shot', 'Penalty']))
# ... outside the 'P', 'E1' and 'E2' match periods.
in_regular_time = ~df_wyscout_event['matchPeriod'].isin(['P', 'E1', 'E2'])
mask_shot1 = is_shot_event & in_regular_time

# Goals scored directly from a corner are excluded from the shot table.
mask_corner_goal = (df_wyscout_event.subEventName == 'Corner') & (df_wyscout_event.goal == True)
mask_shot2 = mask_corner_goal.copy()  # same mask under its second name
mask_shot = mask_shot1 & ~mask_corner_goal

df_wyscout_shots = df_wyscout_event[mask_shot].copy()
print('Number of shots:', len(df_wyscout_shots))
print('Number of goals:', df_wyscout_shots.goal.sum())
df_wyscout_shots

Number of shots: 43375
Number of goals: 4779


Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,prev_team_id_2,prev_pass_attempt_2,prev_shot_2,prev_id_3,prevEventName_3,prevSubEventName_3,prev_player_id_3,prev_team_id_3,prev_pass_attempt_3,prev_shot_3
8,10,Shot,25437.0,1694390,Shot,4418,1H,31.226217,100.0,88178649,...,4418.0,True,False,88178663.0,Duel,Ground attacking duel,83575.0,11944.0,False,False
43,10,Shot,83824.0,1694390,Shot,11944,1H,143.119551,100.0,88178722,...,4418.0,True,False,88178719.0,Set Piece,Throw in,105330.0,11944.0,True,False
59,10,Shot,33235.0,1694390,Shot,11944,1H,219.576026,100.0,88178751,...,11944.0,False,False,88178743.0,Set Piece,Corner,83753.0,11944.0,True,False
69,10,Shot,6165.0,1694390,Shot,11944,1H,247.532561,100.0,88373458,...,11944.0,False,False,88373456.0,Set Piece,Corner,83753.0,11944.0,True,False
162,10,Shot,3682.0,1694390,Shot,4418,1H,557.319065,100.0,88178811,...,4418.0,True,False,88178805.0,Pass,Simple pass,31528.0,4418.0,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087190,10,Shot,116269.0,2576338,Shot,3193,2H,1152.032980,100.0,253566542,...,3193.0,True,False,253566537.0,Pass,Simple pass,20749.0,3193.0,True,False
3087224,10,Shot,3548.0,2576338,Shot,3193,2H,1251.730517,100.0,253566586,...,3193.0,True,False,253566580.0,Pass,Simple pass,3548.0,3193.0,True,False
3087489,10,Shot,21177.0,2576338,Shot,3193,2H,2065.034482,100.0,253566910,...,3185.0,True,False,253566952.0,Duel,Ground attacking duel,14745.0,3185.0,False,False
3087538,10,Shot,349102.0,2576338,Shot,3193,2H,2367.252041,100.0,253566961,...,3193.0,True,False,253567008.0,Others on the ball,Clearance,20927.0,3185.0,False,False


Add body part name

In [25]:
# Collapse the three boolean body-part flags into one label column.
# Later entries overwrite earlier ones when more than one flag is set.
body_part_labels = {'left_foot': 'Left Foot',
                    'right_foot': 'Right Foot',
                    'other_body_part': 'Other'}
for flag_col, label in body_part_labels.items():
    df_wyscout_shots.loc[df_wyscout_shots[flag_col], 'body_part_name'] = label
df_wyscout_shots = df_wyscout_shots.drop(columns=list(body_part_labels))
df_wyscout_shots

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,prev_pass_attempt_2,prev_shot_2,prev_id_3,prevEventName_3,prevSubEventName_3,prev_player_id_3,prev_team_id_3,prev_pass_attempt_3,prev_shot_3,body_part_name
8,10,Shot,25437.0,1694390,Shot,4418,1H,31.226217,100.0,88178649,...,True,False,88178663.0,Duel,Ground attacking duel,83575.0,11944.0,False,False,Right Foot
43,10,Shot,83824.0,1694390,Shot,11944,1H,143.119551,100.0,88178722,...,True,False,88178719.0,Set Piece,Throw in,105330.0,11944.0,True,False,Right Foot
59,10,Shot,33235.0,1694390,Shot,11944,1H,219.576026,100.0,88178751,...,False,False,88178743.0,Set Piece,Corner,83753.0,11944.0,True,False,Right Foot
69,10,Shot,6165.0,1694390,Shot,11944,1H,247.532561,100.0,88373458,...,False,False,88373456.0,Set Piece,Corner,83753.0,11944.0,True,False,Other
162,10,Shot,3682.0,1694390,Shot,4418,1H,557.319065,100.0,88178811,...,True,False,88178805.0,Pass,Simple pass,31528.0,4418.0,True,False,Left Foot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087190,10,Shot,116269.0,2576338,Shot,3193,2H,1152.032980,100.0,253566542,...,True,False,253566537.0,Pass,Simple pass,20749.0,3193.0,True,False,Left Foot
3087224,10,Shot,3548.0,2576338,Shot,3193,2H,1251.730517,100.0,253566586,...,True,False,253566580.0,Pass,Simple pass,3548.0,3193.0,True,False,Left Foot
3087489,10,Shot,21177.0,2576338,Shot,3193,2H,2065.034482,100.0,253566910,...,True,False,253566952.0,Duel,Ground attacking duel,14745.0,3185.0,False,False,Left Foot
3087538,10,Shot,349102.0,2576338,Shot,3193,2H,2367.252041,100.0,253566961,...,True,False,253567008.0,Others on the ball,Clearance,20927.0,3185.0,False,False,Right Foot


Strongest foot column

In [26]:
# A shot is with the strong foot when the foot used matches the player's
# preferred foot ('both' counts for either side).
preferred_foot = df_wyscout_shots['foot']
shot_body_part = df_wyscout_shots['body_part_name']
right_on_right = preferred_foot.isin(['right', 'both']) & (shot_body_part == 'Right Foot')
left_on_left = preferred_foot.isin(['left', 'both']) & (shot_body_part == 'Left Foot')
mask_strong_foot = right_on_right | left_on_left
df_wyscout_shots['strong_foot'] = mask_strong_foot
df_wyscout_shots

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,prev_shot_2,prev_id_3,prevEventName_3,prevSubEventName_3,prev_player_id_3,prev_team_id_3,prev_pass_attempt_3,prev_shot_3,body_part_name,strong_foot
8,10,Shot,25437.0,1694390,Shot,4418,1H,31.226217,100.0,88178649,...,False,88178663.0,Duel,Ground attacking duel,83575.0,11944.0,False,False,Right Foot,False
43,10,Shot,83824.0,1694390,Shot,11944,1H,143.119551,100.0,88178722,...,False,88178719.0,Set Piece,Throw in,105330.0,11944.0,True,False,Right Foot,True
59,10,Shot,33235.0,1694390,Shot,11944,1H,219.576026,100.0,88178751,...,False,88178743.0,Set Piece,Corner,83753.0,11944.0,True,False,Right Foot,True
69,10,Shot,6165.0,1694390,Shot,11944,1H,247.532561,100.0,88373458,...,False,88373456.0,Set Piece,Corner,83753.0,11944.0,True,False,Other,False
162,10,Shot,3682.0,1694390,Shot,4418,1H,557.319065,100.0,88178811,...,False,88178805.0,Pass,Simple pass,31528.0,4418.0,True,False,Left Foot,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087190,10,Shot,116269.0,2576338,Shot,3193,2H,1152.032980,100.0,253566542,...,False,253566537.0,Pass,Simple pass,20749.0,3193.0,True,False,Left Foot,True
3087224,10,Shot,3548.0,2576338,Shot,3193,2H,1251.730517,100.0,253566586,...,False,253566580.0,Pass,Simple pass,3548.0,3193.0,True,False,Left Foot,True
3087489,10,Shot,21177.0,2576338,Shot,3193,2H,2065.034482,100.0,253566910,...,False,253566952.0,Duel,Ground attacking duel,14745.0,3185.0,False,False,Left Foot,True
3087538,10,Shot,349102.0,2576338,Shot,3193,2H,2367.252041,100.0,253566961,...,False,253567008.0,Others on the ball,Clearance,20927.0,3185.0,False,False,Right Foot,True


Add shot type name

In [27]:
# note there are three goals that come direct from corner kicks - they are tagged 'Free Kick' originally though
# this was renamed earlier to Set Piece
#
# Map the Wyscout sub-event name to a shot type; 'Penalty' keeps its name.
# The mapped series is assigned back to the column. Calling
# `.replace(..., inplace=True)` on `df.column` modifies a temporary object when
# pandas Copy-on-Write is active, so the frame would be left unchanged.
shot_type_labels = {'Free kick shot': 'Direct Set Piece',
                    'Shot': 'Open Play'}
df_wyscout_shots['shot_type_name'] = df_wyscout_shots.subEventName.replace(shot_type_labels)
df_wyscout_shots.shot_type_name.unique()

array(['Open Play', 'Penalty', 'Direct Set Piece'], dtype=object)

Assist type, and get event id if pass

In [28]:
# Classify how each shot came about ('assist_type') from up to three preceding events.
# The rules run in priority order: every rule only fills rows whose assist_type is
# still null, so an earlier rule always wins. 'pass_id' is set only for rows labelled
# 'pass' and holds the id of the event treated as the assisting pass.

# direct from set pieces
df_wyscout_shots.loc[df_wyscout_shots.shot_type_name == 'Direct Set Piece', 'assist_type'] = 'direct'
df_wyscout_shots.loc[df_wyscout_shots.shot_type_name == 'Penalty', 'assist_type'] = 'direct'
# rebound/ clearance from the previous event
# NOTE(review): 'Set Piece' is in this list, so a shot immediately after any Set Piece
# event (e.g. a corner or throw-in) is labelled 'rebound' before the pass rule below
# is tried - confirm this is intended.
df_wyscout_shots.loc[(df_wyscout_shots.assist_type.isnull()) &
                     (df_wyscout_shots.prevEventName_1.isin(['Save attempt', 'Shot', 'Set Piece'])), 'assist_type'] = 'rebound'
df_wyscout_shots.loc[(df_wyscout_shots.assist_type.isnull()) &
                     (df_wyscout_shots.prevSubEventName_1 == 'Clearance'), 'assist_type'] = 'clearance'
# pass from the previous event (same team as the shooter)
mask_pass1 = ((df_wyscout_shots.prev_pass_attempt_1 == True) &
              (df_wyscout_shots.team_id == df_wyscout_shots.prev_team_id_1) &
              (df_wyscout_shots.assist_type.isnull()))
df_wyscout_shots.loc[mask_pass1, 'assist_type'] = 'pass'
df_wyscout_shots.loc[mask_pass1, 'pass_id'] = df_wyscout_shots.loc[mask_pass1, 'prev_id_1']
# pass from the third previous event if there are two duels in-between
# mask_duel is evaluated once here: its null check reflects assist_type at this point,
# which is why the rebound/clearance rules below test isnull() again.
mask_duel = ((df_wyscout_shots.prevEventName_1 == 'Duel') &
             (df_wyscout_shots.prevEventName_2 == 'Duel') &
             (df_wyscout_shots.assist_type.isnull()))
mask_pass2 = (mask_duel & (df_wyscout_shots.prev_pass_attempt_3) &
              (df_wyscout_shots.team_id == df_wyscout_shots.prev_team_id_3))
df_wyscout_shots.loc[mask_pass2, 'assist_type'] = 'pass'
df_wyscout_shots.loc[mask_pass2, 'pass_id'] = df_wyscout_shots.loc[mask_pass2, 'prev_id_3']
# rebound/clearance if the third previous event involved a shot or save and there are two duels in-between
df_wyscout_shots.loc[(df_wyscout_shots.assist_type.isnull()) & mask_duel &
                     ((df_wyscout_shots.prev_shot_3) | (df_wyscout_shots.prevEventName_3 == 'Save attempt')),
                     'assist_type'] = 'rebound'
df_wyscout_shots.loc[(df_wyscout_shots.assist_type.isnull()) & mask_duel &
                     (df_wyscout_shots.prevSubEventName_3 =='Clearance'),
                     'assist_type'] = 'clearance'
# if still null and the second event involves a shot or save set to a rebound
df_wyscout_shots.loc[(df_wyscout_shots.assist_type.isnull()) &
                     ((df_wyscout_shots.prevEventName_2 == 'Save attempt') | (df_wyscout_shots.prev_shot_2)),
                     'assist_type'] = 'rebound'
# if still null and the second event involves a clearance set to a clearance
df_wyscout_shots.loc[df_wyscout_shots.assist_type.isnull() & (df_wyscout_shots.prevSubEventName_2 == 'Clearance'),
                     'assist_type'] = 'clearance'
# if still null and the third event involves a pass set to a pass
# (unlike mask_pass2, this does not require the two events in-between to be duels)
mask_pass3 = ((df_wyscout_shots.assist_type.isnull()) & (df_wyscout_shots.prev_pass_attempt_3) &
              (df_wyscout_shots.prev_team_id_3 == df_wyscout_shots.team_id))
df_wyscout_shots.loc[mask_pass3, 'assist_type'] = 'pass'
df_wyscout_shots.loc[mask_pass3, 'pass_id'] = df_wyscout_shots.loc[mask_pass3, 'prev_id_3']
# if still null set to recovery
df_wyscout_shots.loc[df_wyscout_shots.assist_type.isnull(), 'assist_type'] = 'recovery'

Keep subset of columns

In [29]:
# Columns carried forward into the shot table, grouped by theme.
shot_columns = [
    # match / event identifiers
    'match_id', 'matchPeriod', 'eventSec', 'id', 'goal', 'team_id', 'team_name', 'player_id',
    # player name
    'firstName', 'middleName', 'lastName', 'Name',
    # shot context
    'shot_type_name', 'play_type',
    'x', 'y', 'counter_attack', 'fast_break',
    'strong_foot', 'body_part_name', 'assist_type', 'pass_id',
]
df_wyscout_shots = df_wyscout_shots[shot_columns].copy()

Add on pass information

In [30]:
# Pass attributes for every pass attempt, renamed so they can be joined to the
# shot table on 'pass_id'.
pass_columns = ['id', 'end_y', 'end_x', 'smart_pass',
                'pass_switch', 'cross', 'cut_back', 'pass_height_name',
                'pass_technique_name']
pass_renames = {'id': 'pass_id',
                'end_x': 'pass_end_x',
                'end_y': 'pass_end_y',
                'cross': 'pass_cross',
                'cut_back': 'pass_cut_back'}
df_pass = (df_wyscout_event
           .loc[df_wyscout_event.pass_attempt, pass_columns]
           .rename(columns=pass_renames))
df_pass

Unnamed: 0,pass_id,pass_end_y,pass_end_x,smart_pass,pass_switch,pass_cross,pass_cut_back,pass_height_name,pass_technique_name
0,88178642,50.0,47.0,False,False,False,False,Ground/ Low Pass,
1,88178643,48.0,41.0,False,False,False,False,Ground/ Low Pass,
2,88178644,35.0,32.0,False,False,False,False,Ground/ Low Pass,
3,88178645,6.0,89.0,False,False,False,False,High Pass,
6,88178648,16.0,93.0,False,False,False,False,High Pass,
...,...,...,...,...,...,...,...,...,...
3087673,253567133,60.0,33.0,False,False,False,False,Ground/ Low Pass,
3087676,253567136,43.0,31.0,False,False,False,False,Ground/ Low Pass,
3087678,253567145,87.0,61.0,False,False,False,False,High Pass,
3087680,253567149,26.0,70.0,False,False,False,False,Ground/ Low Pass,


In [31]:
# Left join: every shot is kept; pass columns are NaN for shots without a pass_id.
df_wyscout_shots = pd.merge(df_wyscout_shots, df_pass, on='pass_id', how='left')
df_wyscout_shots

Unnamed: 0,match_id,matchPeriod,eventSec,id,goal,team_id,team_name,player_id,firstName,middleName,...,assist_type,pass_id,pass_end_y,pass_end_x,smart_pass,pass_switch,pass_cross,pass_cut_back,pass_height_name,pass_technique_name
0,1694390,1H,31.226217,88178649,False,4418,France,25437.0,Blaise,,...,recovery,,,,,,,,,
1,1694390,1H,143.119551,88178722,False,11944,Romania,83824.0,Mihai,Doru,...,pass,88178719.0,34.0,92.0,False,False,False,False,High Pass,
2,1694390,1H,219.576026,88178751,False,11944,Romania,33235.0,Bogdan,Sorin,...,pass,88178743.0,40.0,96.0,False,False,True,False,High Pass,Outswinging
3,1694390,1H,247.532561,88373458,False,11944,Romania,6165.0,Florin,,...,pass,88373456.0,62.0,95.0,False,False,True,False,High Pass,Inswinging
4,1694390,1H,557.319065,88178811,False,4418,France,3682.0,Antoine,,...,pass,88178808.0,33.0,75.0,False,False,False,False,Ground/ Low Pass,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43370,2576338,2H,1152.032980,253566542,False,3193,Genoa CFC,116269.0,Diego,Sebastián Laxalt,...,pass,253566537.0,95.0,74.0,False,True,False,False,Ground/ Low Pass,
43371,2576338,2H,1251.730517,253566586,False,3193,Genoa CFC,3548.0,Giuseppe,,...,pass,253566584.0,38.0,93.0,True,False,False,False,Ground/ Low Pass,Through Ball
43372,2576338,2H,2065.034482,253566910,True,3193,Genoa CFC,21177.0,Goran,,...,recovery,,,,,,,,,
43373,2576338,2H,2367.252041,253566961,False,3193,Genoa CFC,349102.0,Stephane,,...,pass,253566959.0,32.0,79.0,False,False,False,False,Ground/ Low Pass,


Convert coordinates to standard pitch size (105m * 68m)

In [32]:
# Rescale Wyscout coordinates to the dimensions of the target pitch.
# x is scaled directly; y is measured from the opposite edge first
# (bottom - y) and then scaled.
# Note: the columns are overwritten in place, so running this cell twice rescales twice.
x_cols = ['x', 'pass_end_x']
y_cols = ['y', 'pass_end_y']
source_x_max = float(pitch_wyscout.dim.right)
source_y_max = float(pitch_wyscout.dim.bottom)
target_x_max = pitch_statsperform.dim.right
target_y_max = pitch_statsperform.dim.top
df_wyscout_shots[x_cols] = df_wyscout_shots[x_cols] / source_x_max * target_x_max
df_wyscout_shots[y_cols] = (source_y_max - df_wyscout_shots[y_cols]) / source_y_max * target_y_max

Dropping the end locations of the assist pass, as they are too often the same as the shot location (compared with StatsBomb, where they differ more often)

In [33]:
# Remove the assist-pass end coordinates from the shot table.
df_wyscout_shots = df_wyscout_shots.drop(columns=['pass_end_y', 'pass_end_x'])

Angles/ distance to goals

In [34]:
# Geometry of each shot relative to the right-hand goal of the target pitch.
left_post, right_post = pitch_statsperform.goal_right
goal_width = abs(right_post[1] - left_post[1])
# absolute offsets from the goal line (dx) and from the centre of the pitch width (dy)
dx = (pitch_statsperform.dim.right - df_wyscout_shots.x).abs()
dy = (pitch_statsperform.dim.center_width - df_wyscout_shots.y).abs()
squared_distance = dx**2 + dy**2
# angle subtended by the two posts as seen from the shot location
df_wyscout_shots['visible_angle'] = np.arctan2(goal_width * dx, squared_distance - (goal_width / 2.) ** 2)
# angle between the shot location and the centre of the goal
df_wyscout_shots['middle_angle'] = np.arctan2(dy, dx)
# straight-line distance to the centre of the goal, rounded to one decimal
df_wyscout_shots['distance_to_goal'] = (squared_distance ** 0.5).round(1)
df_wyscout_shots

Unnamed: 0,match_id,matchPeriod,eventSec,id,goal,team_id,team_name,player_id,firstName,middleName,...,pass_id,smart_pass,pass_switch,pass_cross,pass_cut_back,pass_height_name,pass_technique_name,visible_angle,middle_angle,distance_to_goal
0,1694390,1H,31.226217,88178649,False,4418,France,25437.0,Blaise,,...,,,,,,,,0.242346,0.986195,17.1
1,1694390,1H,143.119551,88178722,False,11944,Romania,83824.0,Mihai,Doru,...,88178719.0,False,False,False,False,High Pass,,0.196835,0.438513,33.6
2,1694390,1H,219.576026,88178751,False,11944,Romania,33235.0,Bogdan,Sorin,...,88178743.0,False,False,True,False,High Pass,Outswinging,0.851948,0.847817,6.3
3,1694390,1H,247.532561,88373458,False,11944,Romania,6165.0,Florin,,...,88373456.0,False,False,True,False,High Pass,Inswinging,0.472204,1.059169,8.6
4,1694390,1H,557.319065,88178811,False,4418,France,3682.0,Antoine,,...,88178808.0,False,False,False,False,Ground/ Low Pass,,0.233111,0.414826,28.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43370,2576338,2H,1152.032980,253566542,False,3193,Genoa CFC,116269.0,Diego,Sebastián Laxalt,...,253566537.0,False,True,False,False,Ground/ Low Pass,,0.980870,0.574700,6.3
43371,2576338,2H,1251.730517,253566586,False,3193,Genoa CFC,3548.0,Giuseppe,,...,253566584.0,True,False,False,False,Ground/ Low Pass,Through Ball,0.465107,0.837575,11.0
43372,2576338,2H,2065.034482,253566910,True,3193,Genoa CFC,21177.0,Goran,,...,,,,,,,,0.635289,0.253476,10.8
43373,2576338,2H,2367.252041,253566961,False,3193,Genoa CFC,349102.0,Stephane,,...,253566959.0,False,False,False,False,Ground/ Low Pass,,0.253651,0.506752,25.2


Interaction between angle and distance

In [35]:
# Interaction feature: distance to goal multiplied by the visible goal angle.
df_wyscout_shots['distance_visible_angle'] = (
    df_wyscout_shots['distance_to_goal'].mul(df_wyscout_shots['visible_angle'])
)

Log distance

In [36]:
# Natural log of the distance to goal.
# NOTE(review): distance_to_goal is rounded to one decimal, so a value of 0.0 would
# give -inf here - confirm no such rows exist.
distance_to_goal = df_wyscout_shots['distance_to_goal']
df_wyscout_shots['log_distance_to_goal'] = np.log(distance_to_goal)

Amend shot type to take into account the play_type (set piece column) made earlier

In [37]:
# Relabel shots with the set piece they followed (play_type: corner, throw_in or
# free_kick) when one was flagged for the event.
# Direct free-kick shots and penalties keep their own label: both are direct shots,
# and a penalty that happens to fall inside a set-piece time window must not be
# relabelled as a corner/throw-in/free-kick shot.
direct_shot_types = ['Direct Set Piece', 'Penalty']
mask_amend = (~df_wyscout_shots.shot_type_name.isin(direct_shot_types)) & (df_wyscout_shots.play_type.notnull())
df_wyscout_shots.loc[mask_amend, 'shot_type_name'] = df_wyscout_shots.loc[mask_amend, 'play_type']

In [38]:
# Normalise the labels to lower snake case, then discard the helper column.
snake_case_shot_type = df_wyscout_shots['shot_type_name'].str.replace(' ', '_').str.lower()
df_wyscout_shots['shot_type_name'] = snake_case_shot_type
df_wyscout_shots = df_wyscout_shots.drop(columns='play_type')

Add competition gender (every shot in this table is labelled male)

In [39]:
# Constant column: every row gets the value 'male'.
df_wyscout_shots = df_wyscout_shots.assign(competition_gender='male')

Match Period consistent with StatsBomb

In [40]:
# Rename matchPeriod to period and recode it as an integer.
# Values not listed in the mapping become NaN.
period_codes = {'1H': 1, '2H': 2, 'E1': 3, 'E2': 4}
df_wyscout_shots = df_wyscout_shots.rename(columns={'matchPeriod': 'period'})
df_wyscout_shots['period'] = df_wyscout_shots['period'].map(period_codes)

Add Home/Away Team

In [41]:
# Attach match-level information (match week, home/away team ids, competition) to each shot.
match_info_cols = ['match_id',
                   'match_week', 'home_team_id', 'away_team_id', 'competition_name']
df_wyscout_shots = pd.merge(df_wyscout_shots, df_wyscout_match[match_info_cols],
                            on='match_id', how='left')
df_wyscout_shots

Unnamed: 0,match_id,period,eventSec,id,goal,team_id,team_name,player_id,firstName,middleName,...,visible_angle,middle_angle,distance_to_goal,distance_visible_angle,log_distance_to_goal,competition_gender,match_week,home_team_id,away_team_id,competition_name
0,1694390,1,31.226217,88178649,False,4418,France,25437.0,Blaise,,...,0.242346,0.986195,17.1,4.144110,2.839078,male,1,4418.0,11944.0,UEFA Euro
1,1694390,1,143.119551,88178722,False,11944,Romania,83824.0,Mihai,Doru,...,0.196835,0.438513,33.6,6.613670,3.514526,male,1,4418.0,11944.0,UEFA Euro
2,1694390,1,219.576026,88178751,False,11944,Romania,33235.0,Bogdan,Sorin,...,0.851948,0.847817,6.3,5.367272,1.840550,male,1,4418.0,11944.0,UEFA Euro
3,1694390,1,247.532561,88373458,False,11944,Romania,6165.0,Florin,,...,0.472204,1.059169,8.6,4.060955,2.151762,male,1,4418.0,11944.0,UEFA Euro
4,1694390,1,557.319065,88178811,False,4418,France,3682.0,Antoine,,...,0.233111,0.414826,28.7,6.690284,3.356897,male,1,4418.0,11944.0,UEFA Euro
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43370,2576338,2,1152.032980,253566542,False,3193,Genoa CFC,116269.0,Diego,Sebastián Laxalt,...,0.980870,0.574700,6.3,6.179484,1.840550,male,38,3193.0,3185.0,Serie A
43371,2576338,2,1251.730517,253566586,False,3193,Genoa CFC,3548.0,Giuseppe,,...,0.465107,0.837575,11.0,5.116179,2.397895,male,38,3193.0,3185.0,Serie A
43372,2576338,2,2065.034482,253566910,True,3193,Genoa CFC,21177.0,Goran,,...,0.635289,0.253476,10.8,6.861118,2.379546,male,38,3193.0,3185.0,Serie A
43373,2576338,2,2367.252041,253566961,False,3193,Genoa CFC,349102.0,Stephane,,...,0.253651,0.506752,25.2,6.392012,3.226844,male,38,3193.0,3185.0,Serie A


Add Type of competition (League or Cup)

In [42]:
# 'UEFA Euro' is the only competition labelled as a cup; everything else
# (including a missing competition name) is labelled as a league.
is_euro = df_wyscout_shots['competition_name'] == 'UEFA Euro'
df_wyscout_shots['competition_type'] = is_euro.map({True: 'Cup', False: 'League'})
df_wyscout_shots

Unnamed: 0,match_id,period,eventSec,id,goal,team_id,team_name,player_id,firstName,middleName,...,middle_angle,distance_to_goal,distance_visible_angle,log_distance_to_goal,competition_gender,match_week,home_team_id,away_team_id,competition_name,competition_type
0,1694390,1,31.226217,88178649,False,4418,France,25437.0,Blaise,,...,0.986195,17.1,4.144110,2.839078,male,1,4418.0,11944.0,UEFA Euro,Cup
1,1694390,1,143.119551,88178722,False,11944,Romania,83824.0,Mihai,Doru,...,0.438513,33.6,6.613670,3.514526,male,1,4418.0,11944.0,UEFA Euro,Cup
2,1694390,1,219.576026,88178751,False,11944,Romania,33235.0,Bogdan,Sorin,...,0.847817,6.3,5.367272,1.840550,male,1,4418.0,11944.0,UEFA Euro,Cup
3,1694390,1,247.532561,88373458,False,11944,Romania,6165.0,Florin,,...,1.059169,8.6,4.060955,2.151762,male,1,4418.0,11944.0,UEFA Euro,Cup
4,1694390,1,557.319065,88178811,False,4418,France,3682.0,Antoine,,...,0.414826,28.7,6.690284,3.356897,male,1,4418.0,11944.0,UEFA Euro,Cup
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43370,2576338,2,1152.032980,253566542,False,3193,Genoa CFC,116269.0,Diego,Sebastián Laxalt,...,0.574700,6.3,6.179484,1.840550,male,38,3193.0,3185.0,Serie A,League
43371,2576338,2,1251.730517,253566586,False,3193,Genoa CFC,3548.0,Giuseppe,,...,0.837575,11.0,5.116179,2.397895,male,38,3193.0,3185.0,Serie A,League
43372,2576338,2,2065.034482,253566910,True,3193,Genoa CFC,21177.0,Goran,,...,0.253476,10.8,6.861118,2.379546,male,38,3193.0,3185.0,Serie A,League
43373,2576338,2,2367.252041,253566961,False,3193,Genoa CFC,349102.0,Stephane,,...,0.506752,25.2,6.392012,3.226844,male,38,3193.0,3185.0,Serie A,League


In [43]:
# Label each shot by whether the shooting team is the home or the away side.
# Rows matching neither id stay NaN; the away label is applied last.
side_labels = (('home_team_id', 'Home Team'),
               ('away_team_id', 'Away Team'))
for side_col, label in side_labels:
    is_side = df_wyscout_shots['team_id'] == df_wyscout_shots[side_col]
    df_wyscout_shots.loc[is_side, 'H_A_column'] = label
df_wyscout_shots = df_wyscout_shots.drop(columns=['home_team_id', 'away_team_id'])
df_wyscout_shots

Unnamed: 0,match_id,period,eventSec,id,goal,team_id,team_name,player_id,firstName,middleName,...,visible_angle,middle_angle,distance_to_goal,distance_visible_angle,log_distance_to_goal,competition_gender,match_week,competition_name,competition_type,H_A_column
0,1694390,1,31.226217,88178649,False,4418,France,25437.0,Blaise,,...,0.242346,0.986195,17.1,4.144110,2.839078,male,1,UEFA Euro,Cup,Home Team
1,1694390,1,143.119551,88178722,False,11944,Romania,83824.0,Mihai,Doru,...,0.196835,0.438513,33.6,6.613670,3.514526,male,1,UEFA Euro,Cup,Away Team
2,1694390,1,219.576026,88178751,False,11944,Romania,33235.0,Bogdan,Sorin,...,0.851948,0.847817,6.3,5.367272,1.840550,male,1,UEFA Euro,Cup,Away Team
3,1694390,1,247.532561,88373458,False,11944,Romania,6165.0,Florin,,...,0.472204,1.059169,8.6,4.060955,2.151762,male,1,UEFA Euro,Cup,Away Team
4,1694390,1,557.319065,88178811,False,4418,France,3682.0,Antoine,,...,0.233111,0.414826,28.7,6.690284,3.356897,male,1,UEFA Euro,Cup,Home Team
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43370,2576338,2,1152.032980,253566542,False,3193,Genoa CFC,116269.0,Diego,Sebastián Laxalt,...,0.980870,0.574700,6.3,6.179484,1.840550,male,38,Serie A,League,Home Team
43371,2576338,2,1251.730517,253566586,False,3193,Genoa CFC,3548.0,Giuseppe,,...,0.465107,0.837575,11.0,5.116179,2.397895,male,38,Serie A,League,Home Team
43372,2576338,2,2065.034482,253566910,True,3193,Genoa CFC,21177.0,Goran,,...,0.635289,0.253476,10.8,6.861118,2.379546,male,38,Serie A,League,Home Team
43373,2576338,2,2367.252041,253566961,False,3193,Genoa CFC,349102.0,Stephane,,...,0.253651,0.506752,25.2,6.392012,3.226844,male,38,Serie A,League,Home Team


Add players on the attacking and defending team (subtracting the sent-off players).

In [44]:
# Create a new df with only the red cards and second yellow cards
# Each comparison is parenthesised on its own: `|` binds more tightly than `==`,
# so `a == True | b == True` would be read as `(a == True | b) == True`.
mask_red = ((df_wyscout_event.red_card == True) |
            (df_wyscout_event.second_yellow_card == True))
df_wyscout_reds = df_wyscout_event[mask_red].copy()
# add the home/away team ids of the match each card was shown in
df_wyscout_reds = df_wyscout_reds.merge(df_wyscout_match[['match_id', 'home_team_id', 'away_team_id']],
                                          how='left', on='match_id')
# minute of the card, rounded up from eventSec; first-half values above 45 are capped at 45
# NOTE(review): eventSec appears to restart in each match period (2H rows show small
# values), so 'minute' looks relative to the start of the period - confirm before
# comparing it with match-clock minutes.
df_wyscout_reds['minute'] = np.ceil(df_wyscout_reds.eventSec / 60)
df_wyscout_reds.loc[(df_wyscout_reds.matchPeriod == '1H') & (df_wyscout_reds.minute > 45), 'minute'] = 45
df_wyscout_reds.sort_values('minute', inplace=True)
df_wyscout_reds

Unnamed: 0,eventId,subEventName,player_id,match_id,eventName,team_id,matchPeriod,eventSec,subEventId,id,...,prev_id_3,prevEventName_3,prevSubEventName_3,prev_player_id_3,prev_team_id_3,prev_pass_attempt_3,prev_shot_3,home_team_id,away_team_id,minute
120,2,Foul,25605.0,2501022,Foul,3770,2H,16.460098,20.0,244858571,...,244858944.0,Others on the ball,Touch,25399.0,3774.0,False,False,3770.0,3774.0,1.0
321,2,Violent Foul,134508.0,2576315,Foul,3163,2H,110.286792,27.0,249430954,...,249430535.0,Others on the ball,Touch,14812.0,3161.0,False,False,3163.0,3161.0,2.0
165,2,Out of game foul,51132.0,2517010,Foul,2975,2H,106.625655,23.0,245019377,...,245019147.0,Duel,Ground defending duel,135561.0,2482.0,False,False,2975.0,2482.0,2.0
53,2,Violent Foul,25650.0,2500733,Foul,3782,2H,81.148817,27.0,186157006,...,186157002.0,Pass,Cross,278867.0,3782.0,True,False,3782.0,3799.0,2.0
218,2,Foul,25577.0,2565812,Foul,698,2H,81.716119,20.0,232024283,...,232024535.0,Pass,Simple pass,4501.0,675.0,True,False,675.0,698.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,2,Foul,20540.0,2576088,Foul,3163,2H,3055.537932,20.0,207175303,...,207174905.0,Pass,High pass,20875.0,3173.0,True,False,3163.0,3173.0,51.0
111,2,Protest,26276.0,2500959,Foul,3770,2H,3029.890070,24.0,232283394,...,232283670.0,Duel,Ground attacking duel,,3770.0,False,False,3771.0,3770.0,51.0
252,2,Foul,86309.0,2576027,Foul,3194,2H,3111.467553,20.0,192612426,...,192612366.0,Pass,Simple pass,252473.0,3185.0,True,False,3185.0,3194.0,52.0
282,2,Foul,246059.0,2576169,Foul,3315,2H,3061.076943,20.0,225281413,...,225281569.0,Duel,Ground defending duel,,3315.0,False,False,3315.0,3172.0,52.0


Use the formation df so that sent-off players who were on the bench are not counted.

In [45]:
# Minute at which each substitute came on, keyed by match and player.
df_wyscout_sub_in = (df_wyscout_sub[['match_id', 'player_id_in', 'minute']]
                     .rename(columns={'player_id_in': 'player_id', 'minute': 'minute_in'}))
# Minute at which each replaced player went off, keyed the same way.
df_wyscout_sub_out = (df_wyscout_sub[['match_id', 'player_id_out', 'minute']]
                      .rename(columns={'player_id_out': 'player_id', 'minute': 'minute_out'}))

# Attach both minutes to the lineup rows; match_id and player_id are the columns the frames share.
sub_keys = ['match_id', 'player_id']
df_wyscout_formation = (df_wyscout_formation
                        .merge(df_wyscout_sub_in, how='left', on=sub_keys)
                        .merge(df_wyscout_sub_out, how='left', on=sub_keys))

# Players who are not on the bench are on the pitch from minute 0.
is_starter = df_wyscout_formation.bench.eq(False)
df_wyscout_formation.loc[is_starter, 'minute_in'] = 0

# Rows still without an entry minute (bench players who never came on) are removed.
df_wyscout_formation = df_wyscout_formation.dropna(subset=['minute_in'])
df_wyscout_formation = df_wyscout_formation.sort_values('minute_in')
df_wyscout_formation = df_wyscout_formation.drop(
    columns=['lineup_id', 'ownGoals', 'goals', 'bench', 'lineup_assists'])
df_wyscout_formation

Unnamed: 0,match_id,team_id,player_id,redCards,yellowCards,minute_in,minute_out
0,2500089,1646,9206,0,0,0.0,61.0
41573,2576181,3166,20529,0,83,0.0,85.0
41574,2576183,3157,15524,0,0,0.0,79.0
41575,2576186,3159,3318,0,0,0.0,46.0
41576,2576187,3173,286223,0,17,0.0,88.0
...,...,...,...,...,...,...,...
15807,1694436,3757,21385,0,0,108.0,
15905,1694440,4418,134513,0,0,110.0,
15914,1694428,9598,135810,0,0,110.0,
15866,1694428,9598,69411,0,0,120.0,


Create a new df for counting at what moment a team loses a pitch player.

In [46]:
# Look up each carded player's lineup row (the merge uses every column the two frames share).
df_wyscout_reds = df_wyscout_reds.merge(df_wyscout_formation, how='left')

# Keep only cards given to players who have an entry minute and no substitution-off minute.
has_entry_minute = df_wyscout_reds.minute_in.notnull()
never_subbed_off = df_wyscout_reds.minute_out.isnull()
df_wyscout_reds = df_wyscout_reds[has_entry_minute & never_subbed_off].copy()

# Second-half minutes are shifted by 45.
in_second_half = df_wyscout_reds.matchPeriod == '2H'
df_wyscout_reds.loc[in_second_half, 'minute'] += 45
df_wyscout_reds = df_wyscout_reds.sort_values('minute')

# The n-th sending-off of a team in a match leaves that team with 11 - n players.
dismissal_order = (df_wyscout_reds
                   .groupby(['match_id', 'team_id'])['minute']
                   .rank(method="first", ascending=True))
df_wyscout_reds['players'] = 11 - dismissal_order

cols_to_keep = ['match_id', 'eventSec', 'id', 'team_id', 'redCards', 'yellowCards',
                'players', 'home_team_id', 'away_team_id', 'minute']
df_wyscout_reds = df_wyscout_reds[cols_to_keep].copy()
df_wyscout_reds

Unnamed: 0,match_id,eventSec,id,team_id,redCards,yellowCards,players,home_team_id,away_team_id,minute
8,2500915,133.957196,224010037,3795,3,0,10.0,3780.0,3795.0,3.0
9,2575961,129.395092,180454709,3197,5,0,10.0,3197.0,3157.0,3.0
15,2576305,322.856399,247589700,3187,6,0,10.0,3176.0,3187.0,6.0
21,2516884,386.055139,215485428,2457,7,0,10.0,2975.0,2457.0,7.0
17,2576285,368.575285,244328363,3176,7,0,10.0,3176.0,3162.0,7.0
...,...,...,...,...,...,...,...,...,...,...
325,2576088,3055.537932,207175303,3163,99,0,10.0,3163.0,3173.0,96.0
326,2500959,3029.890070,232283394,3770,95,39,10.0,3771.0,3770.0,96.0
328,2576169,3061.076943,225281413,3315,97,68,10.0,3315.0,3172.0,97.0
327,2576027,3111.467553,192612426,3194,97,79,10.0,3185.0,3194.0,97.0


Create a new df for counting at what moment the opposite team loses a pitch player. It's the same df as the previous one but with the id of the other team.

In [47]:
# Same sending-off rows, but re-keyed to the opposing team.
carded_is_home = df_wyscout_reds.team_id == df_wyscout_reds.home_team_id
carded_is_away = df_wyscout_reds.team_id == df_wyscout_reds.away_team_id
# Home team carded -> rival is the away team; away team carded -> rival is the home team.
rival_team_id = (df_wyscout_reds.away_team_id
                 .where(carded_is_home)
                 .mask(carded_is_away, df_wyscout_reds.home_team_id))

df_wyscout_reds2 = (df_wyscout_reds
                    .assign(team_id2=rival_team_id, team_id=rival_team_id)
                    .rename(columns={"players": "players_rival"}))
df_wyscout_reds2

Unnamed: 0,match_id,eventSec,id,team_id,redCards,yellowCards,players_rival,home_team_id,away_team_id,minute,team_id2
8,2500915,133.957196,224010037,3780.0,3,0,10.0,3780.0,3795.0,3.0,3780.0
9,2575961,129.395092,180454709,3157.0,5,0,10.0,3197.0,3157.0,3.0,3157.0
15,2576305,322.856399,247589700,3176.0,6,0,10.0,3176.0,3187.0,6.0,3176.0
21,2516884,386.055139,215485428,2975.0,7,0,10.0,2975.0,2457.0,7.0,2975.0
17,2576285,368.575285,244328363,3162.0,7,0,10.0,3176.0,3162.0,7.0,3162.0
...,...,...,...,...,...,...,...,...,...,...,...
325,2576088,3055.537932,207175303,3173.0,99,0,10.0,3163.0,3173.0,96.0,3173.0
326,2500959,3029.890070,232283394,3771.0,95,39,10.0,3771.0,3770.0,96.0,3771.0
328,2576169,3061.076943,225281413,3172.0,97,68,10.0,3315.0,3172.0,97.0,3172.0
327,2576027,3111.467553,192612426,3185.0,97,79,10.0,3185.0,3194.0,97.0,3185.0


Combine both dfs with the shots df, order the events by minute, and work out how many players each team has on the pitch for every shot.

In [48]:
# Minute of each shot: eventSec rounded up to whole minutes, with second-half shots shifted by 45.
# NOTE(review): unlike the red-card frame, first-half minutes are not capped at 45 here - confirm
# whether that difference is intended.
df_wyscout_shots['minute'] = np.ceil(df_wyscout_shots.eventSec / 60)
df_wyscout_shots.loc[(df_wyscout_shots.period == 2), 'minute'] = df_wyscout_shots.minute + 45

# Stack the shots with both sending-off frames (own team and rival team).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df_wyscout_shots = pd.concat([df_wyscout_shots, df_wyscout_reds, df_wyscout_reds2],
                             sort=False, ignore_index=True)
df_wyscout_shots = df_wyscout_shots.drop(
    columns=['team_id2', 'home_team_id', 'away_team_id', 'redCards', 'yellowCards'])
df_wyscout_shots = df_wyscout_shots.sort_values('minute')

# Within each team of each match, carry the latest player counts forward onto the following rows;
# rows with no earlier sending-off keep the full 11 players.
player_cols = ['players', 'players_rival']
df_wyscout_shots[player_cols] = (df_wyscout_shots
                                 .groupby(['match_id', 'team_id'])[player_cols]
                                 .ffill()
                                 .fillna(11))
df_wyscout_shots

Unnamed: 0,match_id,period,eventSec,id,goal,team_id,team_name,player_id,firstName,middleName,...,distance_visible_angle,log_distance_to_goal,competition_gender,match_week,competition_name,competition_type,H_A_column,minute,players,players_rival
0,1694390,1.0,31.226217,88178649,False,4418.0,France,25437.0,Blaise,,...,4.144110,2.839078,male,1.0,UEFA Euro,Cup,Home Team,1.0,11.0,11.0
14519,2500873,1.0,30.058262,216473192,False,19830.0,AS Monaco,256480.0,Keita,Baldé,...,6.994143,2.459589,male,19.0,Ligue 1,League,Home Team,1.0,11.0,11.0
14364,2500867,1.0,49.168492,216452062,False,3799.0,Angers SCO,27646.0,Pierrick,,...,6.965581,3.230804,male,19.0,Ligue 1,League,Home Team,1.0,11.0,11.0
31012,2565776,1.0,42.072257,227259240,True,679.0,Atlético Madrid,3682.0,Antoine,,...,6.165610,2.302585,male,23.0,La Liga,League,Away Team,1.0,11.0,11.0
36512,2576055,1.0,51.552028,199672340,False,3164.0,UC Sampdoria,20479.0,Fabio,,...,7.256019,3.346389,male,10.0,Serie A,League,Away Team,1.0,11.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3682,2499820,2.0,3307.255167,203362731,False,1644.0,Watford FC,7929.0,Tom,,...,7.058080,2.442347,male,11.0,Premier League,League,Away Team,101.0,11.0,11.0
1762,2499739,2.0,3310.186437,182137485,False,1659.0,AFC Bournemouth,9277.0,Steve,,...,5.375789,2.933857,male,3.0,Premier League,League,Home Team,101.0,11.0,10.0
19403,2516750,2.0,3389.861176,182026562,False,2463.0,1. FC Köln,127685.0,Miloš,,...,6.415413,2.660260,male,2.0,Bundesliga,League,Home Team,102.0,11.0,10.0
3683,2499820,2.0,3446.055301,203362910,False,1623.0,Everton,145692.0,El-Hadji,Baye Oumar,...,4.813802,2.879198,male,11.0,Premier League,League,Home Team,103.0,11.0,11.0


Drop the sent off events to have only shots again.

In [49]:
# The appended sending-off rows carry no `goal` value, so keeping the rows where `goal`
# is set leaves only the shots. The calls are chained without inplace=True so that nothing
# is modified on a filtered slice (the in-place drop triggered a SettingWithCopyWarning).
df_wyscout_shots = (df_wyscout_shots[df_wyscout_shots['goal'].notna()]
                    .drop(columns=['pass_id'])
                    .reset_index(drop=True))
df_wyscout_shots

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,match_id,period,eventSec,id,goal,team_id,team_name,player_id,firstName,middleName,...,distance_visible_angle,log_distance_to_goal,competition_gender,match_week,competition_name,competition_type,H_A_column,minute,players,players_rival
0,1694390,1.0,31.226217,88178649,False,4418.0,France,25437.0,Blaise,,...,4.144110,2.839078,male,1.0,UEFA Euro,Cup,Home Team,1.0,11.0,11.0
1,2500873,1.0,30.058262,216473192,False,19830.0,AS Monaco,256480.0,Keita,Baldé,...,6.994143,2.459589,male,19.0,Ligue 1,League,Home Team,1.0,11.0,11.0
2,2500867,1.0,49.168492,216452062,False,3799.0,Angers SCO,27646.0,Pierrick,,...,6.965581,3.230804,male,19.0,Ligue 1,League,Home Team,1.0,11.0,11.0
3,2565776,1.0,42.072257,227259240,True,679.0,Atlético Madrid,3682.0,Antoine,,...,6.165610,2.302585,male,23.0,La Liga,League,Away Team,1.0,11.0,11.0
4,2576055,1.0,51.552028,199672340,False,3164.0,UC Sampdoria,20479.0,Fabio,,...,7.256019,3.346389,male,10.0,Serie A,League,Away Team,1.0,11.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43370,2499820,2.0,3307.255167,203362731,False,1644.0,Watford FC,7929.0,Tom,,...,7.058080,2.442347,male,11.0,Premier League,League,Away Team,101.0,11.0,11.0
43371,2499739,2.0,3310.186437,182137485,False,1659.0,AFC Bournemouth,9277.0,Steve,,...,5.375789,2.933857,male,3.0,Premier League,League,Home Team,101.0,11.0,10.0
43372,2516750,2.0,3389.861176,182026562,False,2463.0,1. FC Köln,127685.0,Miloš,,...,6.415413,2.660260,male,2.0,Bundesliga,League,Home Team,102.0,11.0,10.0
43373,2499820,2.0,3446.055301,203362910,False,1623.0,Everton,145692.0,El-Hadji,Baye Oumar,...,4.813802,2.879198,male,11.0,Premier League,League,Home Team,103.0,11.0,11.0


Save dataset

In [50]:
# Write the shots frame into the Wyscout data directory as shots.parquet.
shots_path = os.path.join(WYSCOUT, 'shots.parquet')
df_wyscout_shots.to_parquet(shots_path)

Info on dataset

In [51]:
# Print a summary of the shots frame: index range, columns, non-null counts and dtypes.
df_wyscout_shots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43375 entries, 0 to 43374
Data columns (total 39 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   match_id                43375 non-null  int64  
 1   period                  43375 non-null  float64
 2   eventSec                43375 non-null  float64
 3   id                      43375 non-null  int64  
 4   goal                    43375 non-null  object 
 5   team_id                 43375 non-null  float64
 6   team_name               43375 non-null  object 
 7   player_id               43372 non-null  float64
 8   firstName               43372 non-null  object 
 9   middleName              43372 non-null  object 
 10  lastName                43372 non-null  object 
 11  Name                    43372 non-null  object 
 12  shot_type_name          43375 non-null  object 
 13  x                       43375 non-null  float64
 14  y                       43375 non-null