In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mplsoccer.pitch import Pitch

Data Folders

In [2]:
# Both providers' data live under ./data/<provider> relative to the directory
# the notebook is started from.
cwd = os.getcwd()
WYSCOUT, STATSBOMB = (
    os.path.join(cwd, 'data', provider) for provider in ('wyscout', 'statsbomb')
)

Load match, formation and substitution data

In [3]:
# Wyscout tables that are keyed by match_id and filtered further down.
df_wyscout_match, df_wyscout_formation, df_wyscout_sub = (
    pd.read_parquet(os.path.join(WYSCOUT, filename))
    for filename in ('match.parquet', 'formation.parquet', 'substitution.parquet')
)
# StatsBomb match table, used only to identify the games present in both sources.
df_statsbomb_match = pd.read_parquet(os.path.join(STATSBOMB, 'match.parquet'))


Overlaps

In [4]:
# Reduce the Wyscout kick-off timestamp to a calendar date so it can be joined
# against the StatsBomb match_date column.
df_wyscout_match['match_date'] = pd.to_datetime(df_wyscout_match['kick_off'].dt.date)

# A game is considered the same in both sources when teams, date and competition agree.
key_cols = ['home_team_name', 'away_team_name', 'match_date', 'competition_name']
match_sb = df_statsbomb_match[['match_id'] + key_cols]
match_wy = df_wyscout_match[['match_id'] + key_cols]

# Inner join keeps only the games present in both sources; the two match_id
# columns are disambiguated as match_id_wyscout / match_id_statsbomb.
overlap = pd.merge(
    match_wy,
    match_sb,
    how='inner',
    on=key_cols,
    suffixes=['_wyscout', '_statsbomb'],
)

There are 100 overlapping games: 64 World Cup games and 36 La Liga games from the 2017/18 season. Barcelona actually played 38 La Liga games in 2017/18, but Messi did not play in 2 of them.

In [5]:
# Number of overlapping games per competition.
overlap['competition_name'].value_counts()

FIFA World Cup    64
La Liga           36
Name: competition_name, dtype: int64


Remove overlapping games from Wyscout data

In [6]:
# Load the Wyscout event table.
event_path = os.path.join(WYSCOUT, 'event.parquet')
df_wyscout_event = pd.read_parquet(event_path)

In [7]:
# Flag the events that belong to a match found in both sources, then split the
# table into the overlapping rows and the remainder.
# Note: df_wyscout_event is rebound here, so this cell is not idempotent.
mask = df_wyscout_event['match_id'].isin(overlap['match_id_wyscout'])
df_wyscout_event_overlap = df_wyscout_event.loc[mask].copy()
df_wyscout_event = df_wyscout_event.loc[~mask].copy()
print('Number of events removed:', mask.sum())

Number of events removed: 163602


Remove overlapping matches from Wyscout data

In [8]:
# Flag the matches found in both sources, then split the match table into the
# overlapping rows and the remainder.
# Note: df_wyscout_match is rebound here, so this cell is not idempotent.
mask = df_wyscout_match['match_id'].isin(overlap['match_id_wyscout'])
df_wyscout_match_overlap = df_wyscout_match.loc[mask].copy()
df_wyscout_match = df_wyscout_match.loc[~mask].copy()
print('Number of matches removed:', mask.sum())

Number of matches removed: 100


Remove overlapping subs

In [9]:
# Flag the substitution rows that belong to a match found in both sources, then
# split the table into the overlapping rows and the remainder.
# Note: df_wyscout_sub is rebound here, so this cell is not idempotent.
mask = df_wyscout_sub['match_id'].isin(overlap['match_id_wyscout'])
df_wyscout_sub_overlap = df_wyscout_sub.loc[mask].copy()
df_wyscout_sub = df_wyscout_sub.loc[~mask].copy()
print('Number of rows removed:', mask.sum())

Number of rows removed: 591


Remove overlapping formation

In [10]:
# Flag the formation rows that belong to a match found in both sources, then
# split the table into the overlapping rows and the remainder.
# Note: df_wyscout_formation is rebound here, so this cell is not idempotent.
mask = df_wyscout_formation['match_id'].isin(overlap['match_id_wyscout'])
df_wyscout_formation_overlap = df_wyscout_formation.loc[mask].copy()
df_wyscout_formation = df_wyscout_formation.loc[~mask].copy()
print('Number of rows removed:', mask.sum())

Number of rows removed: 4181


Save files

In [11]:
# Persist the split tables. Each '<table>_overlap.parquet' receives the rows for
# matches found in both sources, and the original '<table>.parquet' is
# overwritten with the remaining rows.
#
# Because the source files are overwritten in place, a second run of the
# notebook reads a match table that no longer contains any overlapping games.
# Saving in that state would replace the previously written '*_overlap.parquet'
# files with empty frames, so nothing is written when `overlap` has no rows.
if overlap.empty:
    print('No overlapping matches found - files left unchanged.')
else:
    # Write every overlap file first: if a write fails part-way through, no
    # source file has been overwritten yet and the notebook can be re-run.
    df_wyscout_event_overlap.to_parquet(os.path.join(WYSCOUT, 'event_overlap.parquet'))
    df_wyscout_match_overlap.to_parquet(os.path.join(WYSCOUT, 'match_overlap.parquet'))
    df_wyscout_sub_overlap.to_parquet(os.path.join(WYSCOUT, 'substitution_overlap.parquet'))
    df_wyscout_formation_overlap.to_parquet(os.path.join(WYSCOUT, 'formation_overlap.parquet'))

    # Only now replace the source files with the de-duplicated tables.
    df_wyscout_event.to_parquet(os.path.join(WYSCOUT, 'event.parquet'))
    df_wyscout_match.to_parquet(os.path.join(WYSCOUT, 'match.parquet'))
    df_wyscout_sub.to_parquet(os.path.join(WYSCOUT, 'substitution.parquet'))
    df_wyscout_formation.to_parquet(os.path.join(WYSCOUT, 'formation.parquet'))