# Introduction

This notebook takes the StatsBomb open data and applies feature engineering by event type to summarise the events of each match.


# Configuration

In [1]:
# Number of matches per chunk; the original note says this size was chosen to avoid code crashes.
chunk_size = 250
# a: chunk selection, b: chunk file export (neither is referenced in the cells visible here).
a, b = 0, 1


In [None]:
!pip install statsbombpy


In [None]:
from statsbombpy import sb
import pandas as pd


The statsbombpy package interacts with the StatsBomb open-data repository automatically. For more details, see the repository on GitHub:


https://github.com/statsbomb/open-data

In [None]:
# Fetch the competition/season catalogue and preview the first five rows.
df_competitions = sb.competitions()
df_competitions.head(5)

Set the filters below to select the matches of interest.

In [None]:
# Derive a numeric season year from the last four characters of the season name.
df_competitions['season_year'] = df_competitions['season_name'].str.slice(-4).astype(int)

# Keep seasons after 2000 that are non-international, non-youth and male.
competition_mask = (
    df_competitions['season_year'].gt(2000)
    & df_competitions['competition_international'].eq(False)
    & df_competitions['competition_youth'].eq(False)
    & df_competitions['competition_gender'].eq('male')
)
df_competitions_filtered = df_competitions[competition_mask]
df_competitions_filtered

https://github.com/statsbomb/open-data/blob/master/data/competitions.json





In [None]:

import warnings

# Suppress UserWarnings related to mplsoccer
warnings.filterwarnings("ignore", category=UserWarning, module="mplsoccer")

# Download the match list of every selected competition/season.
# A failure for one competition/season is reported and skipped (best effort) so that
# the remaining ones are still retrieved.
all_matches = []
for competition_id, season_id in zip(
    df_competitions_filtered['competition_id'],
    df_competitions_filtered['season_id'],
):
    try:
        df_matches = sb.matches(competition_id=competition_id, season_id=season_id)
        all_matches.append(df_matches)
    except Exception as e:
        print(f"Could not retrieve matches for competition_id: {competition_id}, season_id: {season_id}. Error: {e}")

# Without any match data df_all_matches cannot be built; fail here with a clear message
# instead of letting the final display line raise a NameError.
if not all_matches:
    raise RuntimeError(
        "No matches data retrieved. Check the competition filters and the connection "
        "to the StatsBomb open-data repository."
    )

df_all_matches = pd.concat(all_matches, ignore_index=True)
print(df_all_matches.head())


df_all_matches

In [None]:
# Keep the competition, match id and season of every match for later joins.
lookup_columns = ['competition', 'match_id', 'season']
df_selected_cols = df_all_matches.loc[:, lookup_columns]
print(df_selected_cols.head())

In [None]:
# Alias only (no copy is made): df_wwc refers to the same DataFrame object as df_all_matches.
df_wwc = df_all_matches

In [None]:

# Per-match kick-off hour and match week; the raw kick_off value is dropped once the hour is extracted.
df_wwc_subset = (
    df_wwc[['match_id', 'kick_off', 'match_week']]
    .assign(kick_off_hour=lambda matches: pd.to_datetime(matches['kick_off']).dt.hour)
    .drop(columns=['kick_off'])
)
df_wwc_subset

In [None]:
# Distinct match ids, in order of first appearance.
matchids = pd.unique(df_wwc['match_id'])
print(matchids)

In [None]:

# Split the match ids into consecutive chunks of at most chunk_size ids to manage file size.
match_id_chunks = []
for chunk_start in range(0, len(matchids), chunk_size):
    match_id_chunks.append(matchids[chunk_start:chunk_start + chunk_size])

# Print each group number (1-based) followed by the match ids it contains.
for i, chunk in enumerate(match_id_chunks):
    print(f"Group {i+1}")
    print(chunk)

In [None]:

# NOTE(review): df_final_matches is not created in any cell visible here; it must already
# exist in the kernel before this cell runs.
# Attach the home team of each match to every event row.
# - Any existing 'home_team' column is dropped first so re-running this cell does not
#   produce 'home_team_x' / 'home_team_y' columns.
# - The lookup is de-duplicated on match_id so the left join cannot multiply event rows.
df_final_matches = pd.merge(
    df_final_matches.drop(columns=['home_team'], errors='ignore'),
    df_wwc[['match_id', 'home_team']].drop_duplicates(subset='match_id'),
    on='match_id',
    how='left',
)


In [None]:

# 1 when the team in possession is the home team, otherwise 0.
home_in_possession = df_final_matches['possession_team'].eq(df_final_matches['home_team'])
df_final_matches['home_team_flag'] = home_in_possession.astype(int)


In [None]:
def _split_xy(locations):
    """Split a Series of coordinate sequences into a two-column float DataFrame.

    Each entry that is a sequence with at least two items contributes its first two
    items; any other entry (for example a missing value) contributes (NaN, NaN).
    The result keeps the index of ``locations`` and always has exactly two columns,
    even when every entry is missing, which ``Series.apply(pd.Series)`` does not guarantee.
    """
    nan = float('nan')
    pairs = [
        (point[0], point[1])
        if hasattr(point, '__len__') and not isinstance(point, str) and len(point) >= 2
        else (nan, nan)
        for point in locations
    ]
    return pd.DataFrame(pairs, index=locations.index, columns=['x', 'y'], dtype=float)


# Expand the coordinate columns into separate numeric x / y columns.
df_final_matches[['x', 'y']] = _split_xy(df_final_matches['location'])
df_final_matches[['carry_end_x', 'carry_end_y']] = _split_xy(df_final_matches['carry_end_location'])
#df_final_matches[['goalkeeper_end_x','goalkeeper_end_y']] = df_final_matches['goalkeeper_end_location'].apply(pd.Series)
df_final_matches[['pass_end_x', 'pass_end_y']] = _split_xy(df_final_matches['pass_end_location'])

In [None]:

# Possession key of the form "<match_id>_<possession>".
match_part = df_final_matches['match_id'].astype(str)
possession_part = df_final_matches['possession'].astype(str)
df_final_matches['new_id'] = match_part + '_' + possession_part

# Row with the smallest event index within each possession key.
first_event_labels = df_final_matches.groupby('new_id')['index'].idxmin()
df_first_event_of_possession = df_final_matches.loc[first_event_labels]

In [None]:
# List the distinct event types present in the data.
print(pd.unique(df_final_matches['type']))

In [None]:
#
# One DataFrame copy per event type, keyed "df_<type>" with spaces replaced by
# underscores and the name lower-cased (e.g. "Ball Recovery" -> "df_ball_recovery").
new_dfs = {
    f"df_{event_type.replace(' ', '_').lower()}": df_final_matches[df_final_matches['type'] == event_type].copy()
    for event_type in df_final_matches['type'].unique()
}

print("List of new dataframes:")
for df_name in new_dfs:
    print(df_name)


**High Level**

In [None]:
# Match-level totals: summed event duration, summed xG, summed duration of events
# where the home team is in possession, and the highest possession number.
match_feature_spec = {
    'duration': ('duration', 'sum'),
    'xg': ('shot_statsbomb_xg', 'sum'),
    'home_team_duration': (
        'duration',
        lambda durations: durations[df_final_matches.loc[durations.index, 'home_team_flag'].eq(1)].sum(),
    ),
    'max_posession': ('possession', 'max'),
}
match_analysis = (
    df_final_matches
    .groupby('match_id')
    .agg(**match_feature_spec)
    .reset_index()
)

# Convert the home-team duration into a share of the total match duration.
match_analysis['home_team_duration'] = match_analysis['home_team_duration'].div(match_analysis['duration'])

match_analysis

**shots**

In [None]:
# Columns wanted for the shot analysis; any that are missing from the shot frame are skipped.
selected_columns = [
    'id', 'match_id', 'location', 'period', 'possession', 'possession_team',
    'under_pressure', 'home_team_flag', 'duration', 'counterpress', 'index',
    'x', 'y', 'play_pattern', 'shot_body_part', 'shot_first_time',
    'shot_outcome', 'shot_technique', 'shot_type', 'shot_statsbomb_xg',
]
df_shot = new_dfs['df_shot']
existing_columns = [name for name in selected_columns if name in df_shot.columns]

# Shot events restricted to the available selected columns.
df_final_matches_selected_shot = df_shot[existing_columns]
df_final_matches_selected_shot

In [None]:
# Pair each shot's (match_id, possession) key with the event indexes of that possession.
shot_possession_keys = df_final_matches_selected_shot[['match_id', 'possession']]
possession_event_indexes = df_final_matches[['match_id', 'possession', 'index']]
df_merged = shot_possession_keys.merge(
    possession_event_indexes,
    on=['match_id', 'possession'],
    how='inner',
)

# Smallest event index of every (match_id, possession) pair that contains a shot.
df_min_index = (
    df_merged
    .groupby(['match_id', 'possession'])['index']
    .min()
    .reset_index()
    .rename(columns={'index': 'min_index'})
)

# Attach that smallest index to every shot as 'min_index'.
df_final_matches_selected_shot = df_final_matches_selected_shot.merge(
    df_min_index,
    on=['match_id', 'possession'],
    how='left',
)

print(df_final_matches_selected_shot.head())


In [None]:
# Number of event indexes between the first event of the possession and the shot.
df_final_matches_selected_shot['sequence length'] = (
    df_final_matches_selected_shot['index'].sub(df_final_matches_selected_shot['min_index'])
)

# Start with every shot unflagged.
df_final_matches_selected_shot['first_shot_flag'] = 0

# Within each (match_id, possession) group, flag the shot with the smallest sequence length.
grouped = df_final_matches_selected_shot.groupby(['match_id', 'possession'])
first_shot_labels = grouped['sequence length'].idxmin()
df_final_matches_selected_shot.loc[first_shot_labels, 'first_shot_flag'] = 1

df_final_matches_selected_shot

In [None]:

# Ensure df_final_matches_selected_shot is sorted by match_id, possession, and index
df_final_matches_selected_shot = df_final_matches_selected_shot.sort_values(by=['match_id', 'possession', 'index'])

# Build, once, the sorted event indexes of all passes in each match. Counting the passes
# in an index range is then two binary searches per shot instead of a scan of the whole
# event table for every shot.
pass_events = df_final_matches.loc[df_final_matches['type'] == 'Pass', ['match_id', 'index']]
pass_index_by_match = {
    match_id: pass_indexes.sort_values().to_numpy()
    for match_id, pass_indexes in pass_events.groupby('match_id')['index']
}

# For every shot, count the passes of the same match whose event index lies in
# [min_index, shot index] (both ends inclusive).
pass_counts = []
for match_id, min_index, shot_index in zip(
    df_final_matches_selected_shot['match_id'],
    df_final_matches_selected_shot['min_index'],
    df_final_matches_selected_shot['index'],
):
    match_pass_indexes = pass_index_by_match.get(match_id)
    if match_pass_indexes is None:
        # No pass recorded for this match at all.
        pass_counts.append(0)
        continue
    range_start = match_pass_indexes.searchsorted(min_index, side='left')
    range_end = match_pass_indexes.searchsorted(shot_index, side='right')
    # max(..., 0) keeps an empty or undefined (NaN) range at zero passes.
    pass_counts.append(max(int(range_end - range_start), 0))

# Add the pass counts as a new column to df_final_matches_selected_shot
df_final_matches_selected_shot['passes_in_sequence'] = pass_counts

print(df_final_matches_selected_shot[['match_id', 'index', 'min_index', 'passes_in_sequence']].head())
print("\nFirst few rows of df_final_matches_selected_shot with 'passes_in_sequence' column:")
print(df_final_matches_selected_shot.head())

In [None]:

# Attach the competition and season of each match to the shot rows.
competition_lookup = df_selected_cols[['match_id', 'competition', 'season']]
df_final_matches_selected_shot = df_final_matches_selected_shot.merge(
    competition_lookup,
    on='match_id',
    how='left',
)
print(df_final_matches_selected_shot.head())

In [None]:

# Per-match shot features. Proportions are means of boolean conditions over all shots of the match.
shots = df_final_matches_selected_shot
shot_feature_spec = {
    'shot_count': ('id', 'count'),
    'shot_proportion_period_1': ('period', lambda period: period.eq(1).mean()),
    'shot_proportion_home_team': ('home_team_flag', 'mean'),
    # Missing under_pressure values are treated as False
    'shot_proportion_under_pressure': ('under_pressure', lambda flag: flag.fillna(False).mean()),
    'shot_duration_mean': ('duration', 'mean'),
    'shot_start_mean': ('x', 'mean'),
    # Share of shots whose x location is below 60
    'shot_start_proportion_x_lt_60': ('x', lambda xs: xs.lt(60).mean()),
    # Share of shots whose y location is between 20 and 60 (inclusive)
    'shot_start_y_between_20_60_proportion': ('y', lambda ys: ys.between(20, 60).mean()),
    # Mean x location of shots taken while home_team_flag is 1
    'shot_start_home': ('x', lambda xs: xs[shots.loc[xs.index, 'home_team_flag'].eq(1)].mean()),
    'shot_left_foot_proportion': ('shot_body_part', lambda part: part.eq('Left Foot').mean()),
    'shot_right_foot_proportion': ('shot_body_part', lambda part: part.eq('Right Foot').mean()),
    'shot_other_proportion': ('shot_body_part', lambda part: (~part.isin(['Left Foot', 'Right Foot'])).mean()),
    'shot_first_shot_proportion': ('first_shot_flag', 'mean'),
    'shot_passes_in_sequence_mean': ('passes_in_sequence', 'mean'),
    'shot_sequence_length_mean': ('sequence length', 'mean'),
    'shot_type_proportion_open_play': ('shot_type', lambda kind: kind.eq('Open Play').mean()),
    'shot_play_pattern_proportion_regular_play': ('play_pattern', lambda pattern: pattern.eq('Regular Play').mean()),
    'shot_play_pattern_proportion_free_kick': ('play_pattern', lambda pattern: pattern.eq('From Free Kick').mean()),
    'shot_play_pattern_proportion_throw_in': ('play_pattern', lambda pattern: pattern.eq('From Throw In').mean()),
    'shot_play_pattern_proportion_corner': ('play_pattern', lambda pattern: pattern.eq('From Corner').mean()),
    'shot_play_pattern_proportion_counter': ('play_pattern', lambda pattern: pattern.eq('From Counter').mean()),
    'shot_technique_normal': ('shot_technique', lambda technique: technique.eq('Normal').mean()),
    'shot_first_time_proportion': ('shot_first_time', lambda flag: flag.fillna(False).mean()),
    # On target = outcome 'Goal' or 'Saved'; off target = any outcome other than Goal/Blocked/Saved
    'shot_on_target': ('shot_outcome', lambda outcome: outcome.isin(['Goal', 'Saved']).mean()),
    'shot_blocked': ('shot_outcome', lambda outcome: outcome.eq('Blocked').mean()),
    'shot_off_target': ('shot_outcome', lambda outcome: (~outcome.isin(['Goal', 'Blocked', 'Saved'])).mean()),
}
match_analysis_shot = (
    shots
    .groupby('match_id')
    .agg(**shot_feature_spec)
    .reset_index()
)

match_analysis_shot

**Passes**

In [None]:
# Columns wanted for the pass analysis; any that are missing from the pass frame are skipped.
selected_columns = [

    'id',
    'match_id',
    'location',
    'period',
    'possession',
    'possession_team',
    'under_pressure',
    'home_team',
    'home_team_flag',
    'duration',
    'counterpress',
    'x',
    'y',
    'pass_end_x',
    'pass_end_y',
    'pass_body_part',
    'pass_height',
    'pass_length'
]
# Check if the columns exist in the pass frame
df_pass = new_dfs['df_pass']

existing_columns = [col for col in selected_columns if col in df_pass.columns]

# Take an explicit copy: new columns are added below, and writing into a column
# selection of df_pass would otherwise raise SettingWithCopyWarning.
df_final_matches_selected_pass = df_pass[existing_columns].copy()

# x_dist: signed change in x from pass start to pass end (negative when the end x is smaller).
# y_dist: absolute change in y.
df_final_matches_selected_pass['x_dist'] = df_final_matches_selected_pass['pass_end_x'] - df_final_matches_selected_pass['x']
df_final_matches_selected_pass['y_dist'] = (df_final_matches_selected_pass['y'] - df_final_matches_selected_pass['pass_end_y']).abs()

df_final_matches_selected_pass

In [None]:

# Flag each pass as short (< 10), medium (>= 10 and < 30) or long (>= 30) by pass_length.
# A missing pass_length gives 0 for all three flags.
# assign() builds a new frame, so this cell does not write into a slice of another
# DataFrame (no SettingWithCopyWarning) and can be re-run safely.
pass_length = df_final_matches_selected_pass['pass_length']
df_final_matches_selected_pass = df_final_matches_selected_pass.assign(**{
    'short pass': (pass_length < 10).astype(int),
    'medium pass': ((pass_length >= 10) & (pass_length < 30)).astype(int),
    'long pass': (pass_length >= 30).astype(int),
})



In [None]:

# Per-match pass features. Proportions are means of boolean conditions over all passes of the match.
passes = df_final_matches_selected_pass
pass_feature_spec = {
    'pass_count': ('id', 'count'),
    'pass_proportion_period_1': ('period', lambda period: period.eq(1).mean()),
    'pass_proportion_home_team': ('home_team_flag', 'mean'),
    # Missing under_pressure / counterpress values are treated as False
    'pass_proportion_under_pressure': ('under_pressure', lambda flag: flag.fillna(False).mean()),
    'pass_duration_mean': ('duration', 'mean'),
    'pass_counterpress_rate': ('counterpress', lambda flag: flag.fillna(False).mean()),
    'pass_start_mean': ('x', 'mean'),
    # Share of passes starting at x below 60
    'pass_start_proportion_x_lt_60': ('x', lambda xs: xs.lt(60).mean()),
    # Share of passes starting at y between 20 and 60 (inclusive)
    'pass_start_y_between_20_60_proportion': ('y', lambda ys: ys.between(20, 60).mean()),
    # Mean start x of passes made while home_team_flag is 1
    'pass_start_home': ('x', lambda xs: xs[passes.loc[xs.index, 'home_team_flag'].eq(1)].mean()),
    # Share of passes with negative x_dist
    'pass_proportion_x_dist_lt_0': ('x_dist', lambda dist: dist.lt(0).mean()),
    # Mean x_dist over passes with negative x_dist
    'pass_x_dist_lt_0': ('x_dist', lambda dist: dist[dist.lt(0)].mean()),
    # Mean x_dist over passes with positive x_dist
    'pass_average_x_dist_gt_0': ('x_dist', lambda dist: dist[dist.gt(0)].mean()),
    'pass_average_y_dist': ('y_dist', 'mean'),
    # Same two means, restricted to passes made while home_team_flag is 1
    'pass_average_x_dist_lt_0_home': (
        'x_dist',
        lambda dist: dist[passes.loc[dist.index, 'home_team_flag'].eq(1) & dist.lt(0)].mean(),
    ),
    'pass_average_x_dist_gt_0_home': (
        'x_dist',
        lambda dist: dist[passes.loc[dist.index, 'home_team_flag'].eq(1) & dist.gt(0)].mean(),
    ),
    'pass_left_foot_proportion': ('pass_body_part', lambda part: part.eq('Left Foot').mean()),
    'pass_right_foot_proportion': ('pass_body_part', lambda part: part.eq('Right Foot').mean()),
    'pass_other_proportion': ('pass_body_part', lambda part: (~part.isin(['Left Foot', 'Right Foot'])).mean()),
    'pass_pass_height_proportion_ground': ('pass_height', lambda height: height.eq('Ground Pass').mean()),
    'pass_pass_height_proportion_low': ('pass_height', lambda height: height.eq('Low Pass').mean()),
    'pass_pass_height_proportion_high': ('pass_height', lambda height: height.eq('High Pass').mean()),
    'pass_short_pass_proportion': ('short pass', 'mean'),
    'pass_medium_pass_proportion': ('medium pass', 'mean'),
    'pass_long_pass_proportion': ('long pass', 'mean'),
}
match_analysis_pass = (
    passes
    .groupby('match_id')
    .agg(**pass_feature_spec)
    .reset_index()
)

match_analysis_pass

**Carries** - reviewed

In [None]:
# Columns wanted for the carry analysis; any that are missing from the carry frame are skipped.
selected_columns = [

    'id',
    'match_id',
    'location',
    'period',
    'possession',
    'possession_team',
    'under_pressure',
    'home_team',
    'home_team_flag',
    'duration',
    'counterpress',
    'x',
    'y',
    'carry_end_x',
    'carry_end_y'
]
# Check if the columns exist in the carry frame
df_carry = new_dfs['df_carry']

existing_columns = [col for col in selected_columns if col in df_carry.columns]

# Take an explicit copy: new columns are added below, and writing into a column
# selection of df_carry would otherwise raise SettingWithCopyWarning.
df_final_matches_selected_carry = df_carry[existing_columns].copy()

# x_dist: signed change in x from carry start to carry end (negative when the end x is smaller).
# y_dist: absolute change in y.
df_final_matches_selected_carry['x_dist'] = df_final_matches_selected_carry['carry_end_x'] - df_final_matches_selected_carry['x']
df_final_matches_selected_carry['y_dist'] = (df_final_matches_selected_carry['y'] - df_final_matches_selected_carry['carry_end_y']).abs()

df_final_matches_selected_carry

In [None]:

# Per-match carry features. Proportions are means of boolean conditions over all carries of the match.
carries = df_final_matches_selected_carry
carry_feature_spec = {
    'carry_count': ('id', 'count'),
    'carry_proportion_period_1': ('period', lambda period: period.eq(1).mean()),
    'carry_proportion_home_team': ('home_team_flag', 'mean'),
    # Missing under_pressure / counterpress values are treated as False
    'carry_proportion_under_pressure': ('under_pressure', lambda flag: flag.fillna(False).mean()),
    'carry_duration_mean': ('duration', 'mean'),
    'carry_counterpress_rate': ('counterpress', lambda flag: flag.fillna(False).mean()),
    'carry_start_mean': ('x', 'mean'),
    # Share of carries starting at x below 60
    'carry_start_proportion_x_lt_60': ('x', lambda xs: xs.lt(60).mean()),
    # Share of carries starting at y between 20 and 60 (inclusive)
    'carry_start_y_between_20_60_proportion': ('y', lambda ys: ys.between(20, 60).mean()),
    # Mean start x of carries made while home_team_flag is 1
    'carry_start_home': ('x', lambda xs: xs[carries.loc[xs.index, 'home_team_flag'].eq(1)].mean()),
    # Share of carries with negative x_dist
    'carry_proportion_x_dist_lt_0': ('x_dist', lambda dist: dist.lt(0).mean()),
    # Mean x_dist over carries with negative x_dist
    'carry_x_dist_lt_0': ('x_dist', lambda dist: dist[dist.lt(0)].mean()),
    # Mean x_dist over carries with positive x_dist
    'carry_average_x_dist_gt_0': ('x_dist', lambda dist: dist[dist.gt(0)].mean()),
    'carry_average_y_dist': ('y_dist', 'mean'),
    # Same two means, restricted to carries made while home_team_flag is 1
    'carry_average_x_dist_lt_0_home': (
        'x_dist',
        lambda dist: dist[carries.loc[dist.index, 'home_team_flag'].eq(1) & dist.lt(0)].mean(),
    ),
    'carry_average_x_dist_gt_0_home': (
        'x_dist',
        lambda dist: dist[carries.loc[dist.index, 'home_team_flag'].eq(1) & dist.gt(0)].mean(),
    ),
}
match_analysis_carry = (
    carries
    .groupby('match_id')
    .agg(**carry_feature_spec)
    .reset_index()
)

match_analysis_carry

**Pressure** - reviewed

In [None]:
# Columns wanted for the pressure analysis; any that are missing from the pressure frame are skipped.
selected_columns = [
    'id', 'match_id', 'location', 'period', 'possession', 'possession_team',
    'under_pressure', 'home_team_flag', 'duration', 'counterpress', 'x', 'y',
]
df_pressure = new_dfs['df_pressure']
existing_columns = [name for name in selected_columns if name in df_pressure.columns]

# Pressure events restricted to the available selected columns.
df_final_matches_selected_pressure = df_pressure[existing_columns]
df_final_matches_selected_pressure

In [None]:
# Per-match pressure features. Proportions are means of boolean conditions over all pressure events.
pressure_feature_spec = {
    'pressure_count': ('id', 'count'),
    'pressure_proportion_period_1': ('period', lambda period: period.eq(1).mean()),
    'pressure_proportion_home_team': ('home_team_flag', 'mean'),
    'pressure_duration_mean': ('duration', 'mean'),
    # Missing counterpress values are treated as False
    'pressure_counterpress_rate': ('counterpress', lambda flag: flag.fillna(False).mean()),
    'pressure_start_mean': ('x', 'mean'),
    # Share of pressure events at x below 60
    'pressure_start_proportion_x_lt_60': ('x', lambda xs: xs.lt(60).mean()),
    # Share of pressure events at y between 20 and 60 (inclusive)
    'pressure_start_y_between_20_60_proportion': ('y', lambda ys: ys.between(20, 60).mean()),
}
match_analysis_pressure = (
    df_final_matches_selected_pressure
    .groupby('match_id')
    .agg(**pressure_feature_spec)
    .reset_index()
)

match_analysis_pressure

**miscontrol** - reviewed

In [None]:
# Columns wanted for the miscontrol analysis; any that are missing from the miscontrol frame are skipped.
selected_columns = [
    'id', 'match_id', 'location', 'period', 'possession', 'possession_team',
    'under_pressure', 'home_team', 'home_team_flag', 'duration', 'counterpress',
    'x', 'y',
]
df_miscontrol = new_dfs['df_miscontrol']
existing_columns = [name for name in selected_columns if name in df_miscontrol.columns]

# Miscontrol events restricted to the available selected columns.
df_final_matches_selected_miscontrol = df_miscontrol[existing_columns]
df_final_matches_selected_miscontrol

In [None]:

# Per-match miscontrol features. Proportions are means of boolean conditions over all miscontrol events.
miscontrol_feature_spec = {
    'miscontrol_count': ('id', 'count'),
    'miscontrol_proportion_period_1': ('period', lambda period: period.eq(1).mean()),
    'miscontrol_proportion_home_team': ('home_team_flag', 'mean'),
    # Missing under_pressure values are treated as False
    'miscontrol_proportion_under_pressure': ('under_pressure', lambda flag: flag.fillna(False).mean()),
    'miscontrol_start_mean': ('x', 'mean'),
    # Share of miscontrols at x below 60
    'miscontrol_start_proportion_x_lt_60': ('x', lambda xs: xs.lt(60).mean()),
    # Share of miscontrols at y between 20 and 60 (inclusive)
    'miscontrol_start_y_between_20_60_proportion': ('y', lambda ys: ys.between(20, 60).mean()),
}
match_analysis_miscontrol = (
    df_final_matches_selected_miscontrol
    .groupby('match_id')
    .agg(**miscontrol_feature_spec)
    .reset_index()
)

match_analysis_miscontrol

**Dribble** - reviewed

In [None]:
# Columns wanted for the dribble analysis; any that are missing from the dribble frame are skipped.
selected_columns = [
    'id', 'match_id', 'location', 'period', 'possession', 'possession_team',
    'under_pressure', 'home_team', 'home_team_flag', 'duration', 'dribble_outcome',
    'counterpress', 'x', 'y',
]
df_dribble = new_dfs['df_dribble']
existing_columns = [name for name in selected_columns if name in df_dribble.columns]

# Dribble events restricted to the available selected columns.
df_final_matches_selected_dribble = df_dribble[existing_columns]
df_final_matches_selected_dribble

In [None]:
# Per-match dribble features. Proportions are means of boolean conditions over all dribbles of the match.
dribble_feature_spec = {
    'dribble_count': ('id', 'count'),
    'dribble_proportion_period_1': ('period', lambda period: period.eq(1).mean()),
    'dribble_proportion_home_team': ('home_team_flag', 'mean'),
    # Share of dribbles whose outcome is 'Complete'
    'dribble_success_rate': ('dribble_outcome', lambda outcome: outcome.eq('Complete').mean()),
    'dribble_start_mean': ('x', 'mean'),
    # Share of dribbles at x below 60
    'dribble_start_proportion_x_lt_60': ('x', lambda xs: xs.lt(60).mean()),
    # Share of dribbles at y between 20 and 60 (inclusive)
    'dribble_start_y_between_20_60_proportion': ('y', lambda ys: ys.between(20, 60).mean()),
}
match_analysis_dribble = (
    df_final_matches_selected_dribble
    .groupby('match_id')
    .agg(**dribble_feature_spec)
    .reset_index()
)

match_analysis_dribble

**df_duel** - reviewed

In [None]:
# Columns wanted for the duel analysis; any that are missing from the duel frame are skipped.
selected_columns = [
    'id', 'match_id', 'location', 'period', 'possession', 'possession_team',
    'under_pressure', 'home_team', 'home_team_flag', 'duration', 'counterpress',
    'x', 'y', 'duel_type',
]
df_duel = new_dfs['df_duel']
existing_columns = [name for name in selected_columns if name in df_duel.columns]

# Duel events restricted to the available selected columns.
df_final_matches_selected_duel = df_duel[existing_columns]
df_final_matches_selected_duel

In [None]:
# Per-match duel features. Proportions are means of boolean conditions over all duels of the match.
duel_feature_spec = {
    'duel_count': ('id', 'count'),
    'duel_proportion_period_1': ('period', lambda period: period.eq(1).mean()),
    # NOTE(review): named "away_team" but computed as the mean of home_team_flag
    # (share of duels while the home team is in possession) - confirm this is intended.
    'duel_proportion_away_team': ('home_team_flag', 'mean'),
    # Missing counterpress values are treated as False
    'duel_counterpress_rate': ('counterpress', lambda flag: flag.fillna(False).mean()),
    'duel_start_mean': ('x', 'mean'),
    # Share of duels whose duel_type is 'Tackle'
    'duel_proportion_Tackle': ('duel_type', lambda kind: kind.eq('Tackle').mean()),
    # Share of duels at x below 60
    'duel_start_proportion_x_lt_60': ('x', lambda xs: xs.lt(60).mean()),
    # Share of duels at y between 20 and 60 (inclusive)
    'duel_start_y_between_20_60_proportion': ('y', lambda ys: ys.between(20, 60).mean()),
}
match_analysis_duel = (
    df_final_matches_selected_duel
    .groupby('match_id')
    .agg(**duel_feature_spec)
    .reset_index()
)

match_analysis_duel

**df_interception** - reviewed



In [None]:
# Columns wanted for the interception analysis; any that are missing from the interception frame are skipped.
selected_columns = [
    'id', 'match_id', 'location', 'period', 'possession', 'possession_team',
    'under_pressure', 'home_team', 'home_team_flag', 'duration', 'counterpress',
    'x', 'y',
]
df_interception = new_dfs['df_interception']
existing_columns = [name for name in selected_columns if name in df_interception.columns]

# Interception events restricted to the available selected columns.
df_final_matches_selected_interception = df_interception[existing_columns]
df_final_matches_selected_interception

In [None]:
# Per-match interception features. Proportions are means of boolean conditions over all interceptions.
interception_feature_spec = {
    'interception_count': ('id', 'count'),
    'interception_proportion_period_1': ('period', lambda period: period.eq(1).mean()),
    'interception_proportion_home_team': ('home_team_flag', 'mean'),
    # Missing under_pressure / counterpress values are treated as False
    'interception_proportion_under_pressure': ('under_pressure', lambda flag: flag.fillna(False).mean()),
    'interception_counterpress_rate': ('counterpress', lambda flag: flag.fillna(False).mean()),
    'interception_start_mean': ('x', 'mean'),
    # Share of interceptions at x below 60
    'interception_start_proportion_x_lt_60': ('x', lambda xs: xs.lt(60).mean()),
    # Share of interceptions at y between 20 and 60 (inclusive)
    'interception_start_y_between_20_60_proportion': ('y', lambda ys: ys.between(20, 60).mean()),
}
match_analysis_interception = (
    df_final_matches_selected_interception
    .groupby('match_id')
    .agg(**interception_feature_spec)
    .reset_index()
)

match_analysis_interception

**df_ball_recovery** - reviewed

In [None]:
# Columns wanted for the ball recovery analysis; any that are missing from the ball recovery frame are skipped.
selected_columns = [
    'id', 'match_id', 'location', 'period', 'possession', 'possession_team',
    'under_pressure', 'home_team', 'home_team_flag', 'duration', 'counterpress',
    'x', 'y',
]
df_ball_recovery = new_dfs['df_ball_recovery']
existing_columns = [name for name in selected_columns if name in df_ball_recovery.columns]

# Ball recovery events restricted to the available selected columns.
df_final_matches_selected_ball_recovery = df_ball_recovery[existing_columns]
df_final_matches_selected_ball_recovery

In [None]:
# Per-match ball recovery features. Proportions are means of boolean conditions over
# all ball recoveries of the match.
# The three location features were previously named dribble_start_* (copied from the
# dribble cell), which collides with the dribble feature columns; they now carry the
# ball_recovery_ prefix like the other columns of this frame.
match_analysis_ball_recovery = df_final_matches_selected_ball_recovery.groupby('match_id').agg(
    ball_recovery_count=('id', 'count'),
    ball_recovery_proportion_period_1=('period', lambda x: (x == 1).mean()),
    ball_recovery_proportion_home_team=('home_team_flag', 'mean'),
    ball_recovery_proportion_under_pressure=('under_pressure', lambda x: x.fillna(False).mean()), # Missing values are treated as False
    ball_recovery_start_mean=('x', 'mean'),
    ball_recovery_start_proportion_x_lt_60=('x', lambda x: (x < 60).mean()), # Share of ball recoveries at x below 60
    ball_recovery_start_y_between_20_60_proportion=('y', lambda y: ((y >= 20) & (y <= 60)).mean()) # Share of ball recoveries at y between 20 and 60 (inclusive)
).reset_index()

match_analysis_ball_recovery

**df_dispossessed** - reviewed

In [None]:
# Columns wanted for the dispossessed analysis; any that are missing from the dispossessed frame are skipped.
selected_columns = [
    'id', 'match_id', 'location', 'period', 'possession', 'possession_team',
    'under_pressure', 'home_team', 'home_team_flag', 'duration', 'counterpress',
    'x', 'y',
]
df_dispossessed = new_dfs['df_dispossessed']
existing_columns = [name for name in selected_columns if name in df_dispossessed.columns]

# Dispossessed events restricted to the available selected columns.
df_final_matches_selected_dispossessed = df_dispossessed[existing_columns]
df_final_matches_selected_dispossessed

In [None]:
# Per-match dispossessed features. Proportions are means of boolean conditions over all dispossessed events.
dispossessed_feature_spec = {
    'dispossessed_count': ('id', 'count'),
    'dispossessed_proportion_period_1': ('period', lambda period: period.eq(1).mean()),
    'dispossessed_proportion_home_team': ('home_team_flag', 'mean'),
    'dispossessed_start_mean': ('x', 'mean'),
    # Share of dispossessed events at x below 60
    'dispossessed_start_proportion_x_lt_60': ('x', lambda xs: xs.lt(60).mean()),
    # Share of dispossessed events at y between 20 and 60 (inclusive)
    'dispossessed_start_y_between_20_60_proportion': ('y', lambda ys: ys.between(20, 60).mean()),
}
match_analysis_dispossessed = (
    df_final_matches_selected_dispossessed
    .groupby('match_id')
    .agg(**dispossessed_feature_spec)
    .reset_index()
)

match_analysis_dispossessed

**Foul Committed**

In [None]:
# Columns wanted for the foul committed analysis; any that are missing from the foul committed frame are skipped.
selected_columns = [
    'id', 'match_id', 'location', 'period', 'possession', 'possession_team',
    'under_pressure', 'home_team', 'home_team_flag', 'foul_committed_card',
]
df_foul_committed = new_dfs['df_foul_committed']
existing_columns = [name for name in selected_columns if name in df_foul_committed.columns]

# Foul committed events restricted to the available selected columns.
df_final_matches_selected_foul_committed = df_foul_committed[existing_columns]
df_final_matches_selected_foul_committed

In [None]:
# Per-match card counts taken from the foul_committed_card column.
fouls = df_final_matches_selected_foul_committed
foul_committed_feature_spec = {
    'foul_committed_yellow_card_count': ('foul_committed_card', lambda card: card.eq('Yellow Card').sum()),
    # 'Red Card' and 'Second Yellow' are both counted as red cards
    'foul_committed_red_card_count': (
        'foul_committed_card',
        lambda card: card.isin(['Red Card', 'Second Yellow']).sum(),
    ),
    # Any card (yellow, red or second yellow) given for a foul in period 1
    'foul_committed_cards_period_1_count': (
        'foul_committed_card',
        lambda card: card[fouls.loc[card.index, 'period'].eq(1)]
        .isin(['Yellow Card', 'Red Card', 'Second Yellow'])
        .sum(),
    ),
}
match_analysis_foul_committed = (
    fouls
    .groupby('match_id')
    .agg(**foul_committed_feature_spec)
    .reset_index()
)

match_analysis_foul_committed

**df_foul_won** - on pause

In [None]:
# Columns wanted for the foul won analysis; any that are missing from the foul won frame are skipped.
selected_columns = [
    'id', 'match_id', 'location', 'period', 'possession', 'possession_team',
    'under_pressure', 'home_team', 'home_team_flag', 'duration', 'counterpress',
    'x', 'y', 'foul_won_defensive', 'foul_won_advantage',
]
df_foul_won = new_dfs['df_foul_won']
existing_columns = [name for name in selected_columns if name in df_foul_won.columns]

# Foul won events restricted to the available selected columns.
df_final_matches_selected_foul_won = df_foul_won[existing_columns]
df_final_matches_selected_foul_won

In [None]:
# Per-match foul won features. Proportions are means of boolean conditions over all fouls won.
foul_won_feature_spec = {
    'foul_won_count': ('id', 'count'),
    'foul_won_proportion_period_1': ('period', lambda period: period.eq(1).mean()),
    'foul_won_proportion_home_team': ('home_team_flag', 'mean'),
    'foul_won_start_mean': ('x', 'mean'),
    # Share of fouls won at x below 60
    'foul_won_start_proportion_x_lt_60': ('x', lambda xs: xs.lt(60).mean()),
    # Share of fouls won at y between 20 and 60 (inclusive)
    'foul_won_start_y_between_20_60_proportion': ('y', lambda ys: ys.between(20, 60).mean()),
    # Missing foul_won_defensive / foul_won_advantage values are treated as False
    'foul_won_defensive_proportion': ('foul_won_defensive', lambda flag: flag.fillna(False).mean()),
    'foul_won_advantage_proportion': ('foul_won_advantage', lambda flag: flag.fillna(False).mean()),
}
match_analysis_foul_won = (
    df_final_matches_selected_foul_won
    .groupby('match_id')
    .agg(**foul_won_feature_spec)
    .reset_index()
)

match_analysis_foul_won

**df_error** - not to use

In [None]:
# Columns wanted for the error analysis; any that are missing from the error frame are skipped.
selected_columns = [
    'id', 'match_id', 'location', 'period', 'possession', 'possession_team',
    'under_pressure', 'home_team', 'home_team_flag', 'duration', 'counterpress',
    'x', 'y',
]
df_error = new_dfs['df_error']
existing_columns = [name for name in selected_columns if name in df_error.columns]

# Error events restricted to the available selected columns.
df_final_matches_selected_error = df_error[existing_columns]
df_final_matches_selected_error

In [None]:
# Per-match error features. Proportions are means of boolean conditions over all
# error events of the match.
# The three location features were previously named dribble_start_* (copied from the
# dribble cell), which collides with the dribble feature columns; they now carry the
# error_ prefix like the other columns of this frame.
match_analysis_error = df_final_matches_selected_error.groupby('match_id').agg(
    error_count=('id', 'count'),
    error_proportion_period_1=('period', lambda x: (x == 1).mean()),
    error_proportion_home_team=('home_team_flag', 'mean'),
    error_proportion_under_pressure=('under_pressure', lambda x: x.fillna(False).mean()), # Missing values are treated as False
    error_duration_mean=('duration', 'mean'),
    error_counterpress_rate=('counterpress', lambda x: x.fillna(False).mean()), # Missing values are treated as False
    error_start_mean=('x', 'mean'),
    error_start_proportion_x_lt_60=('x', lambda x: (x < 60).mean()), # Share of errors at x below 60
    error_start_y_between_20_60_proportion=('y', lambda y: ((y >= 20) & (y <= 60)).mean()) # Share of errors at y between 20 and 60 (inclusive)
).reset_index()

match_analysis_error

**df_injury_stoppage** - reviewed

In [None]:
# Columns wanted for the injury stoppage analysis; any that are missing from the injury stoppage frame are skipped.
selected_columns = [
    'id', 'match_id', 'location', 'period', 'possession', 'possession_team',
    'under_pressure', 'home_team', 'home_team_flag', 'duration', 'counterpress',
    'x', 'y',
]
df_injury_stoppage = new_dfs['df_injury_stoppage']
existing_columns = [name for name in selected_columns if name in df_injury_stoppage.columns]

# Injury stoppage events restricted to the available selected columns.
df_final_matches_selected_injury_stoppage = df_injury_stoppage[existing_columns]
df_final_matches_selected_injury_stoppage

In [None]:
# Per-match summary of injury stoppage events: output column -> (input column, aggregation).
injury_stoppage_features = {
    'injury_stoppage_count': ('id', 'count'),
    # Share of events with period == 1.
    'injury_stoppage_proportion_period_1': ('period', lambda period: (period == 1).mean()),
    # Mean of home_team_flag.
    'injury_stoppage_proportion_home_team': ('home_team_flag', 'mean'),
}

match_analysis_injury_stoppage = (
    df_final_matches_selected_injury_stoppage
    .groupby('match_id')
    .agg(**injury_stoppage_features)
    .reset_index()
)

match_analysis_injury_stoppage

**df_referee_ball-drop** - reviewed

In [None]:
# Fields requested for referee ball-drop events.
selected_columns = [
    'id', 'match_id', 'location', 'period',
    'possession', 'possession_team', 'under_pressure',
    'home_team', 'home_team_flag', 'duration',
    'counterpress', 'x', 'y',
]

# Referee ball-drop events are stored in new_dfs under the key 'df_referee_ball-drop'.
df_drop_ball = new_dfs['df_referee_ball-drop']

# Keep only the requested fields that this dataframe actually contains,
# in the order they are listed in selected_columns.
existing_columns = [name for name in selected_columns if name in df_drop_ball.columns]

# Restrict the ball-drop events to those fields and display the result.
df_final_matches_selected_drop_ball = df_drop_ball.loc[:, existing_columns]
df_final_matches_selected_drop_ball

In [None]:
# Per-match number of referee ball-drop events (count of non-null event ids).
match_analysis_drop_ball = (
    df_final_matches_selected_drop_ball
    .groupby('match_id')
    .agg(drop_ball_count=pd.NamedAgg(column='id', aggfunc='count'))
    .reset_index()
)

match_analysis_drop_ball

**df_clearance** - reviewed

In [None]:
# Fields requested for clearance events, including the clearance-specific flags.
selected_columns = [
    'id', 'match_id', 'location', 'period',
    'possession', 'possession_team', 'under_pressure',
    'home_team', 'home_team_flag', 'duration',
    'counterpress', 'x', 'y',
    'clearance_head', 'clearance_left_foot', 'clearance_right_foot',
]

# Clearance events are stored in new_dfs under the key 'df_clearance'.
df_clearance = new_dfs['df_clearance']

# Keep only the requested fields that this dataframe actually contains,
# in the order they are listed in selected_columns.
existing_columns = [name for name in selected_columns if name in df_clearance.columns]

# Restrict the clearance events to those fields and display the result.
df_final_matches_selected_clearance = df_clearance.loc[:, existing_columns]
df_final_matches_selected_clearance

In [None]:
# Per-match summary of clearance events: output column -> (input column, aggregation).
clearance_features = {
    'clearance_count': ('id', 'count'),
    # Share of events with period == 1.
    'clearance_proportion_period_1': ('period', lambda period: (period == 1).mean()),
    'clearance_proportion_home_team': ('home_team_flag', 'mean'),
    # Location features of the clearance events.
    'clearance_start_mean': ('x', 'mean'),
    'clearance_start_proportion_x_lt_60': ('x', lambda xs: (xs < 60).mean()),
    'clearance_start_y_between_20_60_proportion': ('y', lambda ys: ((ys >= 20) & (ys <= 60)).mean()),
    # Body-part flags: a missing flag is counted as False before averaging.
    'clearance_head_proportion': ('clearance_head', lambda flag: flag.fillna(False).mean()),
    'clearance_left_foot_proportion': ('clearance_left_foot', lambda flag: flag.fillna(False).mean()),
    'clearance_right_foot_proportion': ('clearance_right_foot', lambda flag: flag.fillna(False).mean()),
}

match_analysis_clearance = (
    df_final_matches_selected_clearance
    .groupby('match_id')
    .agg(**clearance_features)
    .reset_index()
)

match_analysis_clearance

**df_offside** - reviewed

In [None]:
# Fields requested for offside events.
selected_columns = [
    'id', 'match_id', 'location', 'period',
    'possession', 'possession_team', 'under_pressure',
    'home_team', 'home_team_flag', 'duration',
    'counterpress', 'x', 'y',
]

# Offside events are stored in new_dfs under the key 'df_offside'.
df_offside = new_dfs['df_offside']

# Keep only the requested fields that this dataframe actually contains,
# in the order they are listed in selected_columns.
existing_columns = [name for name in selected_columns if name in df_offside.columns]

# Restrict the offside events to those fields and display the result.
df_final_matches_selected_offside = df_offside.loc[:, existing_columns]
df_final_matches_selected_offside

In [None]:
# Per-match number of offside events (count of non-null event ids).
match_analysis_offside = (
    df_final_matches_selected_offside
    .groupby('match_id')
    .agg(offside_count=pd.NamedAgg(column='id', aggfunc='count'))
    .reset_index()
)

match_analysis_offside

**df_dribbled_past** - reviewed

In [None]:
# Fields requested for dribbled-past events.
selected_columns = [
    'id', 'match_id', 'location', 'period',
    'possession', 'possession_team', 'under_pressure',
    'home_team', 'home_team_flag', 'duration',
    'counterpress', 'x', 'y',
]

# Dribbled-past events are stored in new_dfs under the key 'df_dribbled_past'.
df_dribbled_past = new_dfs['df_dribbled_past']

# Keep only the requested fields that this dataframe actually contains,
# in the order they are listed in selected_columns.
existing_columns = [name for name in selected_columns if name in df_dribbled_past.columns]

# Restrict the dribbled-past events to those fields and display the result.
df_final_matches_selected_dribbled_past = df_dribbled_past.loc[:, existing_columns]
df_final_matches_selected_dribbled_past

In [None]:
# Per-match summary of dribbled-past events: output column -> (input column, aggregation).
dribbled_past_features = {
    'dribbled_past_count': ('id', 'count'),
    # Share of events with period == 1.
    'dribbled_past_proportion_period_1': ('period', lambda period: (period == 1).mean()),
    'dribbled_past_proportion_home_team': ('home_team_flag', 'mean'),
    # A missing counterpress flag is counted as False before averaging.
    'dribbled_past_counterpress_rate': ('counterpress', lambda flag: flag.fillna(False).mean()),
    # Location features of the dribbled-past events.
    'dribble_past_start_mean': ('x', 'mean'),
    'dribble_past_start_proportion_x_lt_60': ('x', lambda xs: (xs < 60).mean()),
    'dribble_past_start_y_between_20_60_proportion': ('y', lambda ys: ((ys >= 20) & (ys <= 60)).mean()),
}

match_analysis_dribbled_past = (
    df_final_matches_selected_dribbled_past
    .groupby('match_id')
    .agg(**dribbled_past_features)
    .reset_index()
)

match_analysis_dribbled_past

**df_player_off** - reviewed

In [None]:
# Fields requested for player-off events.
selected_columns = [
    'id', 'match_id', 'location', 'period',
    'possession', 'possession_team', 'under_pressure',
    'home_team', 'home_team_flag', 'duration',
    'counterpress', 'x', 'y',
]

# Player-off events are stored in new_dfs under the key 'df_player_off'.
df_player_off = new_dfs['df_player_off']

# Keep only the requested fields that this dataframe actually contains,
# in the order they are listed in selected_columns.
existing_columns = [name for name in selected_columns if name in df_player_off.columns]

# Restrict the player-off events to those fields and display the result.
df_final_matches_selected_off = df_player_off.loc[:, existing_columns]
df_final_matches_selected_off

In [None]:
# Per-match number of player-off events (count of non-null event ids).
match_analysis_off = (
    df_final_matches_selected_off
    .groupby('match_id')
    .agg(off_count=pd.NamedAgg(column='id', aggfunc='count'))
    .reset_index()
)

match_analysis_off

**df_substitution** - reviewed

In [None]:
# Fields requested for substitution events.
selected_columns = [
    'id', 'match_id', 'location', 'period',
    'possession', 'possession_team', 'under_pressure',
    'home_team', 'home_team_flag', 'duration',
    'counterpress', 'x', 'y',
]

# Substitution events are stored in new_dfs under the key 'df_substitution'.
df_substitution = new_dfs['df_substitution']

# Keep only the requested fields that this dataframe actually contains,
# in the order they are listed in selected_columns.
existing_columns = [name for name in selected_columns if name in df_substitution.columns]

# Restrict the substitution events to those fields and display the result.
df_final_matches_selected_substitution = df_substitution.loc[:, existing_columns]
df_final_matches_selected_substitution

In [None]:
# Per-match summary of substitution events: output column -> (input column, aggregation).
substitution_features = {
    'substitution_count': ('id', 'count'),
    # Share of events with period == 1.
    'substitution_proportion_period_1': ('period', lambda period: (period == 1).mean()),
    # Mean of home_team_flag.
    'substitution_proportion_home_team': ('home_team_flag', 'mean'),
}

match_analysis_substitution = (
    df_final_matches_selected_substitution
    .groupby('match_id')
    .agg(**substitution_features)
    .reset_index()
)

match_analysis_substitution

**df_bad_behaviour** - reviewed

In [None]:
# Fields requested for bad behaviour events, including the card field.
selected_columns = [
    'id', 'match_id', 'location', 'period',
    'possession', 'possession_team', 'under_pressure',
    'home_team', 'home_team_flag', 'duration',
    'counterpress', 'x', 'y',
    'bad_behaviour_card',
]

# Bad behaviour events are stored in new_dfs under the key 'df_bad_behaviour'.
df_bad_behaviour = new_dfs['df_bad_behaviour']

# Keep only the requested fields that this dataframe actually contains,
# in the order they are listed in selected_columns.
existing_columns = [name for name in selected_columns if name in df_bad_behaviour.columns]

# Restrict the bad behaviour events to those fields and display the result.
df_final_matches_selected_bad_behaviour = df_bad_behaviour.loc[:, existing_columns]
df_final_matches_selected_bad_behaviour

In [None]:
# Per-match summary of bad behaviour events: output column -> (input column, aggregation).
bad_behaviour_features = {
    'bad_behaviour_count': ('id', 'count'),
    # Share of events with period == 1.
    'bad_behaviour_proportion_period_1': ('period', lambda period: (period == 1).mean()),
    'bad_behaviour_proportion_home_team': ('home_team_flag', 'mean'),
    # Share of events whose card value is exactly 'Yellow Card'.
    'bad_behaviour_yellow_card_proportion': ('bad_behaviour_card', lambda card: (card == 'Yellow Card').mean()),
}

match_analysis_bad_behaviour = (
    df_final_matches_selected_bad_behaviour
    .groupby('match_id')
    .agg(**bad_behaviour_features)
    .reset_index()
)

match_analysis_bad_behaviour

**df_tactical_shift** - reviewed

In [None]:
# Fields requested for tactical shift events.
selected_columns = [
    'id', 'match_id', 'location', 'period',
    'possession', 'possession_team', 'under_pressure',
    'home_team', 'home_team_flag', 'duration',
    'counterpress', 'x', 'y',
]

# Tactical shift events are stored in new_dfs under the key 'df_tactical_shift'.
df_tactical_shift = new_dfs['df_tactical_shift']

# Keep only the requested fields that this dataframe actually contains,
# in the order they are listed in selected_columns.
existing_columns = [name for name in selected_columns if name in df_tactical_shift.columns]

# Restrict the tactical shift events to those fields and display the result.
df_final_matches_selected_tactical_shift = df_tactical_shift.loc[:, existing_columns]
df_final_matches_selected_tactical_shift

In [None]:
# Summarise tactical shift events per match (one output row per match_id).
# The source must be the tactical shift selection: this previously grouped
# df_final_matches_selected_bad_behaviour, so every tactical_shift_* feature
# was actually computed from bad behaviour events.
match_analysis_tactical_shift = df_final_matches_selected_tactical_shift.groupby('match_id').agg(
    tactical_shift_count=('id', 'count'),  # number of non-null event ids
    tactical_shift_proportion_period_1=('period', lambda x: (x == 1).mean()),  # share of events with period == 1
    tactical_shift_proportion_home_team=('home_team_flag', 'mean')
).reset_index()

match_analysis_tactical_shift

In [None]:

# Per-match tables to combine, in join order; the first entry is the base table.
# NOTE(review): match_analysis is both the first input and the name of the
# merged result, so re-running this cell starts from the already-merged table.
analysis_dfs = [
    match_analysis, df_wwc_subset,
    match_analysis_pass, match_analysis_carry, match_analysis_pressure,
    match_analysis_miscontrol, match_analysis_dribble, match_analysis_duel,
    match_analysis_interception, match_analysis_ball_recovery,
    match_analysis_dispossessed, match_analysis_foul_committed,
    match_analysis_foul_won, match_analysis_injury_stoppage,
    match_analysis_drop_ball, match_analysis_clearance,
    match_analysis_offside, match_analysis_dribbled_past,
    match_analysis_off, match_analysis_substitution,
    match_analysis_bad_behaviour, match_analysis_tactical_shift,
    match_analysis_shot,
]

# Separate the base table from the tables joined onto it.
base_table, *tables_to_join = analysis_dfs
match_analysis = base_table

# Outer-join each remaining table on match_id, so a match that is absent from
# one table is still kept in the result.
for df in tables_to_join:
    match_analysis = pd.merge(left=match_analysis, right=df, how='outer', on='match_id')

# Print the first rows of the merged table.
print(match_analysis.head())