# Empirical Analysis
In this Python notebook we are going to:
1. Get the data from the StatsBomb Open Data API and save it with a timestamp
2. Run an empirical analysis on the data
3. Transform the data if needed

#### 1. Get the data from the StatsBomb Open Data API and save it

In [1]:
# Import all the libraries used to get the data
from statsbombpy import sb
import pandas as pd
import os
from tqdm import tqdm
from datetime import datetime
# We don't have an API key, so we deliberately ignore the warning about
# missing authentication. The filter is restricted to the NoAuthWarning
# category, so every other warning is still shown.
import warnings
warnings.filterwarnings('ignore', category=sb.api_client.NoAuthWarning)

In [2]:
# Since we want as much data as possible, first collect every available
# (competition_id, season_id) pair; the matches are looked up per pair below.
all_competition_season_ids = sb.competitions()[['competition_id', 'season_id']].apply(lambda row: (row['competition_id'], row['season_id']), axis='columns')

# Bookkeeping table mapping human-readable names to ids. One row is appended
# per (competition, season) on every run, together with the download time.
mapping_columns = ['competition_name', 'season_name', 'competition_id', 'season_id', 'time_downloaded']
if os.path.exists('name_to_id_mapping.csv'):
    # Files written by the earlier version of this cell also contain the saved
    # index as an extra 'Unnamed: 0' column. Keeping only the known columns
    # guarantees that the five-value row assignment below still lines up.
    name_to_competition_id_mapping = pd.read_csv('name_to_id_mapping.csv').reindex(columns=mapping_columns)
else:
    name_to_competition_id_mapping = pd.DataFrame({column: [] for column in mapping_columns})

for competition_id, season_id in tqdm(all_competition_season_ids):
    # Fetch the match meta-data once and reuse it for both the match ids and
    # the competition/season names (avoids a second identical request).
    matches = sb.matches(competition_id=competition_id, season_id=season_id)
    if matches.empty:
        # Nothing to download and no names to record for this pair.
        # NOTE(review): assumes sb.matches returns an empty DataFrame when a
        # season has no matches - confirm against statsbombpy.
        continue

    # One directory per competition/season makes it easier for humans to look
    # at the data later. makedirs also creates the 'statsbomb_data' parent on
    # a fresh checkout and is a no-op when the directory already exists.
    directory_name = f'statsbomb_data/{competition_id}_{season_id}'
    os.makedirs(directory_name, exist_ok=True)

    # Download the shot events for each match
    for match_id in tqdm(matches['match_id'].unique()):
        shots_path = f'{directory_name}/{match_id}.pkl'
        # If the file already exists, we skip it so re-runs only fetch what is missing
        if os.path.exists(shots_path):
            continue
        sb.events(match_id=match_id, split=True)['shots'].to_pickle(shots_path)

    # .iloc[0] takes the first row by position, independent of the index labels
    name_to_competition_id_mapping.loc[len(name_to_competition_id_mapping)] = [
        matches['competition'].iloc[0],
        matches['season'].iloc[0],
        competition_id,
        season_id,
        str(datetime.now()),
    ]
    # Persist after every competition so an interrupted run keeps its progress.
    # index=False keeps the file's columns identical to the in-memory table,
    # which is what allows this cell to be re-run against its own output.
    name_to_competition_id_mapping.to_csv('name_to_id_mapping.csv', index=False)

100%|██████████| 306/306 [00:00<00:00, 265726.09it/s]
100%|██████████| 52/52 [00:37<00:00,  1.40it/s]
100%|██████████| 1/1 [00:00<00:00,  1.67it/s]]
100%|██████████| 1/1 [00:00<00:00,  1.38it/s]]
100%|██████████| 1/1 [00:00<00:00,  1.41it/s]]
100%|██████████| 1/1 [00:00<00:00,  1.20it/s]]
100%|██████████| 1/1 [00:00<00:00,  1.34it/s]]
100%|██████████| 1/1 [00:00<00:00,  1.11it/s]]
100%|██████████| 1/1 [00:00<00:00,  1.71it/s]]
100%|██████████| 1/1 [00:00<00:00,  1.30it/s]]
100%|██████████| 1/1 [00:00<00:00,  1.26it/s]t]
100%|██████████| 1/1 [00:00<00:00,  1.41it/s]t]
100%|██████████| 1/1 [00:00<00:00,  1.41it/s]t]
100%|██████████| 1/1 [00:00<00:00,  1.51it/s]t]
100%|██████████| 1/1 [00:00<00:00,  1.19it/s]t]
100%|██████████| 1/1 [00:00<00:00,  1.61it/s]t]
100%|██████████| 1/1 [00:00<00:00,  1.50it/s]t]
100%|██████████| 1/1 [00:00<00:00,  1.44it/s]t]
100%|██████████| 1/1 [00:00<00:00,  1.43it/s]t]
100%|██████████| 1/1 [00:00<00:00,  1.44it/s]t]
100%|██████████| 1/1 [00:00<00:00,  1.51it