# Empirical Analysis
In this python notebook we are going to:
1. Get the Data from the Statsbomb Open Data API and save it with timestamp
2. Run an empirical analysis on the data
3. Transform the data if needed

#### 1. Get the data from the Statsbomb Open Data API and saving it

In [5]:
# Import all the libraries used to get the data
from statsbombpy import sb
import pandas as pd
import os
from tqdm import tqdm
from datetime import datetime
# We don't have an API-key, so we will ignore that exact warning
import warnings
warnings.filterwarnings('ignore', category=sb.api_client.NoAuthWarning)

In [6]:
# Since we want as much data as possible, we will first query for all available match_ids
all_competition_season_ids = sb.competitions()[['competition_id', 'season_id']].apply(lambda row: (row['competition_id'], row['season_id']), axis='columns')
# For every competition and each season download the match meta-data
name_to_competition_id_mapping = pd.DataFrame({ 'competition_name': [],
                                                'season_name': [],
                                                'competition_id': [],
                                                'season_id': [],
                                                'time_downloaded': []})
# We only need to look at the subset of the columns for each match
for competition_id, season_id in tqdm(all_competition_season_ids):
    matches = sb.matches(competition_id=competition_id, season_id=season_id)
    directory_name = f'statsbomb_data/{competition_id}_{season_id}'
    # This makes it easier for humans to look at the data later
    if not os.path.exists(directory_name):
        os.mkdir(directory_name)
    # Download the events for each match
    for match_id in tqdm(sb.matches(competition_id=competition_id, season_id=season_id)['match_id'].unique()):
        # If the file already exists, we skip it
        if os.path.exists(f'{directory_name}/{match_id}.pkl'):
            continue
        sb.events(match_id=match_id, split=True)['shots'].to_pickle(f'{directory_name}/{match_id}.pkl')
    pd.concat([name_to_competition_id_mapping, { 'competition_name': matches['competition'][0],
                                                'season_name': matches['season'][0],
                                                'competition_id': competition_id,
                                                'season_id': season_id,
                                                'time_downloaded': str(datetime.now())}])
    name_to_competition_id_mapping.to_csv('name_to_id_mapping.csv')

100%|██████████| 306/306 [00:00<00:00, 272669.86it/s]
  0%|          | 0/71 [00:00<?, ?it/s]


TypeError: concat() takes 1 positional argument but 2 were given