In [1]:
# Cell 1: Import all required libraries
import polars as pl
import pandas as pd
import numpy as np
from kloppy import impect
import requests
import io
from kloppy.utils import github_resolve_raw_data_url
from tqdm.notebook import tqdm  # Progress bars
import pickle
from pathlib import Path

print(" All imports successful!")

 All imports successful!


In [2]:
# Cell 2: Setup project paths
from pathlib import Path

# Create directory structure
DATA_DIR = Path("../data")
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"

# Create directories if they don't exist
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print(f" Data directories created:")
print(f"  Raw data: {RAW_DIR}")
print(f"  Processed data: {PROCESSED_DIR}")

 Data directories created:
  Raw data: ../data/raw
  Processed data: ../data/processed


In [3]:
# Cell 3: Load match and squad metadata
print("Loading match metadata...")

match_url = github_resolve_raw_data_url(
    repository="ImpectAPI/open-data",
    branch="main",
    file="data/matches/matches_743.json"
)
squads_url = github_resolve_raw_data_url(
    repository="ImpectAPI/open-data",
    branch="main",
    file="data/squads/squads_743.json"
)

# Load matches
response = requests.get(match_url)
matches = (
    pl.read_json(io.StringIO(response.text))
    .unnest("matchDay")
    .rename({'iterationId': 'competitionId', 'id': 'matchId'})
    .drop(['idMappings', 'lastCalculationDate', 'name', 'available'])
    .with_columns([
        (pl.col("index") + 1).alias("matchDay")
    ])
    .drop("index")
)

# Load squads
response = requests.get(squads_url)
squads = (
    pl.read_json(io.StringIO(response.text))
    .drop(['type', 'gender', 'imageUrl', 'idMappings', 'access', 'countryId'])
)

# Join to get team names
matches = (
    matches
    .join(
        squads.rename({"name": "homeTeam"}),
        left_on="homeSquadId",
        right_on="id",
        how="left"
    )
    .join(
        squads.rename({"name": "awayTeam"}),
        left_on="awaySquadId",
        right_on="id",
        how="left"
    )
    .select(['competitionId', 'matchId', 'homeSquadId', 'awaySquadId', 
             'homeTeam', 'awayTeam', 'matchDay', 'scheduledDate'])
)

print(f" Loaded {len(matches)} matches")
print(f"\nFirst 5 matches:")
print(matches.head())

Loading match metadata...
 Loaded 306 matches

First 5 matches:
shape: (5, 8)
┌────────────┬─────────┬────────────┬────────────┬────────────┬────────────┬──────────┬────────────┐
│ competitio ┆ matchId ┆ homeSquadI ┆ awaySquadI ┆ homeTeam   ┆ awayTeam   ┆ matchDay ┆ scheduledD │
│ nId        ┆ ---     ┆ d          ┆ d          ┆ ---        ┆ ---        ┆ ---      ┆ ate        │
│ ---        ┆ i64     ┆ ---        ┆ ---        ┆ str        ┆ str        ┆ i64      ┆ ---        │
│ i64        ┆         ┆ i64        ┆ i64        ┆            ┆            ┆          ┆ str        │
╞════════════╪═════════╪════════════╪════════════╪════════════╪════════════╪══════════╪════════════╡
│ 743        ┆ 122838  ┆ 38         ┆ 33         ┆ SV Werder  ┆ FC Bayern  ┆ 1        ┆ 2023-08-18 │
│            ┆         ┆            ┆            ┆ Bremen     ┆ München    ┆          ┆ T18:30:00Z │
│ 743        ┆ 122839  ┆ 41         ┆ 37         ┆ Bayer 04   ┆ RasenBalls ┆ 1        ┆ 2023-08-19 │
│            

In [4]:
# Cell 4: Load ONE match to inspect structure
print("Loading first match to inspect structure...")

match_id = matches['matchId'][0]
home_team = matches.filter(pl.col('matchId') == match_id)['homeTeam'][0]
away_team = matches.filter(pl.col('matchId') == match_id)['awayTeam'][0]

print(f"\nMatch: {home_team} vs {away_team}")
print(f"Match ID: {match_id}")

# Load match data
dataset = impect.load_open_data(match_id=match_id, competition_id=743)

# Transform to standard coordinate system
df_sample = (
    dataset
    .transform(to_coordinate_system="secondspectrum")
    .to_df(engine="polars")
)

print(f"\n Total events: {len(df_sample)}")
print(f"\nEvent type distribution:")
print(df_sample.group_by('event_type').agg(pl.count().alias('count')).sort('count', descending=True))

print(f"\nAvailable columns ({len(df_sample.columns)}):")
for i, col in enumerate(df_sample.columns, 1):
    print(f"  {i}. {col}")

print(f"\nSample of first 3 events:")
print(df_sample.select([
    'event_id', 'event_type', 'team_id', 'player_id', 
    'coordinates_x', 'coordinates_y', 'result', 'success', 'period_id', 'timestamp'
]).head(3))

Loading first match to inspect structure...

Match: SV Werder Bremen vs FC Bayern München
Match ID: 122838



You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf




 Total events: 3057

Event type distribution:
shape: (17, 2)
┌───────────────────────┬───────┐
│ event_type            ┆ count │
│ ---                   ┆ ---   │
│ str                   ┆ u32   │
╞═══════════════════════╪═══════╡
│ PASS                  ┆ 1000  │
│ GENERIC:RECEPTION     ┆ 831   │
│ CARRY                 ┆ 735   │
│ RECOVERY              ┆ 111   │
│ DUEL                  ┆ 106   │
│ …                     ┆ …     │
│ GENERIC:NO_VIDEO      ┆ 10    │
│ SUBSTITUTION          ┆ 10    │
│ GENERIC:GOAL          ┆ 4     │
│ GENERIC:OFFSIDE       ┆ 4     │
│ GENERIC:FINAL_WHISTLE ┆ 2     │
└───────────────────────┴───────┘

Available columns (22):
  1. event_id
  2. event_type
  3. period_id
  4. timestamp
  5. end_timestamp
  6. ball_state
  7. ball_owning_team
  8. team_id
  9. player_id
  10. coordinates_x
  11. coordinates_y
  12. end_coordinates_x
  13. end_coordinates_y
  14. receiver_player_id
  15. body_part_type
  16. set_piece_type
  17. result
  18. success
  19. du

(Deprecated in version 0.20.5)
  print(df_sample.group_by('event_type').agg(pl.count().alias('count')).sort('count', descending=True))


In [5]:
# Cell 5: Define pitch zone classification
def classify_zone(x_coordinate):
    """
    Classify pitch location into thirds based on x-coordinate.
    
    Coordinate system (secondspectrum): 
    - x ranges from -52.5 (own goal) to +52.5 (opponent goal)
    - Defensive third: x < -17.5
    - Middle third: -17.5 <= x <= 17.5  
    - Attacking third: x > 17.5
    
    Args:
        x_coordinate: float, x position on pitch
        
    Returns:
        str: 'defensive_third', 'middle_third', or 'attacking_third'
    """
    if x_coordinate < -17.5:
        return "defensive_third"
    elif x_coordinate > 17.5:
        return "attacking_third"
    else:
        return "middle_third"

# Test the function
test_positions = [-40, -10, 0, 15, 35]
print("Testing zone classification:")
for x in test_positions:
    zone = classify_zone(x)
    print(f"  x={x:>5.1f} → {zone}")
    
print("\nZone classification function ready")

Testing zone classification:
  x=-40.0 → defensive_third
  x=-10.0 → middle_third
  x=  0.0 → middle_third
  x= 15.0 → middle_third
  x= 35.0 → attacking_third

Zone classification function ready


In [6]:
# Cell 6: Function to load and process a single match
def load_and_process_match(match_id, competition_id=743):
    """
    Load a single match and add zone classification.
    
    Args:
        match_id: int, match identifier
        competition_id: int, competition identifier (default 743 for Bundesliga)
        
    Returns:
        polars.DataFrame: Processed match events with zones
    """
    # Load match data
    dataset = impect.load_open_data(match_id=match_id, competition_id=competition_id)
    
    # Transform to secondspectrum coordinates and convert to polars
    df = (
        dataset
        .transform(to_coordinate_system="secondspectrum")
        .to_df(engine="polars")
    )
    
    # Add match_id column
    df = df.with_columns([
        pl.lit(match_id).alias('match_id')
    ])
    
    # Add zone classification based on x coordinate
    df = df.with_columns([
        pl.col('coordinates_x').map_elements(
            classify_zone, 
            return_dtype=pl.Utf8
        ).alias('zone')
    ])
    
    # Add zone for end coordinates (for passes, carries)
    df = df.with_columns([
        pl.col('end_coordinates_x').map_elements(
            lambda x: classify_zone(x) if x is not None else None,
            return_dtype=pl.Utf8
        ).alias('end_zone')
    ])
    
    return df

# Test on first match
print("Testing on first match...")
test_df = load_and_process_match(match_id=matches['matchId'][0])

print(f" Processed {len(test_df)} events")
print(f"\nZone distribution:")
print(test_df.group_by('zone').agg(pl.len().alias('count')).sort('zone'))

print(f"\nSample with zones:")
print(test_df.select(['event_type', 'coordinates_x', 'zone', 'end_coordinates_x', 'end_zone']).head(5))

Testing on first match...
 Processed 3057 events

Zone distribution:
shape: (4, 2)
┌─────────────────┬───────┐
│ zone            ┆ count │
│ ---             ┆ ---   │
│ str             ┆ u32   │
╞═════════════════╪═══════╡
│ null            ┆ 57    │
│ attacking_third ┆ 731   │
│ defensive_third ┆ 852   │
│ middle_third    ┆ 1417  │
└─────────────────┴───────┘

Sample with zones:
shape: (5, 5)
┌───────────────────┬───────────────┬─────────────────┬───────────────────┬─────────────────┐
│ event_type        ┆ coordinates_x ┆ zone            ┆ end_coordinates_x ┆ end_zone        │
│ ---               ┆ ---           ┆ ---             ┆ ---               ┆ ---             │
│ str               ┆ f64           ┆ str             ┆ f64               ┆ str             │
╞═══════════════════╪═══════════════╪═════════════════╪═══════════════════╪═════════════════╡
│ PASS              ┆ 0.0           ┆ middle_third    ┆ null              ┆ null            │
│ GENERIC:NO_VIDEO  ┆ null          ┆ n

In [7]:
# Cell 7: Load all matches with caching (FIXED VERSION)
import time

# Check if we already have cached data
cache_file = PROCESSED_DIR / "all_matches_with_zones.parquet"

if cache_file.exists():
    print(f" Found cached data at {cache_file}")
    print("Loading from cache...")
    all_events = pl.read_parquet(cache_file)
    print(f" Loaded {len(all_events):,} events from cache")
else:
    print(f"No cache found. Loading all {len(matches)} matches...")
    print()
    
    all_events_list = []
    failed_matches = []
    
    start_time = time.time()
    
    # Loop through all matches with progress bar
    for idx, match_id in enumerate(tqdm(matches['matchId'], desc="Loading matches"), 1):
        try:
            df = load_and_process_match(match_id)
            all_events_list.append(df)
            
            # Print progress every 50 matches
            if idx % 50 == 0:
                elapsed = time.time() - start_time
                avg_time = elapsed / idx
                remaining = (len(matches) - idx) * avg_time
                print(f"\n  Processed {idx}/{len(matches)} matches | "
                      f"Elapsed: {elapsed/60:.1f}min | "
                      f"Est. remaining: {remaining/60:.1f}min")
                
        except Exception as e:
            print(f"\n   Failed to load match {match_id}: {e}")
            failed_matches.append(match_id)
            continue
    
    # Combine all matches - USE DIAGONAL TO HANDLE MISMATCHED SCHEMAS
    print("\n\nCombining all matches...")
    all_events = pl.concat(all_events_list, how="diagonal")
    
    # Save cache
    print(f"Saving cache to {cache_file}...")
    all_events.write_parquet(cache_file)
    
    elapsed_total = time.time() - start_time
    print(f"\n Loaded {len(all_events):,} events from {len(all_events_list)} matches")
    print(f" Total time: {elapsed_total/60:.1f} minutes")
    
    if failed_matches:
        print(f"\n {len(failed_matches)} matches failed to load: {failed_matches}")

# Summary statistics
print("DATASET SUMMARY")
print(f"Total events: {len(all_events):,}")
print(f"Total matches: {all_events['match_id'].n_unique()}")
print(f"Total players: {all_events['player_id'].n_unique()}")
print(f"\nEvent type distribution:")
print(all_events.group_by('event_type').agg(pl.len().alias('count')).sort('count', descending=True).head(10))
print(f"\nZone distribution:")
print(all_events.group_by('zone').agg(pl.len().alias('count')).sort('zone'))

No cache found. Loading all 306 matches...



Loading matches:   0%|          | 0/306 [00:00<?, ?it/s]


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/mai


  Processed 50/306 matches | Elapsed: 0.8min | Est. remaining: 4.1min



You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/mai


  Processed 100/306 matches | Elapsed: 1.6min | Est. remaining: 3.3min



You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/mai


  Processed 150/306 matches | Elapsed: 2.3min | Est. remaining: 2.4min



You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/mai


  Processed 200/306 matches | Elapsed: 3.1min | Est. remaining: 1.6min



You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/mai


  Processed 250/306 matches | Elapsed: 3.9min | Est. remaining: 0.9min



You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/mai


  Processed 300/306 matches | Elapsed: 4.6min | Est. remaining: 0.1min



You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf


You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/mai



Combining all matches...
Saving cache to ../data/processed/all_matches_with_zones.parquet...

 Loaded 962,990 events from 306 matches
 Total time: 4.6 minutes
DATASET SUMMARY
Total events: 962,990
Total matches: 306
Total players: 494

Event type distribution:
shape: (10, 2)
┌───────────────────┬────────┐
│ event_type        ┆ count  │
│ ---               ┆ ---    │
│ str               ┆ u32    │
╞═══════════════════╪════════╡
│ PASS              ┆ 322983 │
│ GENERIC:RECEPTION ┆ 254752 │
│ CARRY             ┆ 208997 │
│ RECOVERY          ┆ 46118  │
│ DUEL              ┆ 31562  │
│ BALL_OUT          ┆ 19574  │
│ INTERCEPTION      ┆ 19148  │
│ CLEARANCE         ┆ 14404  │
│ GENERIC:BLOCK     ┆ 11956  │
│ SHOT              ┆ 8094   │
└───────────────────┴────────┘

Zone distribution:
shape: (4, 2)
┌─────────────────┬────────┐
│ zone            ┆ count  │
│ ---             ┆ ---    │
│ str             ┆ u32    │
╞═════════════════╪════════╡
│ null            ┆ 19281  │
│ attacking_third 

In [9]:
# Cell 8: Data quality checks
print("DATA QUALITY CHECKS")
print("="*60)

# Check for null values in key columns
print("\n1. Null values in key columns:")
key_cols = ['event_type', 'team_id', 'player_id', 'coordinates_x', 'coordinates_y', 'zone']
for col in key_cols:
    null_count = all_events[col].null_count()
    null_pct = (null_count / len(all_events)) * 100
    print(f"  {col:20s}: {null_count:>8,} nulls ({null_pct:>5.2f}%)")

# Check coordinate ranges
print("\n2. Coordinate ranges:")
print(f"  X coordinates: [{all_events['coordinates_x'].min():.2f}, {all_events['coordinates_x'].max():.2f}]")
print(f"  Y coordinates: [{all_events['coordinates_y'].min():.2f}, {all_events['coordinates_y'].max():.2f}]")
print(f"  Expected X: [-52.5, 52.5]")
print(f"  Expected Y: [-34.0, 34.0]")

# Check events per match
print("\n3. Events per match:")
events_per_match = all_events.group_by('match_id').agg(pl.len().alias('event_count'))
print(f"  Min events per match: {events_per_match['event_count'].min()}")
print(f"  Max events per match: {events_per_match['event_count'].max()}")
print(f"  Mean events per match: {events_per_match['event_count'].mean():.0f}")

# Sample a few events to inspect
print("\n4. Sample of 5 random events:")
print(all_events.sample(5).select(['event_type', 'team_id', 'player_id', 'zone', 'result', 'success']))

print("\n✅ Data quality checks complete!")

DATA QUALITY CHECKS

1. Null values in key columns:
  event_type          :        0 nulls ( 0.00%)
  team_id             :    7,655 nulls ( 0.79%)
  player_id           :    7,655 nulls ( 0.79%)
  coordinates_x       :   19,281 nulls ( 2.00%)
  coordinates_y       :   19,281 nulls ( 2.00%)
  zone                :   19,281 nulls ( 2.00%)

2. Coordinate ranges:
  X coordinates: [-52.50, 52.50]
  Y coordinates: [-34.00, 34.00]
  Expected X: [-52.5, 52.5]
  Expected Y: [-34.0, 34.0]

3. Events per match:
  Min events per match: 2314
  Max events per match: 4131
  Mean events per match: 3147

4. Sample of 5 random events:
shape: (5, 6)
┌────────────┬─────────┬───────────┬─────────────────┬──────────┬─────────┐
│ event_type ┆ team_id ┆ player_id ┆ zone            ┆ result   ┆ success │
│ ---        ┆ ---     ┆ ---       ┆ ---             ┆ ---      ┆ ---     │
│ str        ┆ str     ┆ str       ┆ str             ┆ str      ┆ bool    │
╞════════════╪═════════╪═══════════╪═════════════════╪══

In [10]:
# Cell 9: Save useful metadata
print("Saving metadata for later analysis...")

# Save matches dataframe
matches.write_parquet(PROCESSED_DIR / "matches_metadata.parquet")
print(f"✅ Saved matches metadata: {len(matches)} matches")

# Save squads dataframe  
squads.write_parquet(PROCESSED_DIR / "squads_metadata.parquet")
print(f"✅ Saved squads metadata: {len(squads)} teams")

# Create player metadata (we'll need this for normalization later)
player_metadata = all_events.group_by(['player_id', 'team_id']).agg([
    pl.len().alias('total_events'),
    pl.col('match_id').n_unique().alias('matches_played')
]).sort('total_events', descending=True)

player_metadata.write_parquet(PROCESSED_DIR / "player_metadata.parquet")
print(f"✅ Saved player metadata: {len(player_metadata)} players")

print("\n" + "="*60)
print("STEP 1 COMPLETE! 🎉")
print("="*60)
print("\nFiles created:")
print(f"  1. {PROCESSED_DIR / 'all_matches_with_zones.parquet'}")
print(f"  2. {PROCESSED_DIR / 'matches_metadata.parquet'}")
print(f"  3. {PROCESSED_DIR / 'squads_metadata.parquet'}")
print(f"  4. {PROCESSED_DIR / 'player_metadata.parquet'}")

Saving metadata for later analysis...
✅ Saved matches metadata: 306 matches
✅ Saved squads metadata: 18 teams
✅ Saved player metadata: 507 players

STEP 1 COMPLETE! 🎉

Files created:
  1. ..\data\processed\all_matches_with_zones.parquet
  2. ..\data\processed\matches_metadata.parquet
  3. ..\data\processed\squads_metadata.parquet
  4. ..\data\processed\player_metadata.parquet

Next: We'll create the event weighting system in Step 2!
