In [None]:
# Cell 1: Imports and load cached data
import polars as pl
import numpy as np
from pathlib import Path

# Paths
DATA_DIR = Path("../data")
PROCESSED_DIR = DATA_DIR / "processed"

# Load cached data
print("Loading cached data...")
all_events = pl.read_parquet(PROCESSED_DIR / "all_matches_with_zones.parquet")
matches = pl.read_parquet(PROCESSED_DIR / "matches_metadata.parquet")
player_metadata = pl.read_parquet(PROCESSED_DIR / "player_metadata.parquet")

print(f"Loaded {len(all_events):,} events")
print(f"Loaded {len(matches)} matches")
print(f"Loaded {len(player_metadata)} players")

Loading cached data...
✅ Loaded 962,990 events
✅ Loaded 306 matches
✅ Loaded 507 players


In [None]:
# Cell 2: Understand what data we have for each event type

# Check what attributes exist for different events
event_types_to_check = ['PASS', 'SHOT', 'DUEL', 'CARRY', 'INTERCEPTION', 'CLEARANCE']

for event_type in event_types_to_check:
    print(f"\n{event_type}:")
    sample = all_events.filter(pl.col('event_type') == event_type).head(100)
    
    # Check what result values exist
    if 'result' in sample.columns:
        results = sample['result'].value_counts().sort('count', descending=True)
        print(f"  Results: {results['result'].to_list()}")
    
    # Check pass types
    if event_type == 'PASS' and 'pass_type' in sample.columns:
        pass_types = sample['pass_type'].drop_nulls().value_counts().sort('count', descending=True)
        print(f"  Pass types: {pass_types['pass_type'].to_list()[:5]}")
    
    # Check duel types
    if event_type == 'DUEL' and 'duel_type' in sample.columns:
        duel_types = sample['duel_type'].drop_nulls().value_counts().sort('count', descending=True)
        print(f"  Duel types: {duel_types['duel_type'].to_list()}")
    
    # Check success rate
    if 'success' in sample.columns:
        success_rate = sample['success'].drop_nulls().mean()
        print(f"  Success rate: {success_rate:.1%}")

EXPLORING EVENT ATTRIBUTES

PASS:
  Results: ['COMPLETE', 'INCOMPLETE']
  Pass types: ['HEAD_PASS', 'CHIPPED_PASS', 'HAND_PASS', 'SHOT_ASSIST']
  Success rate: 78.0%

SHOT:
  Results: ['OFF_TARGET', 'SAVED', 'GOAL', 'OWN_GOAL']
  Success rate: 12.0%

DUEL:
  Results: ['WON', 'LOST']
  Duel types: ['GROUND', 'AERIAL']
  Success rate: 50.0%

CARRY:
  Results: ['COMPLETE', 'INCOMPLETE']
  Success rate: 88.0%

INTERCEPTION:
  Results: ['SUCCESS']
  Success rate: 100.0%

CLEARANCE:
  Results: [None]


TypeError: unsupported format string passed to NoneType.__format__

In [None]:
# Cell 3: Event weighting system
print("DEFINING EVENT WEIGHTING SYSTEM")

ATTACK_WEIGHTS = {
    'SHOT': {
        'GOAL': 10.0,                    # Scoring is most valuable
        'SAVED': 2.5,                    # On target, forced save
        'OFF_TARGET': 0.5,               # At least attempted
        'BLOCKED': 1.0,                  # Tested defense
    },
    'PASS': {
        'SHOT_ASSIST': 8.0,              # Direct assist (pass leading to shot)
        'INTO_PENALTY_AREA': 2.0,        # Dangerous pass
        'PROGRESSIVE_COMPLETE': 1.5,     # Forward pass >10m
        'COMPLETE_ATTACKING': 1.0,       # Successful pass in attacking third
        'COMPLETE_MIDDLE': 0.5,          # Successful pass in middle third
        'COMPLETE_DEFENSIVE': 0.2,       # Successful pass in defensive third
        'INCOMPLETE': -0.3,              # Failed pass penalty (Option B)
    },
    'CARRY': {
        'PROGRESSIVE_COMPLETE': 1.2,     # Dribble forward >10m
        'COMPLETE_ATTACKING': 0.8,       # Carry in attacking third
        'COMPLETE_MIDDLE': 0.4,          # Carry in middle third
        'INCOMPLETE': -0.2,              # Lost ball while carrying
    },
    'DUEL': {
        'WON_ATTACKING': 1.5,            # Win duel in attacking third (pressing)
        'WON_MIDDLE': 0.8,               # Win duel in middle
        'LOST': -0.2,                    # Lost duel penalty
    },
}


DEFENSE_WEIGHTS = {
    'DUEL': {
        'WON_DEFENSIVE': 3.0,            # Critical - won ball in own third
        'WON_MIDDLE': 2.0,               # Important defensive action
        'WON_ATTACKING': 1.0,            # Pressing high up
        'LOST_DEFENSIVE': -1.0,          # Dangerous loss in own third
        'LOST_MIDDLE': -0.5,             # Loss in middle
        'LOST_ATTACKING': -0.2,          # Failed press
    },
    'INTERCEPTION': {
        'DEFENSIVE': 2.5,                # Critical interception
        'MIDDLE': 2.0,                   # Good defensive read
        'ATTACKING': 1.5,                # High press interception
    },
    'CLEARANCE': {
        'DEFENSIVE': 2.0,                # Clearing danger
        'MIDDLE': 1.0,                   # Clearing from midfield
    },
    'RECOVERY': {
        'DEFENSIVE': 1.5,                # Recovering loose ball in own third
        'MIDDLE': 1.0,                   # Recovery in middle
        'ATTACKING': 0.8,                # Recovery high up
    },
}

PASSING_WEIGHTS = {
    'PASS': {
        'COMPLETE_LONG': 1.5,            # Long pass >30m completed
        'COMPLETE_PROGRESSIVE': 1.2,     # Progressive pass >10m
        'COMPLETE': 0.5,                 # Any successful pass
        'INCOMPLETE_LONG': -0.5,         # Failed long pass
        'INCOMPLETE': -0.3,              # Failed pass
        'SHOT_ASSIST': 3.0,              # Key pass (in passing context too)
    },
}

POSITION_WEIGHTS = {
    'forward': {'attack': 0.60, 'passing': 0.30, 'defense': 0.10},
    'midfielder': {'attack': 0.35, 'passing': 0.35, 'defense': 0.30},
    'defender': {'attack': 0.15, 'passing': 0.25, 'defense': 0.60},
    'goalkeeper': {'attack': 0.05, 'passing': 0.25, 'defense': 0.70},
}

print("\nAttack rating components:", len(ATTACK_WEIGHTS))
print("Defense rating components:", len(DEFENSE_WEIGHTS))
print("Passing rating components:", len(PASSING_WEIGHTS))

DEFINING EVENT WEIGHTING SYSTEM

✅ Weighting system defined

Attack rating components: 4
Defense rating components: 4
Passing rating components: 1


In [None]:
# Cell 4: Helper functions
def is_progressive_pass(row):
    """
    Check if a pass is progressive (forward >10m).
    """
    if row['end_coordinates_x'] is None or row['coordinates_x'] is None:
        return False
    
    forward_distance = row['end_coordinates_x'] - row['coordinates_x']
    return forward_distance >= 10.0

def is_progressive_carry(row):
    """
    Check if a carry is progressive (forward >10m).
    """
    if row['end_coordinates_x'] is None or row['coordinates_x'] is None:
        return False
    
    forward_distance = row['end_coordinates_x'] - row['coordinates_x']
    return forward_distance >= 10.0

def is_long_pass(row):
    """
    Check if a pass is long (>30m).
    """
    if row['end_coordinates_x'] is None or row['coordinates_x'] is None:
        return False
    if row['end_coordinates_y'] is None or row['coordinates_y'] is None:
        return False
    
    distance = np.sqrt(
        (row['end_coordinates_x'] - row['coordinates_x'])**2 + 
        (row['end_coordinates_y'] - row['coordinates_y'])**2
    )
    return distance >= 30.0

def is_into_penalty_area(row):
    """
    Check if pass ends in penalty area.
    Penalty area: x > 35.5 and -9.15 < y < 9.15 (approximately)
    """
    if row['end_coordinates_x'] is None or row['end_coordinates_y'] is None:
        return False
    
    # Adjust for secondspectrum coordinates
    in_penalty_x = row['end_coordinates_x'] > 35.5
    in_penalty_y = -9.15 < row['end_coordinates_y'] < 9.15
    
    return in_penalty_x and in_penalty_y

print("  - is_progressive_pass (>10m forward)")
print("  - is_progressive_carry (>10m forward)")
print("  - is_long_pass (>30m)")
print("  - is_into_penalty_area (ends in box)")

✅ Helper functions defined
  - is_progressive_pass (>10m forward)
  - is_progressive_carry (>10m forward)
  - is_long_pass (>30m)
  - is_into_penalty_area (ends in box)


In [None]:
# Cell 5: Calculate attack points
def calculate_attack_points(row):
    """
    Calculate attack rating points for a single event.
    """
    event_type = row['event_type']
    result = row['result']
    zone = row['zone']
    
    points = 0.0
    
    # SHOTS
    if event_type == 'SHOT':
        if result == 'GOAL':
            points = ATTACK_WEIGHTS['SHOT']['GOAL']
        elif result == 'SAVED':
            points = ATTACK_WEIGHTS['SHOT']['SAVED']
        elif result == 'OFF_TARGET':
            points = ATTACK_WEIGHTS['SHOT']['OFF_TARGET']
        elif result == 'BLOCKED':
            points = ATTACK_WEIGHTS['SHOT']['BLOCKED']
    
    # PASSES
    elif event_type == 'PASS':
        pass_type = row.get('pass_type', None)
        
        # Check for assist
        if pass_type == 'SHOT_ASSIST':
            points = ATTACK_WEIGHTS['PASS']['SHOT_ASSIST']
        
        # Check if into penalty area
        elif is_into_penalty_area(row):
            points = ATTACK_WEIGHTS['PASS']['INTO_PENALTY_AREA']
        
        # Check if progressive
        elif result == 'COMPLETE' and is_progressive_pass(row):
            points = ATTACK_WEIGHTS['PASS']['PROGRESSIVE_COMPLETE']
        
        # Zone-based scoring for complete passes
        elif result == 'COMPLETE':
            if zone == 'attacking_third':
                points = ATTACK_WEIGHTS['PASS']['COMPLETE_ATTACKING']
            elif zone == 'middle_third':
                points = ATTACK_WEIGHTS['PASS']['COMPLETE_MIDDLE']
            elif zone == 'defensive_third':
                points = ATTACK_WEIGHTS['PASS']['COMPLETE_DEFENSIVE']
        
        # Failed pass penalty
        elif result == 'INCOMPLETE':
            points = ATTACK_WEIGHTS['PASS']['INCOMPLETE']
    
    # CARRIES
    elif event_type == 'CARRY':
        if result == 'COMPLETE':
            if is_progressive_carry(row):
                points = ATTACK_WEIGHTS['CARRY']['PROGRESSIVE_COMPLETE']
            elif zone == 'attacking_third':
                points = ATTACK_WEIGHTS['CARRY']['COMPLETE_ATTACKING']
            elif zone == 'middle_third':
                points = ATTACK_WEIGHTS['CARRY']['COMPLETE_MIDDLE']
        elif result == 'INCOMPLETE':
            points = ATTACK_WEIGHTS['CARRY']['INCOMPLETE']
    
    # DUELS (attacking contribution)
    elif event_type == 'DUEL':
        if result == 'WON':
            if zone == 'attacking_third':
                points = ATTACK_WEIGHTS['DUEL']['WON_ATTACKING']
            elif zone == 'middle_third':
                points = ATTACK_WEIGHTS['DUEL']['WON_MIDDLE']
        elif result == 'LOST':
            points = ATTACK_WEIGHTS['DUEL']['LOST']
    
    return points

print("Testing attack points calculation on sample events...")

# Test on a few events
test_events = all_events.filter(
    pl.col('event_type').is_in(['PASS', 'SHOT', 'CARRY', 'DUEL'])
).sample(5)

for row in test_events.iter_rows(named=True):
    pts = calculate_attack_points(row)
    print(f"{row['event_type']:15s} | {str(row['result']):15s} | {row['zone']:20s} | Points: {pts:>5.2f}")


Testing attack points calculation on sample events...
CARRY           | COMPLETE        | middle_third         | Points:  0.40
CARRY           | COMPLETE        | attacking_third      | Points:  0.80
CARRY           | COMPLETE        | middle_third         | Points:  0.40
CARRY           | COMPLETE        | middle_third         | Points:  0.40
PASS            | COMPLETE        | defensive_third      | Points:  0.20

✅ Attack points function ready


In [None]:
# Cell 6: Calculate defense points
def calculate_defense_points(row):
    """
    Calculate defense rating points for a single event.
    """
    event_type = row['event_type']
    result = row['result']
    zone = row['zone']
    
    points = 0.0
    
    # DUELS (defensive contribution)
    if event_type == 'DUEL':
        if result == 'WON':
            if zone == 'defensive_third':
                points = DEFENSE_WEIGHTS['DUEL']['WON_DEFENSIVE']
            elif zone == 'middle_third':
                points = DEFENSE_WEIGHTS['DUEL']['WON_MIDDLE']
            elif zone == 'attacking_third':
                points = DEFENSE_WEIGHTS['DUEL']['WON_ATTACKING']
        elif result == 'LOST':
            if zone == 'defensive_third':
                points = DEFENSE_WEIGHTS['DUEL']['LOST_DEFENSIVE']
            elif zone == 'middle_third':
                points = DEFENSE_WEIGHTS['DUEL']['LOST_MIDDLE']
            elif zone == 'attacking_third':
                points = DEFENSE_WEIGHTS['DUEL']['LOST_ATTACKING']
    
    # INTERCEPTIONS
    elif event_type == 'INTERCEPTION':
        if zone == 'defensive_third':
            points = DEFENSE_WEIGHTS['INTERCEPTION']['DEFENSIVE']
        elif zone == 'middle_third':
            points = DEFENSE_WEIGHTS['INTERCEPTION']['MIDDLE']
        elif zone == 'attacking_third':
            points = DEFENSE_WEIGHTS['INTERCEPTION']['ATTACKING']
    
    # CLEARANCES
    elif event_type == 'CLEARANCE':
        if zone == 'defensive_third':
            points = DEFENSE_WEIGHTS['CLEARANCE']['DEFENSIVE']
        elif zone == 'middle_third':
            points = DEFENSE_WEIGHTS['CLEARANCE']['MIDDLE']
    
    # RECOVERIES
    elif event_type == 'RECOVERY':
        if zone == 'defensive_third':
            points = DEFENSE_WEIGHTS['RECOVERY']['DEFENSIVE']
        elif zone == 'middle_third':
            points = DEFENSE_WEIGHTS['RECOVERY']['MIDDLE']
        elif zone == 'attacking_third':
            points = DEFENSE_WEIGHTS['RECOVERY']['ATTACKING']
    
    return points

print("Testing defense points calculation on sample events...")

# Test on defensive events
test_events = all_events.filter(
    pl.col('event_type').is_in(['DUEL', 'INTERCEPTION', 'CLEARANCE', 'RECOVERY'])
).sample(5)

for row in test_events.iter_rows(named=True):
    pts = calculate_defense_points(row)
    print(f"{row['event_type']:15s} | {str(row['result']):15s} | {row['zone']:20s} | Points: {pts:>5.2f}")

print("\n Defense points function ready")

Testing defense points calculation on sample events...
CLEARANCE       | None            | middle_third         | Points:  1.00
RECOVERY        | None            | middle_third         | Points:  1.00
DUEL            | WON             | defensive_third      | Points:  3.00
RECOVERY        | None            | attacking_third      | Points:  0.80
CLEARANCE       | None            | middle_third         | Points:  1.00

✅ Defense points function ready


In [None]:
# Cell 7: Calculate passing points
def calculate_passing_points(row):
    """
    Calculate passing rating points for a single event.
    """
    event_type = row['event_type']
    result = row['result']
    
    points = 0.0
    
    # Only PASS events contribute to passing rating
    if event_type == 'PASS':
        pass_type = row.get('pass_type', None)
        
        # Assist bonus
        if pass_type == 'SHOT_ASSIST':
            points = PASSING_WEIGHTS['PASS']['SHOT_ASSIST']
        
        # Complete passes
        elif result == 'COMPLETE':
            # Long pass bonus
            if is_long_pass(row):
                points = PASSING_WEIGHTS['PASS']['COMPLETE_LONG']
            # Progressive pass bonus
            elif is_progressive_pass(row):
                points = PASSING_WEIGHTS['PASS']['COMPLETE_PROGRESSIVE']
            # Regular complete pass
            else:
                points = PASSING_WEIGHTS['PASS']['COMPLETE']
        
        # Failed passes
        elif result == 'INCOMPLETE':
            # Long pass failure
            if is_long_pass(row):
                points = PASSING_WEIGHTS['PASS']['INCOMPLETE_LONG']
            # Regular failure
            else:
                points = PASSING_WEIGHTS['PASS']['INCOMPLETE']
    
    return points

print("Testing passing points calculation on sample events...")

# Test on passes
test_events = all_events.filter(
    pl.col('event_type') == 'PASS'
).sample(5)

for row in test_events.iter_rows(named=True):
    pts = calculate_passing_points(row)
    is_prog = is_progressive_pass(row)
    is_long = is_long_pass(row)
    print(f"{row['result']:15s} | Progressive: {is_prog} | Long: {is_long} | Points: {pts:>5.2f}")


Testing passing points calculation on sample events...
COMPLETE        | Progressive: True | Long: False | Points:  1.20
COMPLETE        | Progressive: False | Long: False | Points:  0.50
COMPLETE        | Progressive: False | Long: False | Points:  0.50
INCOMPLETE      | Progressive: True | Long: True | Points: -0.50
COMPLETE        | Progressive: False | Long: False | Points:  0.50

✅ Passing points function ready


In [None]:
# Cell 8: Calculate points for all events
print("Calculating points for all 962,990 events...")

import time
start_time = time.time()

# Convert to list of dicts for faster iteration (Polars is fast, but Python functions are the bottleneck)
events_list = all_events.to_dicts()

# Calculate points
attack_points_list = []
defense_points_list = []
passing_points_list = []

for i, row in enumerate(events_list):
    attack_points_list.append(calculate_attack_points(row))
    defense_points_list.append(calculate_defense_points(row))
    passing_points_list.append(calculate_passing_points(row))
    
    # Progress indicator
    if (i + 1) % 100000 == 0:
        elapsed = time.time() - start_time
        progress = (i + 1) / len(events_list)
        remaining = (elapsed / progress) * (1 - progress)
        print(f"  Processed {i+1:,}/{len(events_list):,} events ({progress:.1%}) | "
              f"Elapsed: {elapsed:.1f}s | Est. remaining: {remaining:.1f}s")

# Add points as new columns
all_events = all_events.with_columns([
    pl.Series("attack_points", attack_points_list),
    pl.Series("defense_points", defense_points_list),
    pl.Series("passing_points", passing_points_list),
])

elapsed_total = time.time() - start_time
print(f"\n Calculated points for all events in {elapsed_total:.1f} seconds")

# Summary statistics
print("POINTS DISTRIBUTION SUMMARY")

print("\nAttack Points:")
print(f"  Total: {all_events['attack_points'].sum():,.0f}")
print(f"  Mean: {all_events['attack_points'].mean():.3f}")
print(f"  Positive events: {(all_events['attack_points'] > 0).sum():,}")
print(f"  Negative events: {(all_events['attack_points'] < 0).sum():,}")

print("\nDefense Points:")
print(f"  Total: {all_events['defense_points'].sum():,.0f}")
print(f"  Mean: {all_events['defense_points'].mean():.3f}")
print(f"  Positive events: {(all_events['defense_points'] > 0).sum():,}")
print(f"  Negative events: {(all_events['defense_points'] < 0).sum():,}")

print("\nPassing Points:")
print(f"  Total: {all_events['passing_points'].sum():,.0f}")
print(f"  Mean: {all_events['passing_points'].mean():.3f}")
print(f"  Positive events: {(all_events['passing_points'] > 0).sum():,}")
print(f"  Negative events: {(all_events['passing_points'] < 0).sum():,}")

# Sample events with points
print("SAMPLE EVENTS WITH POINTS")
print(all_events.select([
    'event_type', 'result', 'zone', 
    'attack_points', 'defense_points', 'passing_points'
]).sample(10))

Calculating points for all 962,990 events...
⏳ This may take 2-3 minutes...
  Processed 100,000/962,990 events (10.4%) | Elapsed: 18.1s | Est. remaining: 156.0s
  Processed 200,000/962,990 events (20.8%) | Elapsed: 19.0s | Est. remaining: 72.5s
  Processed 300,000/962,990 events (31.2%) | Elapsed: 19.8s | Est. remaining: 43.8s
  Processed 400,000/962,990 events (41.5%) | Elapsed: 20.6s | Est. remaining: 29.0s
  Processed 500,000/962,990 events (51.9%) | Elapsed: 21.3s | Est. remaining: 19.7s
  Processed 600,000/962,990 events (62.3%) | Elapsed: 21.8s | Est. remaining: 13.2s
  Processed 700,000/962,990 events (72.7%) | Elapsed: 22.3s | Est. remaining: 8.4s
  Processed 800,000/962,990 events (83.1%) | Elapsed: 23.1s | Est. remaining: 4.7s
  Processed 900,000/962,990 events (93.5%) | Elapsed: 23.7s | Est. remaining: 1.7s

✅ Calculated points for all events in 24.1 seconds

POINTS DISTRIBUTION SUMMARY

Attack Points:
  Total: 301,062
  Mean: 0.313
  Positive events: 407,988
  Negative even

In [None]:
# Cell 9: Save processed events
print("Saving events with calculated points...")

all_events.write_parquet(PROCESSED_DIR / "all_events_with_points.parquet")

print(f"Saved to: {PROCESSED_DIR / 'all_events_with_points.parquet'}")
print(f"   File size: {(PROCESSED_DIR / 'all_events_with_points.parquet').stat().st_size / 1_000_000:.1f} MB")

Saving events with calculated points...
✅ Saved to: ..\data\processed\all_events_with_points.parquet
   File size: 17.0 MB


In [None]:
# Cell 10: Aggregate points by player
print("Aggregating points by player...")

player_ratings = (
    all_events
    .filter(pl.col('player_id').is_not_null())  # Remove null player_ids
    .group_by(['player_id', 'team_id'])
    .agg([
        # Sum all points
        pl.sum('attack_points').alias('total_attack_points'),
        pl.sum('defense_points').alias('total_defense_points'),
        pl.sum('passing_points').alias('total_passing_points'),
        
        # Count events for diagnostics
        pl.len().alias('total_events'),
        pl.col('match_id').n_unique().alias('matches_played'),
        
        # Calculate average field position for position classification
        pl.mean('coordinates_x').alias('avg_x_position'),
        pl.mean('coordinates_y').alias('avg_y_position'),
    ])
    .sort('total_attack_points', descending=True)
)

print(f"\nAggregated ratings for {len(player_ratings)} players")
print(f"\nTop 10 players by attack points:")
print(player_ratings.select(['player_id', 'team_id', 'total_attack_points', 'matches_played']).head(10))

print(f"\nTop 10 players by defense points:")
print(player_ratings.sort('total_defense_points', descending=True).select(['player_id', 'team_id', 'total_defense_points', 'matches_played']).head(10))

print(f"\nTop 10 players by passing points:")
print(player_ratings.sort('total_passing_points', descending=True).select(['player_id', 'team_id', 'total_passing_points', 'matches_played']).head(10))

Aggregating points by player...

✅ Aggregated ratings for 506 players

Top 10 players by attack points:
shape: (10, 4)
┌───────────┬─────────┬─────────────────────┬────────────────┐
│ player_id ┆ team_id ┆ total_attack_points ┆ matches_played │
│ ---       ┆ ---     ┆ ---                 ┆ ---            │
│ str       ┆ str     ┆ f64                 ┆ u32            │
╞═══════════╪═════════╪═════════════════════╪════════════════╡
│ 281       ┆ 41      ┆ 3405.0              ┆ 33             │
│ 98        ┆ 33      ┆ 2724.5              ┆ 28             │
│ 5616      ┆ 41      ┆ 2683.7              ┆ 33             │
│ 27123     ┆ 46      ┆ 2573.0              ┆ 31             │
│ 64023     ┆ 41      ┆ 2471.3              ┆ 32             │
│ 1481      ┆ 416     ┆ 2349.1              ┆ 32             │
│ 13823     ┆ 37      ┆ 2340.7              ┆ 31             │
│ 32214     ┆ 29      ┆ 2287.9              ┆ 33             │
│ 1359      ┆ 46      ┆ 2153.0              ┆ 33             │

In [None]:
# Cell 10A: Extract player names from match metadata
from kloppy import impect
from tqdm.notebook import tqdm
print("Extracting player names from dataset metadata...")

# Load one match to get player metadata structure
sample_match_id = matches['matchId'][0]
sample_dataset = impect.load_open_data(match_id=sample_match_id, competition_id=743)

# Check metadata structure
print("\nDataset metadata structure:")
print(f"  Teams: {len(sample_dataset.metadata.teams)}")
print(f"  Periods: {len(sample_dataset.metadata.periods)}")

# Extract player information from metadata
player_names_dict = {}

print("\nExtracting player names from all matches...")
for match_id in tqdm(matches['matchId'][:10], desc="Sampling matches for player names"):  # Sample first 10 matches
    try:
        dataset = impect.load_open_data(match_id=match_id, competition_id=743)
        
        # Get players from both teams
        for team in dataset.metadata.teams:
            for player in team.players:
                if player.player_id not in player_names_dict:
                    player_names_dict[player.player_id] = {
                        'player_name': player.name if hasattr(player, 'name') else f"Player_{player.player_id}",
                        'jersey_no': player.jersey_no if hasattr(player, 'jersey_no') else None
                    }
    except Exception as e:
        continue

print(f"\n Found names for {len(player_names_dict)} players")

# Convert to dataframe
player_names_df = pl.DataFrame([
    {'player_id': str(pid), 'player_name': info['player_name'], 'jersey_no': info['jersey_no']}
    for pid, info in player_names_dict.items()
])

# Join with player ratings
player_ratings = player_ratings.join(
    player_names_df,
    on='player_id',
    how='left'
)

print("\nTop 10 by attack points (with names):")
print(player_ratings.select(['player_name', 'player_id', 'total_attack_points', 'matches_played']).head(10))


Extracting player names from dataset metadata...



You are about to use IMPECT public data.
By using this data, you are agreeing to the user agreement. 
The user agreement can be found here: https://github.com/ImpectAPI/open-data/blob/main/LICENSE.pdf




Dataset metadata structure:
  Teams: 2
  Periods: 2

Extracting player names from all matches...


Sampling matches for player names:   0%|          | 0/10 [00:00<?, ?it/s]


✅ Found names for 363 players

Top 10 by attack points (with names):
shape: (10, 4)
┌────────────────────┬───────────┬─────────────────────┬────────────────┐
│ player_name        ┆ player_id ┆ total_attack_points ┆ matches_played │
│ ---                ┆ ---       ┆ ---                 ┆ ---            │
│ str                ┆ str       ┆ f64                 ┆ u32            │
╞════════════════════╪═══════════╪═════════════════════╪════════════════╡
│ Granit Xhaka       ┆ 281       ┆ 3405.0              ┆ 33             │
│ Joshua Kimmich     ┆ 98        ┆ 2724.5              ┆ 28             │
│ Alejandro Grimaldo ┆ 5616      ┆ 2683.7              ┆ 33             │
│ Angelo Stiller     ┆ 27123     ┆ 2573.0              ┆ 31             │
│ Florian Wirtz      ┆ 64023     ┆ 2471.3              ┆ 32             │
│ Kevin Stöger       ┆ 1481      ┆ 2349.1              ┆ 32             │
│ David Raum         ┆ 13823     ┆ 2340.7              ┆ 31             │
│ Nico Schlotterbeck ┆ 3221

In [31]:
# Check where null names come from
print("Investigating null-named players...")

# Check in old ratings
null_in_old = player_ratings.filter(pl.col('player_name').is_null())
print(f"\nNull names in old ratings: {len(null_in_old)}")
if len(null_in_old) > 0:
    print(null_in_old.select(['player_id', 'team_id', 'matches_played', 'total_events']).head())

# Check if they exist in events
if len(null_players) > 0:
    sample_null_id = null_players['player_id'][0]
    null_events = all_events.filter(pl.col('player_id') == sample_null_id)
    print(f"\nEvents for first null player (ID={sample_null_id}): {len(null_events)}")
    print(f"Matches played: {null_events['match_id'].n_unique()}")

Investigating null-named players...

Null names in old ratings: 151
shape: (5, 4)
┌───────────┬─────────┬────────────────┬──────────────┐
│ player_id ┆ team_id ┆ matches_played ┆ total_events │
│ ---       ┆ ---     ┆ ---            ┆ ---          │
│ str       ┆ str     ┆ u32            ┆ u32          │
╞═══════════╪═════════╪════════════════╪══════════════╡
│ 1333      ┆ 30      ┆ 30             ┆ 4387         │
│ 6371      ┆ 432     ┆ 31             ┆ 2987         │
│ 53110     ┆ 41      ┆ 25             ┆ 4230         │
│ 1047      ┆ 39      ┆ 30             ┆ 4509         │
│ 568       ┆ 32      ┆ 30             ┆ 5113         │
└───────────┴─────────┴────────────────┴──────────────┘

Events for first null player (ID=1333): 4387
Matches played: 30


In [18]:
print("\nTop 10 by attack points (with names):")
print(player_ratings.select(['player_name', 'player_id', 'total_attack_points', 'matches_played']).head(10))
print(f"\nTop 10 players by defense points:")
print(player_ratings.sort('total_defense_points', descending=True).select(['player_name','player_id', 'team_id', 'total_defense_points', 'matches_played']).head(10))

print(f"\nTop 10 players by passing points:")
print(player_ratings.sort('total_passing_points', descending=True).select(['player_name','player_id', 'team_id', 'total_passing_points', 'matches_played']).head(10))


Top 10 by attack points (with names):
shape: (10, 4)
┌────────────────────┬───────────┬─────────────────────┬────────────────┐
│ player_name        ┆ player_id ┆ total_attack_points ┆ matches_played │
│ ---                ┆ ---       ┆ ---                 ┆ ---            │
│ str                ┆ str       ┆ f64                 ┆ u32            │
╞════════════════════╪═══════════╪═════════════════════╪════════════════╡
│ Granit Xhaka       ┆ 281       ┆ 3405.0              ┆ 33             │
│ Joshua Kimmich     ┆ 98        ┆ 2724.5              ┆ 28             │
│ Alejandro Grimaldo ┆ 5616      ┆ 2683.7              ┆ 33             │
│ Angelo Stiller     ┆ 27123     ┆ 2573.0              ┆ 31             │
│ Florian Wirtz      ┆ 64023     ┆ 2471.3              ┆ 32             │
│ Kevin Stöger       ┆ 1481      ┆ 2349.1              ┆ 32             │
│ David Raum         ┆ 13823     ┆ 2340.7              ┆ 31             │
│ Nico Schlotterbeck ┆ 32214     ┆ 2287.9              ┆ 3

In [None]:
# Cell 11: Classify positions using average x-coordinate
print("Classifying player positions based on average field position...")
print("="*60)

def classify_position(avg_x, attack_ratio, defense_ratio):
    """
    Classify player position based on:
    - avg_x: Average x-coordinate (field position)
    - attack_ratio: Proportion of points from attack
    - defense_ratio: Proportion of points from defense
    
    Position zones (secondspectrum coordinates):
    - Forward: avg_x > 10
    - Midfielder: -10 <= avg_x <= 10
    - Defender: avg_x < -10
    """
    
    # Field position-based classification
    if avg_x > 10:
        return 'forward'
    elif avg_x < -10:
        return 'defender'
    else:
        # Midfielders - further classify by attack/defense ratio
        return 'midfielder'

# Calculate attack/defense ratios
player_ratings = player_ratings.with_columns([
    # Total points (for ratio calculation)
    (pl.col('total_attack_points') + pl.col('total_defense_points') + pl.col('total_passing_points')).alias('total_points'),
])

# Calculate ratios (avoid division by zero)
player_ratings = player_ratings.with_columns([
    (pl.col('total_attack_points') / pl.col('total_points')).fill_nan(0).alias('attack_ratio'),
    (pl.col('total_defense_points') / pl.col('total_points')).fill_nan(0).alias('defense_ratio'),
])

# Apply position classification
positions = []
for row in player_ratings.iter_rows(named=True):
    pos = classify_position(
        row['avg_x_position'] if row['avg_x_position'] is not None else 0,
        row['attack_ratio'],
        row['defense_ratio']
    )
    positions.append(pos)

player_ratings = player_ratings.with_columns([
    pl.Series('position', positions)
])

# Position distribution
print("\nPosition distribution:")
print(player_ratings.group_by('position').agg([
    pl.len().alias('count'),
    pl.mean('avg_x_position').alias('avg_x'),
    pl.mean('attack_ratio').alias('avg_attack_ratio'),
    pl.mean('defense_ratio').alias('avg_defense_ratio'),
]).sort('position'))

print("\n Positions classified")

Classifying player positions based on average field position...

Position distribution:
shape: (3, 5)
┌────────────┬───────┬────────────┬──────────────────┬───────────────────┐
│ position   ┆ count ┆ avg_x      ┆ avg_attack_ratio ┆ avg_defense_ratio │
│ ---        ┆ ---   ┆ ---        ┆ ---              ┆ ---               │
│ str        ┆ u32   ┆ f64        ┆ f64              ┆ f64               │
╞════════════╪═══════╪════════════╪══════════════════╪═══════════════════╡
│ defender   ┆ 123   ┆ -22.713444 ┆ 0.41697          ┆ 0.281092          │
│ forward    ┆ 127   ┆ 14.075714  ┆ 0.712896         ┆ 0.121037          │
│ midfielder ┆ 256   ┆ 1.165859   ┆ 0.527223         ┆ 0.25599           │
└────────────┴───────┴────────────┴──────────────────┴───────────────────┘

✅ Positions classified


In [None]:
# Cell 12 - FIXED: Better minutes estimation
print("Estimating minutes played per player (FIXED)...")
print("="*60)

# Better approach: Assume players with more events played more minutes
# Use a simple heuristic: events_per_90 approach

# Calculate match-level statistics
match_stats = all_events.group_by('match_id').agg([
    pl.len().alias('total_events'),
])

avg_events_per_match = match_stats['total_events'].mean()
print(f"Average events per match: {avg_events_per_match:.0f}")

# For each player, estimate minutes based on their event participation rate
# Assumption: A player with 100 events in 1 match likely played ~90 minutes
# A player with 50 events in 1 match likely played ~45 minutes

# Calculate events per match for each player
player_ratings = player_ratings.with_columns([
    (pl.col('total_events') / pl.col('matches_played')).alias('events_per_match')
])

# Estimate minutes per match: 
# Scale linearly: if avg player has ~35 events per match over 90 min,
# then minutes = (player_events / 35) * 90, capped at 90
avg_starter_events = 35  # Rough estimate for a full-90 player

player_ratings = player_ratings.with_columns([
    (
        (pl.col('events_per_match') / avg_starter_events) * 90
    ).clip(0, 90).alias('estimated_minutes_per_match')
])

# Total minutes
player_ratings = player_ratings.with_columns([
    (pl.col('estimated_minutes_per_match') * pl.col('matches_played')).alias('estimated_total_minutes')
])

print("\nMinutes played distribution:")
print(f"  Min: {player_ratings['estimated_total_minutes'].min():.0f}")
print(f"  Max: {player_ratings['estimated_total_minutes'].max():.0f}")
print(f"  Mean: {player_ratings['estimated_total_minutes'].mean():.0f}")
print(f"  Median: {player_ratings['estimated_total_minutes'].median():.0f}")

# Filter for significant playing time
player_ratings_filtered = player_ratings.filter(pl.col('estimated_total_minutes') >= 500)

print(f"\n Players with >500 minutes: {len(player_ratings_filtered)} / {len(player_ratings)}")
print(f" Players with >1000 minutes: {(player_ratings['estimated_total_minutes'] >= 1000).sum()} / {len(player_ratings)}")
print(f" Players with >2000 minutes: {(player_ratings['estimated_total_minutes'] >= 2000).sum()} / {len(player_ratings)}")

# Show distribution
print("\nSample of players with minutes:")
print(player_ratings.select([
    'player_name','player_id', 'matches_played', 'total_events', 'events_per_match', 
    'estimated_minutes_per_match', 'estimated_total_minutes'
]).sort('estimated_total_minutes', descending=True).head(10))

Estimating minutes played per player (FIXED)...
Average events per match: 3147

Minutes played distribution:
  Min: 8
  Max: 3060
  Mean: 1650
  Median: 1890

✅ Players with >500 minutes: 396 / 506
✅ Players with >1000 minutes: 347 / 506
✅ Players with >2000 minutes: 229 / 506

Sample of players with minutes:
shape: (10, 7)
┌────────────────┬───────────┬────────────────┬──────────────┬──────────────────┬───────────────────┬──────────────────┐
│ player_name    ┆ player_id ┆ matches_played ┆ total_events ┆ events_per_match ┆ estimated_minutes ┆ estimated_total_ │
│ ---            ┆ ---       ┆ ---            ┆ ---          ┆ ---              ┆ _per_match        ┆ minutes          │
│ str            ┆ str       ┆ u32            ┆ u32          ┆ f64              ┆ ---               ┆ ---              │
│                ┆           ┆                ┆              ┆                  ┆ f64               ┆ f64              │
╞════════════════╪═══════════╪════════════════╪══════════════╪═══════

In [None]:
# Cell 13: Normalize to per-90 minutes
print("Normalizing ratings to per-90 minutes...")

# Calculate per-90 ratings
player_ratings = player_ratings.with_columns([
    ((pl.col('total_attack_points') / pl.col('estimated_total_minutes')) * 90).alias('attack_per90'),
    ((pl.col('total_defense_points') / pl.col('estimated_total_minutes')) * 90).alias('defense_per90'),
    ((pl.col('total_passing_points') / pl.col('estimated_total_minutes')) * 90).alias('passing_per90'),
])

# Filter for players with significant playing time (>500 minutes)
player_ratings_filtered = player_ratings.filter(pl.col('estimated_total_minutes') >= 500)

print(f" {len(player_ratings_filtered)} players with >500 minutes")

print("\nTop 10 by attack per-90:")
print(player_ratings_filtered
      .select(['player_name', 'position', 'attack_per90', 'estimated_total_minutes', 'matches_played'])
      .sort('attack_per90', descending=True)
      .head(10))

print("\nTop 10 by defense per-90:")
print(player_ratings_filtered
      .select(['player_name', 'position', 'defense_per90', 'estimated_total_minutes', 'matches_played'])
      .sort('defense_per90', descending=True)
      .head(10))

print("\nTop 10 by passing per-90:")
print(player_ratings_filtered
      .select(['player_name', 'position', 'passing_per90', 'estimated_total_minutes', 'matches_played'])
      .sort('passing_per90', descending=True)
      .head(10))

Normalizing ratings to per-90 minutes...
✅ 396 players with >500 minutes

Top 10 by attack per-90:
shape: (10, 5)
┌────────────────────┬────────────┬──────────────┬─────────────────────────┬────────────────┐
│ player_name        ┆ position   ┆ attack_per90 ┆ estimated_total_minutes ┆ matches_played │
│ ---                ┆ ---        ┆ ---          ┆ ---                     ┆ ---            │
│ str                ┆ str        ┆ f64          ┆ f64                     ┆ u32            │
╞════════════════════╪════════════╪══════════════╪═════════════════════════╪════════════════╡
│ Granit Xhaka       ┆ midfielder ┆ 103.181818   ┆ 2970.0                  ┆ 33             │
│ Joshua Kimmich     ┆ midfielder ┆ 97.303571    ┆ 2520.0                  ┆ 28             │
│ Angelo Stiller     ┆ midfielder ┆ 83.0         ┆ 2790.0                  ┆ 31             │
│ Exequiel Palacios  ┆ midfielder ┆ 81.379167    ┆ 2160.0                  ┆ 24             │
│ Alejandro Grimaldo ┆ midfielder ┆ 81.3

In [None]:
# Cell 14: Scale ratings to 0-100
from sklearn.preprocessing import MinMaxScaler

print("Scaling ratings to 0-100 scale...")

# Extract data for scaling
attack_per90 = player_ratings_filtered['attack_per90'].to_numpy().reshape(-1, 1)
defense_per90 = player_ratings_filtered['defense_per90'].to_numpy().reshape(-1, 1)
passing_per90 = player_ratings_filtered['passing_per90'].to_numpy().reshape(-1, 1)

# Scale to 0-100
scaler_attack = MinMaxScaler(feature_range=(0, 100))
scaler_defense = MinMaxScaler(feature_range=(0, 100))
scaler_passing = MinMaxScaler(feature_range=(0, 100))

attack_rating = scaler_attack.fit_transform(attack_per90).flatten()
defense_rating = scaler_defense.fit_transform(defense_per90).flatten()
passing_rating = scaler_passing.fit_transform(passing_per90).flatten()

# Add scaled ratings
player_ratings_filtered = player_ratings_filtered.with_columns([
    pl.Series('attack_rating', attack_rating),
    pl.Series('defense_rating', defense_rating),
    pl.Series('passing_rating', passing_rating),
])

print(" Ratings scaled to 0-100")

print("\nRating distributions:")
print(f"  Attack:  min={player_ratings_filtered['attack_rating'].min():.1f}, "
      f"max={player_ratings_filtered['attack_rating'].max():.1f}, "
      f"mean={player_ratings_filtered['attack_rating'].mean():.1f}")
print(f"  Defense: min={player_ratings_filtered['defense_rating'].min():.1f}, "
      f"max={player_ratings_filtered['defense_rating'].max():.1f}, "
      f"mean={player_ratings_filtered['defense_rating'].mean():.1f}")
print(f"  Passing: min={player_ratings_filtered['passing_rating'].min():.1f}, "
      f"max={player_ratings_filtered['passing_rating'].max():.1f}, "
      f"mean={player_ratings_filtered['passing_rating'].mean():.1f}")

print("\nTop 5 by each rating:")
print("\nAttack Rating:")
print(player_ratings_filtered
      .select(['player_name', 'position', 'attack_rating', 'matches_played'])
      .sort('attack_rating', descending=True)
      .head(5))

print("\nDefense Rating:")
print(player_ratings_filtered
      .select(['player_name', 'position', 'defense_rating', 'matches_played'])
      .sort('defense_rating', descending=True)
      .head(5))

print("\nPassing Rating:")
print(player_ratings_filtered
      .select(['player_name', 'position', 'passing_rating', 'matches_played'])
      .sort('passing_rating', descending=True)
      .head(5))

Scaling ratings to 0-100 scale...
✅ Ratings scaled to 0-100

Rating distributions:
  Attack:  min=0.0, max=100.0, mean=25.2
  Defense: min=0.0, max=100.0, mean=30.7
  Passing: min=0.0, max=100.0, mean=24.5

Top 5 by each rating:

Attack Rating:
shape: (5, 4)
┌────────────────────┬────────────┬───────────────┬────────────────┐
│ player_name        ┆ position   ┆ attack_rating ┆ matches_played │
│ ---                ┆ ---        ┆ ---           ┆ ---            │
│ str                ┆ str        ┆ f64           ┆ u32            │
╞════════════════════╪════════════╪═══════════════╪════════════════╡
│ Granit Xhaka       ┆ midfielder ┆ 100.0         ┆ 33             │
│ Joshua Kimmich     ┆ midfielder ┆ 93.92114      ┆ 28             │
│ Angelo Stiller     ┆ midfielder ┆ 79.129415     ┆ 31             │
│ Exequiel Palacios  ┆ midfielder ┆ 77.453265     ┆ 24             │
│ Alejandro Grimaldo ┆ midfielder ┆ 77.396467     ┆ 33             │
└────────────────────┴────────────┴───────────────┴

In [None]:
# Cell 15: Calculate overall rating (position-specific)
print("Calculating position-specific overall ratings...")

# Apply position-specific weights
def calculate_overall_rating(row):
    position = row['position']
    attack = row['attack_rating']
    defense = row['defense_rating']
    passing = row['passing_rating']
    
    weights = POSITION_WEIGHTS.get(position, POSITION_WEIGHTS['midfielder'])  # Default to midfielder
    
    overall = (
        attack * weights['attack'] +
        defense * weights['defense'] +
        passing * weights['passing']
    )
    
    return overall

# Calculate overall ratings
overall_ratings = []
for row in player_ratings_filtered.iter_rows(named=True):
    overall_ratings.append(calculate_overall_rating(row))

player_ratings_filtered = player_ratings_filtered.with_columns([
    pl.Series('overall_rating', overall_ratings)
])

print(" Overall ratings calculated")

# Show position weight distribution
print("\nPosition-specific weight schema:")
for pos, weights in POSITION_WEIGHTS.items():
    print(f"  {pos:12s}: Attack={weights['attack']:.0%}, "
          f"Passing={weights['passing']:.0%}, "
          f"Defense={weights['defense']:.0%}")

print("\n" + "="*60)
print("TOP 20 PLAYERS - OVERALL RATING")
print("="*60)
print(player_ratings_filtered
      .select(['player_name', 'position', 'overall_rating', 
               'attack_rating', 'defense_rating', 'passing_rating', 
               'matches_played'])
      .sort('overall_rating', descending=True)
      .head(20))

print("\n" + "="*60)
print("TOP 10 BY POSITION")
print("="*60)

for position in ['forward', 'midfielder', 'defender']:
    print(f"\nTop 10 {position}s:")
    print(player_ratings_filtered
          .filter(pl.col('position') == position)
          .select(['player_name', 'overall_rating', 'attack_rating', 'defense_rating', 'passing_rating'])
          .sort('overall_rating', descending=True)
          .head(10))

Calculating position-specific overall ratings...
✅ Overall ratings calculated

Position-specific weight schema:
  forward     : Attack=60%, Passing=30%, Defense=10%
  midfielder  : Attack=35%, Passing=35%, Defense=30%
  defender    : Attack=15%, Passing=25%, Defense=60%
  goalkeeper  : Attack=5%, Passing=25%, Defense=70%

TOP 20 PLAYERS - OVERALL RATING
shape: (20, 7)
┌────────────────────┬────────────┬────────────────┬───────────────┬────────────────┬────────────────┬────────────────┐
│ player_name        ┆ position   ┆ overall_rating ┆ attack_rating ┆ defense_rating ┆ passing_rating ┆ matches_played │
│ ---                ┆ ---        ┆ ---            ┆ ---           ┆ ---            ┆ ---            ┆ ---            │
│ str                ┆ str        ┆ f64            ┆ f64           ┆ f64            ┆ f64            ┆ u32            │
╞════════════════════╪════════════╪════════════════╪═══════════════╪════════════════╪════════════════╪════════════════╡
│ Min-jae Kim        ┆ defend

In [None]:
# Cell 15B: Add position-normalized overall rating
print("Creating position-normalized overall ratings...")

position_normalized_ratings = []

for position in ['forward', 'midfielder', 'defender']:
    # Get players in this position
    position_players = player_ratings_filtered.filter(pl.col('position') == position)
    
    # Scale their overall ratings to 0-100 within position
    scaler = MinMaxScaler(feature_range=(0, 100))
    position_overall = position_players['overall_rating'].to_numpy().reshape(-1, 1)
    normalized = scaler.fit_transform(position_overall).flatten()
    
    # Store with player_id
    for i, player_id in enumerate(position_players['player_id']):
        position_normalized_ratings.append({
            'player_id': player_id,
            'position_normalized_rating': normalized[i]
        })

# Join back
position_norm_df = pl.DataFrame(position_normalized_ratings)
player_ratings_filtered = player_ratings_filtered.join(position_norm_df, on='player_id', how='left')

print("\n Added position-normalized ratings")

print("\nTop 5 forwards (position-normalized):")
print(player_ratings_filtered
      .filter(pl.col('position') == 'forward')
      .select(['player_name', 'overall_rating', 'position_normalized_rating'])
      .sort('position_normalized_rating', descending=True)
      .head(5))

print("\nTop forwards now have ratings in 80s-100s!")

Creating position-normalized overall ratings...

✅ Added position-normalized ratings

Top 5 forwards (position-normalized):
shape: (5, 3)
┌────────────────┬────────────────┬────────────────────────────┐
│ player_name    ┆ overall_rating ┆ position_normalized_rating │
│ ---            ┆ ---            ┆ ---                        │
│ str            ┆ f64            ┆ f64                        │
╞════════════════╪════════════════╪════════════════════════════╡
│ Florian Wirtz  ┆ 59.269926      ┆ 100.0                      │
│ Jonas Hofmann  ┆ 47.229855      ┆ 79.448339                  │
│ Leroy Sané     ┆ 46.758601      ┆ 78.643937                  │
│ Xavi Simons    ┆ 46.702979      ┆ 78.548994                  │
│ Kingsley Coman ┆ 43.442421      ┆ 72.983421                  │
└────────────────┴────────────────┴────────────────────────────┘

Top forwards now have ratings in 80s-100s!


In [None]:
# Cell 16: Save final player ratings
print("Saving final player ratings...")

# Save to processed directory
player_ratings_filtered.write_parquet(PROCESSED_DIR / "player_ratings_final.parquet")
player_ratings_filtered.write_csv(PROCESSED_DIR / "player_ratings_final.csv")

print(f" Saved to:")
print(f"   {PROCESSED_DIR / 'player_ratings_final.parquet'}")
print(f"   {PROCESSED_DIR / 'player_ratings_final.csv'}")

print(f"\nFinal dataset: {len(player_ratings_filtered)} players")
print(f"\nColumns: {player_ratings_filtered.columns}")

Saving final player ratings...
✅ Saved to:
   ..\data\processed\player_ratings_final.parquet
   ..\data\processed\player_ratings_final.csv

STEP 2 COMPLETE! 🎉

Final dataset: 402 players

Columns: ['player_id', 'team_id', 'total_attack_points', 'total_defense_points', 'total_passing_points', 'total_events', 'matches_played', 'avg_x_position', 'avg_y_position', 'total_points', 'attack_ratio', 'defense_ratio', 'position', 'estimated_minutes_per_match', 'estimated_total_minutes', 'player_name', 'jersey_no', 'events_per_match', 'attack_per90', 'defense_per90', 'passing_per90', 'attack_rating', 'defense_rating', 'passing_rating', 'overall_rating', 'position_normalized_rating']
