# Club América - Player Recommendation System
## Initial Exploratory Data Analysis

This notebook explores the StatsBomb data for Liga MX (Apertura 2021 - Clausura 2025) to understand:
- Available competitions and seasons
- Teams and matches
- Player data
- Event data structure
- 360 data availability

In [None]:
# Import libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from statsbombpy import sb
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket filter also hides pandas / library warnings that
# can point at real problems; consider narrowing it to the noisy source.
warnings.filterwarnings('ignore')

# Pandas display options: no column limit, at most 100 rows, width left unset
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

# Plot styling: seaborn grid theme and a wide default figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Authentication
Connect to the StatsBomb API with the provided credentials.

In [None]:
# Authenticate with StatsBomb API
# Credentials are read from the environment so that no secret is stored in
# the notebook (or in its version-control history). Export SB_USERNAME and
# SB_PASSWORD before starting the kernel.
# NOTE(review): a password was previously hardcoded in this cell; treat it as
# exposed and have it rotated.
# NOTE(review): SB_USERNAME / SB_PASSWORD are believed to be the variable
# names statsbombpy itself looks up - confirm against its README.
USERNAME = os.environ.get("SB_USERNAME")
PASSWORD = os.environ.get("SB_PASSWORD")

if not (USERNAME and PASSWORD):
    # Not fatal: the cells below never pass these values explicitly.
    print("StatsBomb credentials not found in SB_USERNAME / SB_PASSWORD")

# Note: StatsBombPy doesn't require explicit login for basic access
# Authentication is handled automatically when accessing premium data
print("StatsBomb connection ready")

## 2. Explore Available Competitions
Let's see what competitions are available, focusing on Liga MX

In [None]:
# Fetch the full competition / season listing and preview the first rows
competitions = sb.competitions()
print(
    f"Total competitions available: {len(competitions)}",
    "\nFirst few competitions:",
    sep="\n",
)
competitions.head(10)

In [None]:
# Keep only the rows whose competition name mentions "Liga MX"
# (case-insensitive; rows with a missing name are excluded)
is_liga_mx = competitions['competition_name'].str.contains('Liga MX', case=False, na=False)
liga_mx = competitions.loc[is_liga_mx]
print(f"Liga MX seasons available: {len(liga_mx)}")
# Displayed sorted by season; liga_mx itself keeps its original order
liga_mx.sort_values(by='season_name')

## 3. Explore Matches
Get all matches for Liga MX seasons

In [None]:
# Download the match list of every Liga MX season.
# A season that fails to load is reported and skipped (best effort).
all_matches = []

season_keys = zip(
    liga_mx['competition_id'],
    liga_mx['season_id'],
    liga_mx['season_name'],
)
for comp_id, season_id, season_name in season_keys:
    print(f"Fetching matches for {season_name}...")
    try:
        matches = sb.matches(competition_id=comp_id, season_id=season_id)
        # Tag each row with the season it was fetched for
        matches['season_name'] = season_name
        all_matches.append(matches)
        print(f"  Found {len(matches)} matches")
    except Exception as e:
        print(f"  Error: {e}")

# Stack the per-season frames into one table.
# df_matches is only defined when at least one season loaded.
if not all_matches:
    print("No matches found")
else:
    df_matches = pd.concat(all_matches, ignore_index=True)
    print(f"\nTotal matches: {len(df_matches)}")

In [None]:
# Inspect the schema of the combined matches table, then preview key fields
print("Matches columns:")
print(list(df_matches.columns))
print("\nSample matches:")
df_matches.loc[:, ['season_name', 'match_date', 'home_team', 'away_team', 'home_score', 'away_score']].head(10)

## 4. Focus on Club América
Filter and analyze Club América's matches

In [None]:
# Find Club América matches: the club appears as either the home or away side
# (case-insensitive substring match; missing team names never match)
home_is_america = df_matches['home_team'].str.contains('América', case=False, na=False)
away_is_america = df_matches['away_team'].str.contains('América', case=False, na=False)

# .copy() makes america_matches an independent frame. A later cell adds a
# 'result' column to it; without the copy that write targets a slice of
# df_matches, which is the pattern pandas flags with SettingWithCopyWarning.
america_matches = df_matches[home_is_america | away_is_america].copy()

print(f"Club América matches: {len(america_matches)}")
print(f"\nMatches by season:")
print(america_matches['season_name'].value_counts().sort_index())

# Show sample matches
print("\nSample Club América matches:")
america_matches[['season_name', 'match_date', 'home_team', 'away_team', 'home_score', 'away_score']].head(10)

In [None]:
# Club América performance
def get_team_result(row, team_name='América'):
    """Classify a match as a win, draw, or loss from one team's perspective.

    Args:
        row: Mapping-like match record (for example a DataFrame row) with
            'home_team', 'home_score' and 'away_score' entries.
        team_name: Case-insensitive substring identifying the team. When it
            is not found in row['home_team'], the team is treated as the
            away side.

    Returns:
        'Win', 'Loss' or 'Draw'; None when either score is missing
        (None/NaN). Comparisons against NaN are always False, so without
        this guard a match with no score would be reported as a draw.
    """
    home_goals = row['home_score']
    away_goals = row['away_score']
    if pd.isna(home_goals) or pd.isna(away_goals):
        return None

    # Orient the score line so the team of interest always comes first
    if team_name.lower() in row['home_team'].lower():
        team_goals, opponent_goals = home_goals, away_goals
    else:
        team_goals, opponent_goals = away_goals, home_goals

    if team_goals > opponent_goals:
        return 'Win'
    if team_goals < opponent_goals:
        return 'Loss'
    return 'Draw'

# Label every Club América match, then report the outcome distribution
america_matches['result'] = america_matches.apply(get_team_result, axis=1)
outcome_counts = america_matches['result'].value_counts()
print("\nClub América results:")
print(outcome_counts)
# Share of matches labelled 'Win' (the mean of a boolean mask is wins / matches)
win_share = (america_matches['result'] == 'Win').mean()
print(f"\nWin rate: {win_share * 100:.1f}%")

## 5. Sample Event Data
Explore event data structure from a sample match

In [None]:
# Use the first Club América match as the sample for event exploration
first_match = america_matches.iloc[0]
sample_match_id = first_match['match_id']
print(f"Loading events for match ID: {sample_match_id}")
print(f"Match: {first_match['home_team']} vs {first_match['away_team']}")

# On failure the error is printed and `events` is left unassigned; the
# cells below check for its existence before using it.
try:
    events = sb.events(match_id=sample_match_id)
    event_columns = events.columns.tolist()
    print(f"\nTotal events: {len(events)}")
    print(f"\nEvent columns ({len(event_columns)}):")
    print(event_columns)
except Exception as e:
    print(f"Error loading events: {e}")

In [None]:
# Explore event types (skipped when the sample events were never loaded)
if 'events' in locals():
    event_type_counts = events['type'].value_counts()
    print("Event types:")
    print(event_type_counts)

    # Preview the first 20 events with the timing / actor columns only
    print("\nSample events:")
    preview_columns = ['minute', 'second', 'type', 'team', 'player', 'position']
    display(events.loc[:, preview_columns].head(20))

## 6. Check for 360 Data
Check whether 360 freeze-frame data is available.

In [None]:
# Check for 360 data columns
# NOTE(review): this only inspects column names of the events table; 360
# frames may be served separately by the API (e.g. sb.frames) - confirm.
if 'events' in locals():
    print("Checking for 360 data...")

    # Collect columns whose name mentions "360" or "freeze" (any casing)
    cols_360 = []
    for column_name in events.columns:
        lowered = column_name.lower()
        if '360' in lowered or 'freeze' in lowered:
            cols_360.append(column_name)
    print(f"\n360-related columns: {cols_360}")

    # Fill rate per matching column; the loop is a no-op when none matched
    total_events = len(events)
    for col in cols_360:
        filled = events[col].notna().sum()
        print(f"  {col}: {filled} events with data ({filled/total_events*100:.1f}%)")

## 7. Player Statistics Overview
Analyze players involved in the data

In [None]:
# Build the distinct (player, team, position) combinations of the sample match
if 'events' in locals():
    has_player = events['player'].notna()
    players = events.loc[has_player, ['player', 'team', 'position']].drop_duplicates()
    print(f"Unique players in sample match: {len(players)}")

    # Row counts per team, then per position
    team_counts = players['team'].value_counts()
    print("\nPlayers by team:")
    print(team_counts)

    position_counts = players['position'].value_counts()
    print("\nPlayers by position:")
    print(position_counts)

## 8. Summary Statistics
Key insights from initial exploration

In [None]:
# Final recap of the figures gathered in the cells above
banner = "=" * 60
print(banner)
print("INITIAL EDA SUMMARY")
print(banner)

print(f"\n1. DATA AVAILABILITY")
print(f"   - Liga MX seasons: {len(liga_mx)}")
print(f"   - Total matches: {len(df_matches)}")
print(f"   - Club América matches: {len(america_matches)}")

# Tally each outcome once and reuse the counts in the printout
match_results = america_matches['result']
n_wins = (match_results == 'Win').sum()
n_draws = (match_results == 'Draw').sum()
n_losses = (match_results == 'Loss').sum()

print(f"\n2. CLUB AMÉRICA PERFORMANCE")
print(f"   - Wins: {n_wins}")
print(f"   - Draws: {n_draws}")
print(f"   - Losses: {n_losses}")
print(f"   - Win rate: {n_wins / len(america_matches) * 100:.1f}%")

# Event figures exist only when the sample match events were loaded
if 'events' in locals():
    print(f"\n3. EVENT DATA (Sample Match)")
    print(f"   - Total events: {len(events)}")
    print(f"   - Event types: {events['type'].nunique()}")
    print(f"   - Players involved: {len(players)}")

print("\n" + banner)

## Next Steps

1. **Deep dive into Club América's tactical profile**
   - Analyze passing patterns
   - Defensive actions and positioning
   - Attacking style and player movements

2. **Player-level analysis**
   - Extract player statistics across all matches
   - Identify key performance indicators
   - Analyze player roles and positions

3. **Event data deep dive**
   - Shot analysis (xG)
   - Passing networks
   - Defensive actions
   - Set pieces (corners, free kicks)

4. **Framework design**
   - Define metrics for player fit
   - Build comparison framework
   - Develop recommendation system