# Football Analytics - Exploratory Data Analysis
## Phase 3: Understanding the Data

**Dataset**: StatsBomb Open Data (2022-2024)

**Objectives**:
1. Understand data distributions
2. Identify correlations
3. Detect outliers
4. Validate data quality

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path

# Plot defaults shared by the matplotlib/seaborn charts below:
# seaborn's 'whitegrid' theme and a 12x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# The check mark was previously stored as mojibake (UTF-8 bytes read as cp1252).
print("✓ Libraries imported")

## 1. Load Data

In [None]:
# Load the processed per-player season statistics into `season_stats`.
# NOTE: the directory is relative to the working directory the notebook
# is executed from.
data_dir = Path('../data/processed')
stats_path = data_dir / 'players_season_stats.csv'

# Fail early and report the resolved absolute path, so a wrong working
# directory is obvious from the error message.
if not stats_path.is_file():
    raise FileNotFoundError(f"Processed stats file not found: {stats_path.resolve()}")

season_stats = pd.read_csv(stats_path)
print(f"✓ Loaded {len(season_stats)} player records")

# Display first rows (the bare expression is the cell's output)
season_stats.head()

## 2. Data Overview

In [None]:
# Print a titled banner, then let pandas report its own summary of the frame.
rule = "=" * 60
print(rule, "DATA OVERVIEW", rule, sep="\n")
season_stats.info()

In [None]:
# Descriptive statistics: DataFrame.describe() called with default arguments;
# the bare expression is the cell's displayed output.
# NOTE(review): with defaults pandas is expected to summarize only the numeric
# columns when any exist — confirm whether categorical columns should be
# included as well.
season_stats.describe()

In [None]:
# Count nulls per column and keep only the columns that have at least one,
# ordered from most to fewest missing entries.
missing = season_stats.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)

if missing.empty:
    # The check mark was previously stored as mojibake (UTF-8 read as cp1252).
    print("✓ No missing values!")
else:
    print("Missing Values:")
    print(missing)

## 3. Distribution Analysis

In [None]:
# Share of players per position category: a pie chart followed by the raw
# counts. The whole cell is skipped when the column is absent.
if 'position_category' in season_stats.columns:
    fig = px.pie(
        data_frame=season_stats,
        names='position_category',
        title='Player Distribution by Position',
    )
    fig.show()

    position_counts = season_stats['position_category'].value_counts()
    print("\nPosition Counts:")
    print(position_counts)

In [None]:
# Histograms of the key per-90 metrics, one panel per metric in a 2x2 grid.
# Only metrics that exist as columns in the loaded data are plotted.
metrics = ['goals_per90', 'assists_per90', 'xg_per90', 'tackles_per90']
available_metrics = [m for m in metrics if m in season_stats.columns]

# Skip the figure entirely when none of the metrics is present, instead of
# rendering four blank panels.
if available_metrics:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.ravel()

    # zip stops at the shorter sequence, so at most four metrics are drawn.
    for ax, metric in zip(axes, available_metrics):
        ax.hist(season_stats[metric].dropna(), bins=30, color='skyblue', edgecolor='black')
        ax.set_title(f'{metric} Distribution')
        ax.set_xlabel(metric)
        ax.set_ylabel('Frequency')

    # Hide panels that received no metric so the grid has no empty axes.
    for unused_ax in axes[len(available_metrics):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 4. Correlation Analysis

In [None]:
# Correlation heatmap over the numeric columns whose name contains
# 'per90' or 'score'. Nothing is drawn when no column matches.
numeric_cols = season_stats.select_dtypes(include=[np.number]).columns
correlation_cols = [
    col for col in numeric_cols
    if 'per90' in col or 'score' in col
]

if correlation_cols:
    corr_matrix = season_stats[correlation_cols].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
    )
    plt.title('Correlation Matrix - Key Metrics')
    plt.tight_layout()
    plt.show()

## 5. Position-Based Analysis

In [None]:
# Mean of each key metric per position category, rounded to 2 decimals.
if 'position_category' in season_stats.columns:
    position_metrics = [
        'goals_per90',
        'assists_per90',
        'tackles_per90',
        'offensive_score',
        'defensive_score',
    ]
    # Aggregate only the metrics that exist: asking agg() for an absent
    # column raises KeyError, whereas the other cells skip missing columns.
    available_position_metrics = [
        m for m in position_metrics if m in season_stats.columns
    ]

    if available_position_metrics:
        position_stats = season_stats.groupby('position_category').agg(
            {m: 'mean' for m in available_position_metrics}
        ).round(2)

        print("Average Statistics by Position:")
        print(position_stats)

In [None]:
# Box plot of goals per 90, one box per position category.
# Drawn only when both required columns are present.
if {'position_category', 'goals_per90'}.issubset(season_stats.columns):
    fig = px.box(
        season_stats,
        x='position_category',
        y='goals_per90',
        title='Goals per 90 by Position',
        color='position_category',
    )
    fig.show()

## 6. Top Performers

In [None]:
# Top 10 players by goals per 90.
# NOTE(review): no minimum playing-time filter is applied in this cell, so
# players with very few matches may dominate a per-90 ranking — confirm
# whether the data is filtered upstream.
if 'goals_per90' in season_stats.columns:
    # Show only the descriptive columns that exist; selecting an absent
    # column would raise KeyError.
    scorer_columns = [
        col
        for col in ['player_name', 'position_category', 'goals_per90', 'matches_played']
        if col in season_stats.columns
    ]
    top_scorers = season_stats.nlargest(10, 'goals_per90')[scorer_columns]
    # The football emoji was previously stored as mojibake.
    print("\n⚽ Top 10 Scorers (per 90):")
    print(top_scorers.to_string(index=False))

In [None]:
# Top 10 players by assists per 90.
# NOTE(review): no minimum playing-time filter is applied in this cell, so
# players with very few matches may dominate a per-90 ranking — confirm
# whether the data is filtered upstream.
if 'assists_per90' in season_stats.columns:
    # Show only the descriptive columns that exist; selecting an absent
    # column would raise KeyError.
    assister_columns = [
        col
        for col in ['player_name', 'position_category', 'assists_per90', 'matches_played']
        if col in season_stats.columns
    ]
    top_assisters = season_stats.nlargest(10, 'assists_per90')[assister_columns]
    # The palette emoji was previously stored as mojibake.
    print("\n🎨 Top 10 Assisters (per 90):")
    print(top_assisters.to_string(index=False))

## 7. Multi-Dimensional Analysis

In [None]:
# Scatter plot of goals vs assists (both per 90), one point per player row.
# The x/y columns are required; colour, hover text and marker size are
# optional and are applied only when their column exists. Previously only
# the size column was guarded, so a missing 'position_category' or
# 'player_name' column made the call fail.
if 'goals_per90' in season_stats.columns and 'assists_per90' in season_stats.columns:
    fig = px.scatter(
        season_stats,
        x='goals_per90',
        y='assists_per90',
        color='position_category' if 'position_category' in season_stats.columns else None,
        hover_data=['player_name'] if 'player_name' in season_stats.columns else None,
        title='Goals vs Assists (per 90)',
        size='matches_played' if 'matches_played' in season_stats.columns else None,
    )
    fig.show()

## 8. Key Insights Summary

In [None]:
# Text summary of the headline findings. Sections whose column is missing
# are skipped, so the section number is tracked with a counter rather than
# hard-coded; this keeps the numbering contiguous when a section is absent.
rule = "=" * 60
print(rule)
print("KEY INSIGHTS")
print(rule)

section = 1
print(f"\n{section}. Dataset Size: {len(season_stats)} players")

if 'position_category' in season_stats.columns:
    section += 1
    print(f"\n{section}. Position Distribution:")
    for pos, count in season_stats['position_category'].value_counts().items():
        print(f"   - {pos}: {count} players")

if 'goals_per90' in season_stats.columns:
    section += 1
    print(f"\n{section}. Goals per 90:")
    print(f"   - Average: {season_stats['goals_per90'].mean():.3f}")
    print(f"   - Max: {season_stats['goals_per90'].max():.3f}")

if 'primary_style' in season_stats.columns:
    section += 1
    style_counts = season_stats['primary_style'].value_counts()
    print(f"\n{section}. Playing Styles Identified: {season_stats['primary_style'].nunique()}")
    print("   Top 3 styles:")
    for style, count in style_counts.head(3).items():
        print(f"   - {style}: {count} players")

print("\n" + rule)
# The check mark was previously stored as mojibake (UTF-8 read as cp1252).
print("✓ EDA Complete!")
print(rule)