# Data Exploration 

Objective: Understand the data structure, identify patterns, and validate
          temporal consistency before feature engineering.

Investigation Questions:
1. Is the data temporally ordered?
2. Are there missing-data patterns (systematic vs. random)?
3. What's the distribution of our target variable?
4. Do scoring patterns differ by player position?
5. Are there data quality issues?

## 0. Configuration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot defaults applied to the figures created in this notebook
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (12, 6)

# The data directory is resolved from the parent of the current working directory
BASE_DIR = Path.cwd().parent
PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")
DATA_FILE = PROCESSED_DIR.joinpath("fpl_unified_preprocessed.csv")

# Read the preprocessed CSV and report its shape and in-memory footprint
df = pd.read_csv(DATA_FILE)
n_bytes = df.memory_usage(deep=True).sum()
print(f"Dataset Shape: {df.shape}")
print(f"Memory Usage: {n_bytes / 1024**2:.2f} MB")

## 1. Temporal Structure Analysis

In [None]:
# Add a combined season/gameweek label and summarise GW coverage per season
df['season_gw'] = df['season'] + '_GW' + df['GW'].astype(str)
print("Temporal Coverage:")
coverage = df.groupby('season')['GW'].agg(['min', 'max', 'count'])
print(coverage)

# For each season, list gameweek numbers absent between 1 and the last GW seen
print("\nGameweek Continuity Check:")
for season in df['season'].unique():
    observed_gws = df.loc[df['season'] == season, 'GW'].unique()
    last_gw = observed_gws.max()
    gaps = sorted(set(range(1, last_gw + 1)).difference(observed_gws))
    if not gaps:
        print(f"{season}: Complete (GW 1-{last_gw})")
        continue
    print(f"{season}: Missing GWs {gaps}")

## 2. Target Variable Distribution Analysis

In [None]:
# Distribution of points (our target): overall histogram plus per-position boxplot
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Overall distribution, with the median and mean drawn as reference lines
points = df['total_points']
points_median = points.median()
points_mean = points.mean()
axes[0].hist(points, bins=50, edgecolor='black', alpha=0.7)
axes[0].axvline(points_median, color='red', linestyle='--',
                label=f'Median: {points_median:.1f}')
axes[0].axvline(points_mean, color='blue', linestyle='--',
                label=f'Mean: {points_mean:.1f}')
axes[0].set_xlabel('Total Points')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of FPL Points (All Positions)')
axes[0].legend()

# By position
df.boxplot(column='total_points', by='position', ax=axes[1])
# With `by=`, pandas sets a figure-level "Boxplot grouped by position" suptitle.
# It would sit above both panels, including the ungrouped histogram, so clear it.
fig.suptitle('')
axes[1].set_title('Points Distribution by Position')
axes[1].set_xlabel('Position')
axes[1].set_ylabel('Total Points')

plt.tight_layout()
plt.show()

print("\nTarget Variable Statistics:")
print(df.groupby('position')['total_points'].describe())


In [None]:
# Percentage of rows whose points, and whose minutes, are exactly zero
n_rows = len(df)
zero_points_pct = df['total_points'].eq(0).sum() / n_rows * 100
zero_minutes_pct = df['minutes'].eq(0).sum() / n_rows * 100

print(f"\nZero Points: {zero_points_pct:.2f}% of observations")
print(f"Zero Minutes: {zero_minutes_pct:.2f}% of observations")
# NOTE(review): the relationship stated below is not computed in this cell.
print("Note: Zero points often correlate with not playing (minutes=0)")


## 3. Missing Data Forensics

In [None]:
# Null count and null percentage per column, most-missing columns first
null_counts = df.isnull().sum()
null_pct = (null_counts / len(df) * 100).round(2)
missing_summary = pd.DataFrame(
    {'Missing_Count': null_counts, 'Missing_Pct': null_pct}
).sort_values('Missing_Count', ascending=False)

# Only show columns that actually contain nulls
print("Missing Values by Column:")
has_missing = missing_summary['Missing_Count'] > 0
print(missing_summary.loc[has_missing])

In [None]:
# Per-season availability of the expected-goals family of columns
xg_cols = ['expected_goals', 'expected_assists', 'expected_goal_involvements',
           'expected_goals_conceded']


def _availability(values):
    """Format the non-null count, total count and non-null percentage of a Series."""
    present = values.notna()
    return f"{present.sum()} / {len(values)} ({present.mean()*100:.1f}%)"


print("\nExpected Goals Availability by Season:")
# Columns absent from the dataset are skipped silently
for col in (name for name in xg_cols if name in df.columns):
    print(f"\n{col}:")
    print(df.groupby('season')[col].apply(_availability))

## 4. Player Activity Patterns

In [None]:
# Per-`element` totals: number of rows, summed minutes and summed points.
# NOTE(review): grouping uses `element` alone; if element ids are reused or
# reassigned between seasons this merges different players. Confirm.
# NOTE(review): `appearances` counts rows, which equals gameweeks only if each
# player has one row per gameweek. Confirm.
player_activity = df.groupby('element').agg(
    appearances=('GW', 'count'),
    minutes=('minutes', 'sum'),
    total_points=('total_points', 'sum'),
)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One histogram per metric, each with its median marked
panels = [
    ('appearances', 'Number of Gameweeks', 'Player Activity: Gameweek Appearances'),
    ('minutes', 'Total Minutes Played', 'Player Activity: Total Minutes'),
]
for ax, (column, xlabel, title) in zip(axes, panels):
    values = player_activity[column]
    ax.hist(values, bins=50, edgecolor='black', alpha=0.7)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Players')
    ax.set_title(title)
    ax.axvline(values.median(), color='red',
               linestyle='--', label=f'Median: {values.median():.0f}')
    ax.legend()

plt.tight_layout()
plt.show()

print("\nPlayer Activity Summary:")
print(player_activity.describe())

## 5. Position-Specific Patterns

In [None]:
# Average points by position over time
position_trends = df.groupby(['season', 'GW', 'position'])['total_points'].mean().reset_index()

# Give every (season, GW) pair one shared x position, numbered in sorted group
# order. Assigning on the full frame avoids writing into a filtered slice
# (chained assignment) and keeps all positions aligned on the same x axis even
# when a position has no rows for some gameweek.
position_trends['time_idx'] = position_trends.groupby(['season', 'GW']).ngroup()

fig, ax = plt.subplots(figsize=(15, 6))
for position in ['GK', 'DEF', 'MID', 'FWD']:
    pos_data = position_trends[position_trends['position'] == position]
    ax.plot(pos_data['time_idx'], pos_data['total_points'], label=position, linewidth=2)

ax.set_xlabel('Time (Sequential Gameweeks Across Seasons)')
ax.set_ylabel('Average Points')
ax.set_title('Position-Specific Point Trends Over Time')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# One correlation heatmap per position over whichever key metrics are present
metrics = ['total_points', 'minutes', 'goals_scored', 'assists',
           'clean_sheets', 'bonus', 'ict_index', 'bps']

# Keep only the metrics that exist as columns in the dataset
available_metrics = [name for name in metrics if name in df.columns]

for position in ['GK', 'DEF', 'MID', 'FWD']:
    is_position = df['position'] == position
    pos_df = df.loc[is_position, available_metrics]
    corr_matrix = pos_df.corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    plt.title(f'Feature Correlation Matrix: {position}')
    plt.tight_layout()
    plt.show()

## 6. Data Quality Checks

In [None]:
# Count rows that fall outside the value ranges checked below.
# NOTE(review): whether negative points are truly invalid depends on the
# scoring rules; treat that count as a flag to inspect. Confirm.
print("Data Quality Checks:")
print(f"Negative points: {(df['total_points'] < 0).sum()}")
print(f"Minutes > 90: {(df['minutes'] > 90).sum()}")
# `goals_scored` is treated as optional elsewhere in this notebook, so guard it
if 'goals_scored' in df.columns:
    print(f"Goals > 5 (single match): {(df['goals_scored'] > 5).sum()}")
else:
    print("Goals > 5 (single match): n/a (goals_scored column not present)")

# Flag every row whose (element, GW, season) key occurs more than once
duplicate_keys = ['element', 'GW', 'season']
duplicates = df.duplicated(subset=duplicate_keys, keep=False)
n_duplicates = duplicates.sum()
print(f"Duplicate player-gameweek records: {n_duplicates}")

if n_duplicates > 0:
    print("\nSample of duplicates:")
    # Show only the display columns that exist, so a missing optional column
    # (e.g. `name` or `fixture`) does not raise a KeyError
    wanted_cols = ['element', 'name', 'season', 'GW', 'total_points', 'fixture']
    sample_cols = [col for col in wanted_cols if col in df.columns]
    # Sort by the duplicate key so rows sharing a key appear next to each other
    sample = df.loc[duplicates].sort_values(['season', 'GW', 'element'])
    print(sample[sample_cols].head(10))

## 7. Key Insights Summary

In [None]:
# Final textual summary of the exploration.
# Relies on `df` and on `zero_points_pct` computed in the zero-inflation cell.
print("="*80)
print("FORENSIC ANALYSIS SUMMARY")
print("="*80)

print("\n1. Dataset Coverage:")
print(f"   - Total Observations: {len(df):,}")
print(f"   - Unique Players: {df['element'].nunique():,}")
print(f"   - Seasons: {df['season'].nunique()}")
print(f"   - Gameweeks per Season: ~{df.groupby('season')['GW'].max().mean():.0f}")

print("\n2. Target Variable (Points):")
print(f"   - Mean: {df['total_points'].mean():.2f}")
print(f"   - Median: {df['total_points'].median():.2f}")
print(f"   - Std Dev: {df['total_points'].std():.2f}")
print(f"   - Zero-inflated: {zero_points_pct:.1f}% are zero points")

# NOTE(review): the three lines below are fixed text, not derived from the
# data in this cell; re-check them against the missing-value output above
# whenever the dataset changes.
print("\n3. Missing Data:")
print("   - Expected Goals: Missing in 2021-22 (systematic)")
print("   - Defensive Metrics: Missing pre-2025-26 (systematic)")
print("   - Strategy: Handle via imputation or separate models")

print("\n4. Position Distribution:")
print(df['position'].value_counts())

# Derive the continuity statement from the data instead of asserting it:
# for each season, collect gameweek numbers absent between 1 and the last GW.
print("\n5. Temporal Validity:")
gw_gaps = {}
for season, season_gws in df.groupby('season')['GW']:
    absent = set(range(1, int(season_gws.max()) + 1)) - set(season_gws.unique())
    if absent:
        gw_gaps[season] = sorted(absent)

if not gw_gaps:
    print("   - Data is sequential and suitable for time series modeling")
    print("   - No major gaps detected in gameweek continuity")
else:
    print("   - Gameweek gaps detected; review before time series modeling:")
    for season, absent in gw_gaps.items():
        print(f"     {season}: missing GWs {absent}")

print("\n6. Next Steps:")
print("   → Proceed to Feature Engineering (Notebook 02)")
print("   → Focus on rolling statistics and momentum indicators")
print("   → Handle missing xG data via season-specific features")