In [None]:
# notebooks/eda.ipynb

# 1️⃣ Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 2️⃣ Load processed data
# Read the preprocessed match data; the path is relative to the notebook's
# working directory.
df = pd.read_csv("../data/processed/processed_matches.csv")
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it explicitly so the preview is always shown.
print(df.head())

# 3️⃣ Basic stats
print("Dataset shape:", df.shape)
print(df.describe())

# 4️⃣ Check missing values
# sep="\n" puts the per-column counts on their own lines without the stray
# leading space that the default separator would insert after the header.
print("Missing values:", df.isnull().sum(), sep="\n")

# 5️⃣ Distribution of target variable (final_score)
# Histogram (10 bins) of final_score with a KDE curve overlaid, drawn on an
# explicitly created axes.
score_fig, score_ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x='final_score', bins=10, kde=True, ax=score_ax)
score_ax.set_title("Distribution of Final Scores")
score_ax.set_xlabel("Final Score")
score_ax.set_ylabel("Count")
plt.show()

# 6️⃣ Boxplot: Score by venue
# Only drawn when the dataset actually carries a 'venue' column.
if 'venue' in df.columns:
    venue_fig, venue_ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(data=df, x='venue', y='final_score', ax=venue_ax)
    # Tilt the category labels on the x axis by 45 degrees.
    plt.setp(venue_ax.get_xticklabels(), rotation=45)
    venue_ax.set_title("Score distribution by venue")
    plt.show()

# 7️⃣ Pairplot of numerical features (optional)
# Columns we would like to compare; restrict to those actually present so a
# missing column does not abort the notebook (mirrors the 'venue' guard).
expected_num_cols = ['overs_completed', 'runs_so_far', 'wickets_so_far', 'final_score']
num_cols = [col for col in expected_num_cols if col in df.columns]
missing_num_cols = [col for col in expected_num_cols if col not in df.columns]
if missing_num_cols:
    print("Skipping columns not found in dataset:", missing_num_cols)

# Pairwise plots and correlations need at least two columns to be meaningful.
if len(num_cols) >= 2:
    sns.pairplot(df[num_cols])
    plt.show()

    # 8️⃣ Correlation heatmap
    plt.figure(figsize=(6, 5))
    # Pin the colour scale to [-1, 1] so the neutral midpoint of the diverging
    # 'coolwarm' map corresponds to zero correlation.
    sns.heatmap(
        df[num_cols].corr(),
        annot=True,
        cmap='coolwarm',
        vmin=-1,
        vmax=1,
    )
    plt.title("Correlation between numerical features")
    plt.show()
