In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


# Path of the CSV file to analyse
file_path = "/kaggle/input/data-s/supplementary_data.csv"

# Load the CSV; low_memory=False is passed to avoid pandas' DtypeWarning
df = pd.read_csv(file_path, low_memory=False)

# Quick sanity check: confirmation message, then shape and column index
print("‚úÖ Data Loaded Successfully")
for label, value in (("Shape of Dataset:", df.shape), ("Columns:", df.columns)):
    print(label, value)

In [None]:
# ---------------------------------------------
# STEP 3: Basic Information
# ---------------------------------------------
rule = "=" * 60
row_count, col_count = df.shape

print(rule)
print("‚úÖ BASIC INFORMATION")
print(rule)
print("Rows:", row_count)
print("Columns:", col_count)

# Full list of column names, then how many columns there are of each dtype
column_names = df.columns.tolist()
dtype_counts = df.dtypes.value_counts()
print("\nColumn Names:\n", column_names)
print("\nData Types:\n", dtype_counts)

In [None]:
# Detailed info
print("\n--- Detailed DataFrame Info ---")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
df.info()

In [None]:
# ---------------------------------------------
# STEP 4: Missing Values Analysis
# ---------------------------------------------
rule = "=" * 60
print("\n" + rule)
print("üö® MISSING VALUE ANALYSIS")
print(rule)

# One row per column: number of nulls and that number as a percentage of all rows
missing_df = (
    df.isnull()
      .sum()
      .rename_axis('Column')
      .reset_index(name='Missing_Count')
)
missing_df['Missing_%'] = (missing_df['Missing_Count'] / len(df) * 100).round(2)

# Rank once, then reuse the ranking for both the printed table and the chart
missing_ranked = missing_df.sort_values(by='Missing_%', ascending=False)
print(missing_ranked.head(15))

# Bar chart of the ten columns with the highest missing percentage
plt.figure(figsize=(10,5))
sns.barplot(
    data=missing_ranked.head(10),
    x='Column',
    y='Missing_%',
    color='tomato',
)
plt.title("Top 10 Columns with Most Missing Values")
plt.xticks(rotation=45)
plt.show()

In [None]:
# ---------------------------------------------
# STEP 5: Descriptive Statistics
# ---------------------------------------------
rule = "=" * 60
print("\n" + rule)
print("üìà NUMERICAL DATA SUMMARY")
print(rule)

# describe() transposed: one row per column
summary_table = df.describe().T
print(summary_table)

# Extra: skewness and kurtosis of every numeric column, most skewed first
num_cols = df.select_dtypes(include=np.number).columns
numeric_part = df[num_cols]
skew_kurt = pd.DataFrame(
    {'Skewness': numeric_part.skew(), 'Kurtosis': numeric_part.kurt()}
)
skew_kurt = skew_kurt.sort_values(by='Skewness', ascending=False)
print("\nSkewness & Kurtosis:\n", skew_kurt.head(10))


In [None]:
# ---------------------------------------------
# STEP 6: Categorical Column Analysis
# ---------------------------------------------
rule = "=" * 60
print("\n" + rule)
print("üß© CATEGORICAL FEATURE ANALYSIS")
print(rule)

# Every non-numeric column: number of distinct values and the five most frequent ones
cat_cols = df.select_dtypes(exclude=np.number).columns
for col in cat_cols:
    values = df[col]
    top_five = values.value_counts().head(5)
    print(f"\n‚ñ∂Ô∏è {col}")
    print("Unique Values:", values.nunique())
    print(top_five)


In [None]:
# ---------------------------------------------
# STEP 7: Advanced Grouped Insights
# ---------------------------------------------
rule = "=" * 60
print("\n" + rule)
print("üìä ADVANCED GROUP-BY STATISTICS")
print(rule)

# Mean of each metric for every (quarter, down) combination
metric_cols = ['yards_gained', 'expected_points_added', 'pass_length']
mean_spec = {metric: 'mean' for metric in metric_cols}
agg_stats = df.groupby(['quarter', 'down']).agg(mean_spec).reset_index()

print("\nAverage metrics per Quarter & Down:\n", agg_stats.head(12))


In [None]:
# ---------------------------------------------
# STEP 8: Outlier Detection (IQR Method)
# ---------------------------------------------
rule = "=" * 60
print("\n" + rule)
print("üöß OUTLIER DETECTION (IQR METHOD)")
print(rule)

# A value is an outlier when it lies more than 1.5 * IQR below Q1 or above Q3
for col in ['yards_gained', 'pass_length', 'expected_points_added']:
    series = df[col]
    q1, q3 = series.quantile([0.25, 0.75])
    spread = q3 - q1
    low_fence = q1 - 1.5*spread
    high_fence = q3 + 1.5*spread
    is_outlier = (series < low_fence) | (series > high_fence)
    n_outliers = int(is_outlier.sum())
    # Share is expressed relative to all rows of df
    share = round(n_outliers/len(df)*100, 2)
    print(f"{col}: {n_outliers} outliers ({share}%)")


In [None]:
# ===============================================================
# STEP 9: Basic Statistical Checks
# ===============================================================

# Mean, median and mode of the key metrics.

num_df = df.select_dtypes(include=[np.number])

print("\n--- Central Tendency Measures ---")
for col in ['yards_gained', 'expected_points_added', 'pass_length']:
    if col not in num_df.columns:
        continue

    # Drop missing values first so all three statistics describe the same
    # observations. np.median / scipy.stats.mode propagate NaN by default,
    # whereas the mean of a Series skips it, which made the figures inconsistent.
    values = num_df[col].dropna()
    if values.empty:
        print(f"{col}: no non-missing values")
        continue

    mean_val = values.mean()
    median_val = values.median()
    # Series.mode() returns every mode in sorted order; report the first one.
    mode_val = values.mode().iloc[0]
    print(f"{col}: Mean={mean_val:.2f}, Median={median_val:.2f}, Mode={mode_val:.2f}")


In [None]:

# ===============================================================
# STEP 10: Normality Tests (Shapiro-Wilk & Kolmogorov-Smirnov)
# ===============================================================
# Relies on num_df (numeric columns of df) created in the Step 9 cell.
print("\n--- Normality Tests ---")
for col in ['yards_gained', 'expected_points_added', 'pass_length']:
    if col not in num_df.columns:
        continue

    non_null = num_df[col].dropna()
    # Shapiro-Wilk needs at least three observations.
    if len(non_null) < 3:
        print(f"{col}: not enough non-missing values for normality tests")
        continue

    # Small sample for speed. The cap must use the non-null count: sampling
    # more rows than remain after dropna() raises a ValueError.
    # A fixed random_state keeps the reported p-values reproducible.
    subset = non_null.sample(min(500, len(non_null)), random_state=42)

    shapiro_stat, shapiro_p = stats.shapiro(subset)
    # NOTE(review): the normal's mean/std are estimated from the same sample,
    # so the KS p-value is only approximate - confirm this is acceptable.
    ks_stat, ks_p = stats.kstest(subset, 'norm', args=(subset.mean(), subset.std()))
    print(f"{col}: Shapiro p={shapiro_p:.4f}, KS p={ks_p:.4f}")


In [None]:
# ===============================================================
# STEP 11: Correlation Tests (Pearson, Spearman, Kendall)
# ===============================================================
# Relies on num_df (numeric columns of df) created in the Step 9 cell.
print("\n--- Correlation Tests ---")
if 'yards_gained' in num_df.columns and 'expected_points_added' in num_df.columns:
    # Keep only rows where both values are present; a single NaN would
    # otherwise turn the coefficients and p-values into NaN.
    pairs = num_df[['yards_gained', 'expected_points_added']].dropna()

    if len(pairs) < 2:
        # A correlation is undefined for fewer than two observations.
        print("Not enough complete observations for correlation tests.")
    else:
        yards = pairs['yards_gained']
        epa = pairs['expected_points_added']
        pear_corr, pear_p = stats.pearsonr(yards, epa)
        spear_corr, spear_p = stats.spearmanr(yards, epa)
        kend_corr, kend_p = stats.kendalltau(yards, epa)
        print(f"Pearson Corr={pear_corr:.3f}, p={pear_p:.4f}")
        print(f"Spearman Corr={spear_corr:.3f}, p={spear_p:.4f}")
        print(f"Kendall Corr={kend_corr:.3f}, p={kend_p:.4f}")


In [None]:

# ===============================================================
# STEP 12: Variance & T-tests
# ===============================================================
print("\n--- Variance & T-test ---")
if {'yards_gained', 'pass_result'}.issubset(df.columns):
    # yards_gained for rows with pass_result 'C' versus rows with pass_result 'I'
    result_col = df['pass_result']
    comp = df.loc[result_col == 'C', 'yards_gained'].dropna()
    incomp = df.loc[result_col == 'I', 'yards_gained'].dropna()

    # Both groups need more than 30 observations before the test is run
    if min(len(comp), len(incomp)) > 30:
        # equal_var=False: the two groups are not assumed to share a variance
        t_stat, t_p = stats.ttest_ind(comp, incomp, equal_var=False)
        print(f"T-test (Complete vs Incomplete): t={t_stat:.3f}, p={t_p:.4f}")
    else:
        print("Not enough samples for t-test.")

In [None]:

# ===============================================================
# STEP 13: ANOVA Test (One-way)
# ===============================================================
print("\n--- ANOVA Test: Yards Gained by Down ---")
if 'down' in df.columns and 'yards_gained' in df.columns:
    # One sample of yards_gained per value of 'down'. The size threshold is
    # applied AFTER dropping missing values, so a group made up mostly of NaNs
    # cannot slip into the test as a (near-)empty sample.
    anova_data = []
    for _, yards in df.groupby('down')['yards_gained']:
        yards = yards.dropna()
        if len(yards) > 10:
            anova_data.append(yards)

    # A one-way ANOVA needs at least two groups to compare
    if len(anova_data) > 1:
        f_stat, p_val = stats.f_oneway(*anova_data)
        print(f"F={f_stat:.3f}, p={p_val:.4f}")
    else:
        print("Not enough groups for ANOVA.")
else:
    print("Columns 'down' and 'yards_gained' are required for ANOVA; skipping.")

In [None]:
# ===============================================================
# STEP 14: Chi-Square Test (Categorical Association)
# ===============================================================
print("\n--- Chi-Square Test: pass_result vs play_action ---")
if {'pass_result', 'play_action'}.issubset(df.columns):
    # Contingency table: rows are pass_result values, columns are play_action values
    cont_table = pd.crosstab(index=df['pass_result'], columns=df['play_action'])
    # The fourth returned item (expected frequencies) is not used here
    chi2, p, dof, _expected = stats.chi2_contingency(cont_table)
    print(f"Chi2={chi2:.3f}, p={p:.4f}, dof={dof}")

In [None]:
# ===============================================================
# STEP 15: Visualization (optional for understanding distributions)
# ===============================================================
# Histogram (30 bins, with KDE curve) of the non-missing yards_gained values
fig, ax = plt.subplots(figsize=(10,5))
sns.histplot(df['yards_gained'].dropna(), bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Yards Gained")
plt.show()

# Box plot of yards_gained for each value of 'down'
fig, ax = plt.subplots(figsize=(6,4))
sns.boxplot(x='down', y='yards_gained', data=df, ax=ax)
ax.set_title("Yards Gained Distribution by Down")
plt.show()

print("\n‚úÖ SciPy Statistical Analysis Completed Successfully!")

In [None]:
# ---------------------------------------------
# STEP 16: Temporal Trend Analysis
# ---------------------------------------------
# Parse game_date into datetimes (this overwrites the column on df),
# then average yards_gained per date
df['game_date'] = pd.to_datetime(df['game_date'])
trend = df.groupby('game_date')['yards_gained'].mean()

fig, ax = plt.subplots(figsize=(12,5))
ax.plot(trend.index, trend.values)
ax.set_title("Trend of Average Yards Gained Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Avg Yards")
ax.grid(True)
plt.show()

In [None]:
# ---------------------------------------------
# Step 17: Important categorical insights
# ---------------------------------------------
# Distinct values of three categorical columns
for label, column in (
    ("Unique Play Results:", 'pass_result'),
    ("Unique Offensive Formations:", 'offense_formation'),
    ("Unique Coverage Types:", 'team_coverage_type'),
):
    print("\n" + label, df[column].unique())

# Count of play results
play_counts = df['pass_result'].value_counts()

fig, ax = plt.subplots(figsize=(7,5))
sns.barplot(x=play_counts.index, y=play_counts.values, ax=ax)
ax.set_title("Distribution of Pass Results")
ax.set_xlabel("Pass Result")
ax.set_ylabel("Count")
plt.show()



In [None]:
# ---------------------------------------------
# Step 18: Team Performance Analysis
# ---------------------------------------------
# Mean yards_gained per possession_team; keep the ten highest averages
mean_yards_by_team = df.groupby('possession_team')['yards_gained'].mean()
team_perf = mean_yards_by_team.sort_values(ascending=False).head(10)

fig, ax = plt.subplots(figsize=(10,5))
sns.barplot(x=team_perf.index, y=team_perf.values, ax=ax)
ax.set_title("Top 10 Teams by Average Yards Gained")
ax.set_xlabel("Team")
ax.set_ylabel("Avg Yards Gained")
plt.xticks(rotation=45)
plt.show()


In [None]:
# ---------------------------------------------
# Step 19: Quarter and Down-wise Analysis
# ---------------------------------------------
def _plot_mean_yards_by(group_col, axis_label):
    """Bar-plot the mean of yards_gained for each value of `group_col`.

    `axis_label` is used as the x-axis label and as the last word of the title.
    Returns the Series of per-group means that was plotted.
    """
    means = df.groupby(group_col)['yards_gained'].mean()
    plt.figure(figsize=(6,4))
    sns.barplot(x=means.index, y=means.values)
    plt.title(f"Average Yards Gained per {axis_label}")
    plt.xlabel(axis_label)
    plt.ylabel("Average Yards Gained")
    plt.show()
    return means


# Avg yards per quarter
quarter_perf = _plot_mean_yards_by('quarter', "Quarter")

# Avg yards per down
down_perf = _plot_mean_yards_by('down', "Down")


In [None]:
# ---------------------------------------------
# Step 20: Expected Points & Pass Length Relation
# ---------------------------------------------
# Scatter of pass_length (x) against expected_points_added (y); alpha softens overplotting
fig, ax = plt.subplots(figsize=(7,5))
sns.scatterplot(data=df, x='pass_length', y='expected_points_added', alpha=0.6, ax=ax)
ax.set_title("Pass Length vs Expected Points Added")
ax.set_xlabel("Pass Length (yards)")
ax.set_ylabel("Expected Points Added")
plt.show()

In [None]:
# ---------------------------------------------
# Step 21: Correlation Heatmap
# ---------------------------------------------
# Pairwise correlations between all numeric columns of df
numeric_df = df.select_dtypes(include=['number'])
corr = numeric_df.corr()

fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(corr, cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Correlation Heatmap (Numeric Columns)")
plt.show()

In [None]:
# ---------------------------------------------
# STEP 22: Correlation Insights
# ---------------------------------------------
rule = "=" * 60
print("\n" + rule)
print("üîó CORRELATION BETWEEN NUMERIC FEATURES")
print(rule)

# num_cols is the numeric-column index built in the Step 5 cell
corr_matrix = df[num_cols].corr()

# Correlation of every numeric column with the two key metrics,
# ordered by its correlation with expected_points_added (highest first)
key_metric_corr = corr_matrix.loc[:, ['yards_gained', 'expected_points_added']]
print(key_metric_corr.sort_values(by='expected_points_added', ascending=False))

fig, ax = plt.subplots(figsize=(10,7))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, ax=ax)
ax.set_title("Full Correlation Heatmap")
plt.show()