# Exploratory Data Analysis (Upgraded)
EDA tasks:
- Load generated dataset
- Visualize distributions and relationships
- Check for missing values and outliers
- Quick correlation heatmap and feature-target relationships




In [None]:
import pandas as pd
import yaml
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Read the project configuration. The `with` block closes the file handle
# as soon as parsing is done instead of leaving it open in the kernel.
with open('config.yaml', encoding='utf-8') as config_file:
    cfg = yaml.safe_load(config_file)

# The dataset is read from <paths.data_dir>/dataset.csv, as named in the config.
df = pd.read_csv(Path(cfg['paths']['data_dir']) / 'dataset.csv')
df.head()  # last expression of the cell, so the notebook renders the preview


In [None]:
# Summary statistics for every column (numeric and non-numeric),
# transposed so each dataset column becomes one row of the table.
summary = df.describe(include='all')
summary.transpose()



In [None]:
# Count the missing (NaN/None) entries in each column and report them.
missing_counts = df.isnull().sum()
print("Missing values per column:")
print(missing_counts)


In [None]:
# One histogram (with a KDE overlay) per numeric feature.
# `num_cols` is reused by later cells, so it stays a notebook-level name.
num_cols = ['age', 'income', 'transactions']
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(col)
    plt.show()


In [None]:
# One horizontal boxplot per numeric feature, to eyeball outliers.
for feature in num_cols:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_title(f"Boxplot {feature}")
    plt.show()


In [None]:
# Correlation heatmap over the numeric columns of the feature matrix
# returned by create_features (engineered features included if it adds any).
from feature_engineering import create_features

X, y = create_features(df, training=True)

# Keep only numeric columns before computing pairwise correlations.
numeric_features = X.select_dtypes(include='number')
corr = numeric_features.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='vlag')
plt.title("Numeric feature correlation")
plt.show()


In [None]:
# Class balance of the target, then its relationship with membership.
print("Target distribution:")
target_share = y.value_counts(normalize=True)
print(target_share)

fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y, ax=ax)
ax.set_title("Target counts")
plt.show()

# Membership vs Target (seaborn's barplot aggregates with the mean by default).
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=df, x='membership', y='target', ax=ax)
ax.set_title("Membership vs mean Target")
plt.show()


In [None]:
# Pairplot of a few numeric columns (sampled for speed).
# At most 500 rows are drawn. The fixed random_state makes the sampled
# subset, and therefore the plot, identical on every run.
pair_sample = df.sample(n=min(500, len(df)), random_state=42)
sns.pairplot(pair_sample, vars=num_cols, hue='membership', corner=True)
# Explicit show(), like the other plotting cells, so the PairGrid repr is
# not echoed as the cell's output.
plt.show()
