# 🔍 Exploratory Data Analysis (EDA) — Practical Notebook

**Generated:** 2025-09-02 01:35 UTC  
**Stack:** pandas ≥ 2.x · NumPy · Seaborn · Matplotlib · SciPy (required by the association and profiling helpers)

This notebook is a hands‑on EDA guide you can run top‑down or by section:

0. Setup
1. Data loading (or generate a synthetic dataset)
2. Data overview & structure
3. Missing values & cardinality
4. Distributions & outliers
5. Relationships: numeric ↔ numeric
6. Relationships: categorical ↔ numeric
7. Correlations & heatmaps
8. Time‑series EDA (if a datetime column exists)
9. Grouped summaries & pivots
10. Automated quick‑report helpers

> Tip: Replace the synthetic dataset with your CSV/Parquet in the **Load your data** cell.

## 0) Setup

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Seaborn theme applied to the figures drawn below
sns.set_theme(palette='viridis', style='whitegrid', context='notebook')

# pandas display limits: rows, columns and line width
for option_name, option_value in {
    'display.max_rows': 60,
    'display.max_columns': 60,
    'display.width': 140,
}.items():
    pd.set_option(option_name, option_value)

# Seeded generator used for all synthetic draws in this notebook
RNG = np.random.default_rng(seed=7)

# Echo the installed pandas version as the cell output
pd.__version__

## 1) Load your data (or use the synthetic dataset)

In [None]:
# Option A: Load your own file
# df = pd.read_csv('your_file.csv')
# df = pd.read_parquet('your_file.parquet')

# Option B: Generate a realistic synthetic dataset for demo
n = 2000
start = np.datetime64('2024-01-01')
# Row i is dated start + (i mod 180) days, so dates cycle through a 180-day window
dates = start + np.arange(n) % 180  # wrap within ~6 months

# Numeric base features
age = RNG.normal(35, 9, n).clip(18, 75)
# Experience is age minus a noisy starting age, floored at zero
experience = np.maximum(0, age - RNG.normal(22, 4, n))
base_income = RNG.normal(45000, 12000, n)
# Seasonality & trend for revenue
season = (np.sin((np.arange(n) % 30)/30*2*np.pi) + 1) * 0.1  # 30-row cycle, range [0, 0.2]
trend = np.linspace(0, 0.15, n)  # grows with row position, not with the date
noise = RNG.normal(0, 0.25, n)
# Base level driven by experience and income, scaled by the multiplicative effects above
revenue = (200 + 5*experience + 0.0004*base_income) * (1 + season + trend + noise)

# Categoricals
city = RNG.choice(['Paris','Lyon','Lille','Marseille','Bordeaux'], size=n, p=[.32,.18,.16,.22,.12])
segment = RNG.choice(['SMB','Mid-Market','Enterprise'], size=n, p=[.5,.35,.15])
channel = RNG.choice(['Web','Field','Partner','Reseller'], size=n)  # uniform: no p given

# Construct frame
synthetic = pd.DataFrame({
    'date': pd.to_datetime(dates),
    'customer_id': RNG.integers(10000, 99999, n),  # upper bound is exclusive; ids may repeat
    'age': age.round(0).astype('int'),
    'experience_yrs': experience.round(1),
    'base_income': base_income.round(2),
    'monthly_revenue': revenue.round(2),
    'city': city,
    'segment': segment,
    'channel': channel
})

# Inject a bit of messiness
# ~5% of experience values are blanked out
mask_nan = RNG.choice([True, False], size=n, p=[0.05, 0.95])
synthetic.loc[mask_nan, 'experience_yrs'] = np.nan
# ~2% of revenues are inflated by a factor drawn from [1.5, 2.5).
# One factor is drawn per sampled row (len(outlier_idx)) instead of int(0.02*n):
# sample() rounds frac*n, so the two counts can disagree for some n and the
# in-place multiplication would then fail on a length mismatch.
outlier_idx = synthetic.sample(frac=0.02, random_state=1).index
synthetic.loc[outlier_idx, 'monthly_revenue'] *= RNG.uniform(1.5, 2.5, size=len(outlier_idx))

# Use synthetic as default df
df = synthetic.copy()
df.head()

## 2) Data overview & structure

In [None]:
# Dimensions, the first three index labels, and the full list of column names
df.shape, df.index[:3], df.columns.tolist()

In [None]:
# Concise per-column summary printed by pandas (dtypes and non-null counts)
df.info()

In [None]:
# Descriptive statistics, transposed so each described column becomes a row
num_desc = df.describe().transpose()
num_desc

In [None]:
# Object/category columns and, for each, its five most frequent values (NaN counted)
categorical_frame = df.select_dtypes(include=['object', 'category'])
cat_cols = list(categorical_frame.columns)
{
    col_name: df[col_name].value_counts(dropna=False).iloc[:5]
    for col_name in cat_cols
}

## 3) Missing values & cardinality

In [None]:
# NaN count per column, largest first; display only columns that have any
na_counts = df.isna().sum()
missing = na_counts.sort_values(ascending=False)
missing.loc[missing > 0]

In [None]:
# Number of distinct values per column (NaN counts as a value), largest first
distinct_counts = df.nunique(dropna=False)
cardinality = distinct_counts.sort_values(ascending=False)
cardinality

In [None]:
# Visualize missing values as a heatmap-like mask (one pixel row per data row)
na_mask = df.isna()
fig, ax = plt.subplots(figsize=(10, 3))
# Pin the colour scale to [0, 1] so "dark = missing" holds even when the mask
# is all True or all False (auto-normalisation would otherwise be ambiguous).
ax.imshow(na_mask.to_numpy().astype(float), aspect='auto', interpolation='nearest',
          cmap='magma_r', vmin=0, vmax=1)
# Name each column so a dark stripe can be traced back to its field
ax.set_xticks(range(na_mask.shape[1]))
ax.set_xticklabels(na_mask.columns, rotation=45, ha='right')
ax.set_title('Missing value pattern (dark = missing)')
# Every row is drawn, not a sample, so the axis is labelled accordingly
ax.set_xlabel('Columns'); ax.set_ylabel('Rows')
plt.tight_layout()
plt.show()

## 4) Distributions & outliers

In [None]:
# Numeric columns to profile, leaving out the customer_id column
numeric_frame = df.select_dtypes(include=[np.number])
num_cols = [name for name in numeric_frame.columns if name != 'customer_id']

# Histograms + KDE on a grid that is three panels wide
ncols = 3
nrows = -(-len(num_cols) // ncols)  # ceiling division
fig, axes = plt.subplots(nrows, ncols, figsize=(14, 3 * nrows))
axes = axes.ravel()
for panel_idx, ax in enumerate(axes):
    if panel_idx < len(num_cols):
        col = num_cols[panel_idx]
        sns.histplot(df[col], bins=30, kde=True, ax=ax)
        ax.set_title(col)
    else:
        # Unused trailing panels are hidden
        ax.axis('off')
plt.tight_layout()
plt.show()

In [None]:
# One horizontal box per numeric feature; figure height scales with the feature count
fig_height = 0.5 * len(num_cols) + 1
plt.figure(figsize=(12, fig_height))
sns.boxplot(data=df.loc[:, num_cols], orient='h')
plt.title('Boxplots of numeric features')
plt.show()

## 5) Relationships (numeric ↔ numeric)

In [None]:
# Pairwise scatter/KDE grid on at most 600 randomly drawn rows to keep rendering light
pair_vars = ['age', 'experience_yrs', 'base_income', 'monthly_revenue']
sample_size = min(600, len(df))
sample = df.sample(n=sample_size, random_state=0)
sns.pairplot(sample, vars=pair_vars, diag_kind='kde')
plt.show()

In [None]:
# Revenue against each numeric driver, with a fitted regression line per panel
fig, axes = plt.subplots(1, 3, figsize=(14, 4))
panels = [
    ('experience_yrs', 'Revenue vs Experience'),
    ('base_income', 'Revenue vs Income'),
    ('age', 'Revenue vs Age'),
]
for ax, (feature, panel_title) in zip(axes, panels):
    sns.regplot(data=df, x=feature, y='monthly_revenue', scatter_kws={'alpha': 0.3}, ax=ax)
    ax.set_title(panel_title)
plt.tight_layout()
plt.show()

## 6) Relationships (categorical ↔ numeric)

In [None]:
# monthly_revenue split by segment: boxplot on the left, violin on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
box_ax, violin_ax = axes

sns.boxplot(data=df, x='segment', y='monthly_revenue', ax=box_ax)
box_ax.set_title('Revenue by Segment — Boxplot')

sns.violinplot(data=df, x='segment', y='monthly_revenue', inner='quartile', ax=violin_ax)
violin_ax.set_title('Revenue by Segment — Violin')

plt.tight_layout()
plt.show()

In [None]:
# Mean monthly_revenue per city with a 95% confidence interval
plt.figure(figsize=(6, 4))
sns.pointplot(
    data=df,
    x='city',
    y='monthly_revenue',
    estimator=np.mean,
    errorbar=('ci', 95),
)
plt.xticks(rotation=20)
plt.title('Average revenue by city (95% CI)')
plt.tight_layout()
plt.show()

## 7) Correlations & heatmaps

In [None]:
# Pearson captures linear association, Spearman captures monotonic association
numeric_view = df[num_cols]
corr_pearson = numeric_view.corr(method='pearson')
corr_spearman = numeric_view.corr(method='spearman')

# Both heatmaps share one colour scale, fixed to [-1, 1] and centred on zero
heatmap_kwargs = dict(vmin=-1, vmax=1, center=0, cmap='coolwarm', annot=True, fmt='.2f')
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = zip(
    axes,
    (corr_pearson, corr_spearman),
    ('Pearson correlation', 'Spearman correlation'),
)
for ax, matrix, panel_title in panels:
    sns.heatmap(matrix, ax=ax, **heatmap_kwargs)
    ax.set_title(panel_title)
plt.tight_layout()
plt.show()

In [None]:
# Optional: simple helper metrics

def cramers_v_from_series(x, y):
    """Return the bias-corrected Cramér's V between two categorical series.

    The statistic is derived from the chi-square of the ``x`` by ``y``
    contingency table (no Yates correction), with the bias correction applied
    to both phi² and the table dimensions.

    Parameters:
        x, y: array-likes of category labels accepted by ``pd.crosstab``.

    Returns:
        A float in [0, 1], or ``np.nan`` when the statistic is undefined:
        no observations, fewer than two observations, or a table with fewer
        than two effective rows or columns.
    """
    tbl = pd.crosstab(x, y)
    n = tbl.to_numpy().sum()
    # chi2_contingency rejects an empty table and (n - 1) below would be zero
    # for a single observation, so bail out before either can happen.
    if tbl.size == 0 or n < 2:
        return np.nan

    chi2 = stats.chi2_contingency(tbl, correction=False)[0]
    r, k = tbl.shape
    phi2 = chi2 / n
    # Bias-corrected phi² and table dimensions
    phi2corr = max(0.0, phi2 - (k - 1) * (r - 1) / (n - 1))
    rcorr = r - (r - 1) ** 2 / (n - 1)
    kcorr = k - (k - 1) ** 2 / (n - 1)

    denom = min(kcorr - 1, rcorr - 1)
    # A one-row or one-column table has no association to measure
    if not denom > 0:
        return np.nan
    return float(np.sqrt(phi2corr / denom))


def correlation_ratio(categories, measurements):
    """Return eta squared (η²) of a numeric variable explained by a categorical one.

    η² is the between-group sum of squares divided by the total sum of
    squares. Note that the value returned is η², not its square root.

    Rows are paired by position. A row is ignored when either its category or
    its measurement is missing, so the group sums and the total are computed
    on the same observations. Categories with no remaining observations
    (including unused levels of a ``Categorical``) contribute nothing instead
    of turning the result into NaN.

    Parameters:
        categories: array-like of category labels.
        measurements: array-like of numeric values, same length as ``categories``.

    Returns:
        A float in [0, 1], or ``np.nan`` when there is no usable data or the
        measurements have zero variance.

    Raises:
        ValueError: if the two inputs have different lengths.
    """
    cat_codes = pd.Categorical(categories).codes  # -1 marks a missing category
    y = pd.Series(measurements).reset_index(drop=True)
    if len(cat_codes) != len(y):
        raise ValueError('categories and measurements must have the same length')

    # Keep only complete (category, measurement) pairs
    keep = (cat_codes >= 0) & y.notna().to_numpy()
    y = y[keep]
    cat_codes = cat_codes[keep]
    if y.empty:
        return np.nan

    y_total_mean = y.mean()
    ss_total = ((y - y_total_mean) ** 2).sum()
    if not ss_total > 0:
        return np.nan

    # Grouping by code only yields groups that actually have observations
    grouped = y.groupby(cat_codes)
    ss_between = (grouped.count() * (grouped.mean() - y_total_mean) ** 2).sum()
    return float(ss_between / ss_total)

# Example: segment vs channel (categorical pair) and segment vs monthly_revenue
cramers_by = cramers_v_from_series(x=df['segment'], y=df['channel'])
eta_sq = correlation_ratio(
    categories=df['segment'],
    measurements=df['monthly_revenue'],
)
(cramers_by, eta_sq)

## 8) Time‑series EDA (if `date` exists)

In [None]:
# Run only when `date` is a real datetime column: resample() and the .dt
# accessor below both fail on dates that were loaded as plain strings.
if 'date' in df and pd.api.types.is_datetime64_any_dtype(df['date']):
    ts = df.set_index('date').sort_index()

    # Rolling mean and daily totals
    daily = ts.resample('D').agg({'monthly_revenue':'sum'})
    # min_periods=1 keeps the first six days instead of leaving them NaN
    roll = daily.rolling(7, min_periods=1).mean()

    fig, ax = plt.subplots(1,1, figsize=(12,4))
    ax.plot(daily.index, daily['monthly_revenue'], alpha=0.5, label='Daily sum')
    ax.plot(roll.index, roll['monthly_revenue'], color='crimson', label='7D rolling mean', linewidth=2)
    ax.set_title('Daily revenue with 7‑day rolling mean')
    ax.legend(); plt.show()

    # Seasonal pattern by weekday & month
    ts_features = df[['date','monthly_revenue']].copy()
    ts_features['weekday'] = ts_features['date'].dt.day_name()
    ts_features['month'] = ts_features['date'].dt.month_name()

    plt.figure(figsize=(10,4))
    sns.boxplot(data=ts_features, x='weekday', y='monthly_revenue', order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
    plt.xticks(rotation=20); plt.title('Revenue by weekday'); plt.tight_layout(); plt.show()

    # Calendar order for the months that are present; without an explicit
    # order the boxes follow the order of appearance in the data.
    months_present = set(ts_features['month'])
    calendar_months = pd.date_range('2000-01-01', periods=12, freq='MS').month_name()
    month_order = [m for m in calendar_months if m in months_present]

    plt.figure(figsize=(8,4))
    sns.boxplot(data=ts_features, x='month', y='monthly_revenue', order=month_order)
    plt.xticks(rotation=20); plt.title('Revenue by month'); plt.tight_layout(); plt.show()
elif 'date' in df:
    print("Skipping time-series EDA: 'date' is not a datetime column "
          "(convert it with pd.to_datetime first).")

## 9) Grouped summaries & pivot tables

In [None]:
# Row count and revenue mean/median/std per (segment, city), best mean first
by_segment_city = df.groupby(['segment', 'city'])
kpis = by_segment_city.agg(
    count=('customer_id', 'count'),
    rev_mean=('monthly_revenue', 'mean'),
    rev_median=('monthly_revenue', 'median'),
    rev_std=('monthly_revenue', 'std'),
)
kpis = kpis.sort_values('rev_mean', ascending=False)
kpis.head(10)

In [None]:
# Mean monthly_revenue for every segment (rows) x channel (columns) pair
pivot = df.pivot_table(
    values='monthly_revenue',
    index='segment',
    columns='channel',
    aggfunc='mean',
)
plt.figure(figsize=(6, 4))
sns.heatmap(pivot, annot=True, fmt='.0f', cmap='YlGnBu')
plt.title('Average revenue by segment × channel')
plt.tight_layout()
plt.show()

## 10) Quick‑report helpers (callable functions)

In [None]:
from typing import List, Optional

def quick_overview(df: pd.DataFrame, sample: int = 5):
    """Show a compact overview of ``df``.

    In order: the first ``sample`` rows, the shape, the dtypes, the columns
    that contain missing values (largest count first), and the transposed
    ``describe()`` summary.

    Parameters:
        df: the frame to summarise.
        sample: number of leading rows to show.

    Returns:
        None. Everything is written to the notebook output (or stdout).
    """
    # `display` is only defined inside IPython/Jupyter; fall back to print so
    # the helper also works from a plain script or a shared utilities module.
    try:
        show = display  # noqa: F821
    except NameError:
        show = print

    show(df.head(sample))
    print('\nShape:', df.shape)
    print('\nDtypes:')
    show(df.dtypes)
    print('\nMissing values:')
    na_counts = df.isna().sum()
    show(na_counts[na_counts > 0].sort_values(ascending=False))
    print('\nNumeric summary:')
    show(df.describe().T)


def num_profile(df: pd.DataFrame, cols: Optional[List[str]] = None):
    """Build a one-row-per-column table of numeric summary statistics.

    Parameters:
        df: the frame to profile.
        cols: columns to profile. When ``None`` or empty, every numeric
            column of ``df`` is used.

    Returns:
        A DataFrame indexed by column name (index name ``'col'``) with the
        columns count, mean, std (ddof=1), min, p25, median, p75, max, skew
        and kurtosis. Skew and kurtosis come from ``scipy.stats`` and are NaN
        when a column has two or fewer non-missing values. The table is empty,
        but keeps these columns, when there is nothing to profile.
    """
    # Explicit None/length test so an Index or array of names is accepted too
    if cols is None or len(cols) == 0:
        cols = df.select_dtypes(include=[np.number]).columns.tolist()

    stat_names = ['count', 'mean', 'std', 'min', 'p25', 'median', 'p75', 'max',
                  'skew', 'kurtosis']
    out = []
    for c in cols:
        s = df[c]
        valid = s.dropna()  # computed once; shape statistics need NaN-free input
        has_shape = valid.size > 2
        out.append({
            'col': c,
            'count': int(s.count()),
            'mean': float(s.mean()),
            'std': float(s.std(ddof=1)),
            'min': float(s.min()),
            'p25': float(s.quantile(0.25)),
            'median': float(s.median()),
            'p75': float(s.quantile(0.75)),
            'max': float(s.max()),
            'skew': float(stats.skew(valid)) if has_shape else np.nan,
            'kurtosis': float(stats.kurtosis(valid)) if has_shape else np.nan,
        })
    # Passing the column list keeps 'col' available for set_index even when
    # no row was produced (otherwise set_index raises KeyError).
    return pd.DataFrame(out, columns=['col', *stat_names]).set_index('col')

# Demo: print the overview, then show the numeric profile as the cell output
quick_overview(df, sample=5)
num_profile(df, cols=None)

---
## Wrap‑up
- Use this notebook as a template for new datasets.
- Replace the synthetic data in **Section 1** with your own file.
- Start with **overview → distributions → relationships → time series (if applicable)**.
- Save useful helper functions (Section 10) into a shared utilities module.