# =============================================================
# MILESTONE 2: Advanced Data Analysis and Feature Engineering
# =============================================================

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path
import sys
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Resolve the project root so that `src.*` imports work whether the notebook
# is launched from the repository root or from its `notebooks/` directory.
# The check is an exact match on the current directory's name: a substring
# test on the whole path would also fire for unrelated ancestors such as
# `/home/me/notebooks-archive/project` and pick the wrong root.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd
sys.path.insert(0, str(PROJECT_ROOT))

from src.utils.helpers import (
    engineer_features, handle_missing_values, encode_categorical_features
)
from scipy.stats import ttest_ind, chi2_contingency
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

In [None]:
# Base palette (hex colours) shared by every figure in this notebook.
PEACH, PEACH_DARK = '#FFCBA4', '#FF9A76'
SAGE, SAGE_DARK = '#A8C686', '#7A9B57'
NEUTRAL = '#F5F5DC'
ACCENT = '#E07B39'

# Semantic lookup: maps a plotting role to one of the palette colours above.
COLORS = dict(
    churn_yes=PEACH_DARK,
    churn_no=SAGE_DARK,
    selected=SAGE_DARK,
    rejected=PEACH,
    palette=[SAGE_DARK, PEACH_DARK, SAGE, PEACH, ACCENT],
)

In [None]:
# Input: the cleaned dataset read by the loading cell.
CLEANED_PATH = PROJECT_ROOT.joinpath("data", "processed", "cleaned_data.csv")

# Output: directory for the interactive HTML figures; created on demand,
# including any missing parents, and left alone if it already exists.
VIZ_DIR = PROJECT_ROOT.joinpath("visualizations", "interactive")
VIZ_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the cleaned dataset and cast the target column to int so that its
# mean can be reported directly as the churn rate.
df = pd.read_csv(CLEANED_PATH).astype({'Churn': int})
print(f"Loaded {df.shape[0]:,} customers × {df.shape[1]} features | Churn rate: {df['Churn'].mean()*100:.2f}%")

In [None]:
%pip install nbformat>=4.2.0

In [None]:
# Welch t-test and Cohen's d for every numeric feature (churners vs.
# non-churners), followed by a bar chart of the most significant features.
numerical_cols = df.select_dtypes(include=[np.number]).columns.drop('Churn')
t_results = []

# The class split does not depend on the feature being tested, so it is done
# once here instead of re-filtering the whole frame on every iteration.
churned_rows = df[df['Churn'] == 1]
retained_rows = df[df['Churn'] == 0]

# Ceiling for -log10(p): its value at the smallest positive float64 (~323.3).
# It is used when p underflows to exactly 0, so that an underflowed p-value is
# never drawn as LESS significant than a tiny but representable one (a fixed
# fallback such as 20 would rank p == 0 below p == 1e-30).
NEG_LOG10_P_CEILING = -np.log10(np.nextafter(0.0, 1.0))

for col in numerical_cols:
    churn_yes = churned_rows[col]
    churn_no = retained_rows[col]
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    t_stat, p_val = ttest_ind(churn_yes, churn_no, equal_var=False)

    # Cohen's d, standardised by the root-mean-square of the two group SDs.
    # Falls back to 0 when that denominator is zero or NaN.
    pooled_std = np.sqrt((churn_yes.std()**2 + churn_no.std()**2) / 2)
    cohens_d = (churn_yes.mean() - churn_no.mean()) / pooled_std if pooled_std > 0 else 0

    if np.isnan(p_val):
        # Undefined test (e.g. constant or NaN-containing column): keep it
        # undefined rather than assigning an arbitrary significance score.
        neg_log10_p = np.nan
    elif p_val > 0:
        neg_log10_p = -np.log10(p_val)
    else:
        neg_log10_p = NEG_LOG10_P_CEILING

    t_results.append({
        'Feature': col,
        'p_value': p_val,
        '-log10(p)': neg_log10_p,
        'cohens_d': cohens_d,
        'significant': p_val < 0.05
    })

# Most significant first; NaN p-values sort to the end.
t_df = pd.DataFrame(t_results).sort_values('p_value')
sig_count = t_df['significant'].sum()
print(f"{sig_count}/{len(numerical_cols)} numerical features are statistically significant (p < 0.05)")

fig = go.Figure()
# At most 12 rows: fewer are shown when fewer features pass the threshold.
top_sig = t_df[t_df['significant']].head(12)
fig.add_trace(go.Bar(
    y=top_sig['Feature'],
    x=top_sig['-log10(p)'],
    orientation='h',
    # Bar colour encodes the sign of the effect (d > 0: higher in churners).
    marker_color=[PEACH_DARK if d > 0 else SAGE_DARK for d in top_sig['cohens_d']],
    text=top_sig['p_value'].apply(lambda x: f'p={x:.2e}'),
    textposition='outside'
))
fig.update_layout(
    title="Top 12 Statistically Significant Numerical Features (T-Test)<br><sub>Color: Red = Higher in churners | Green = Lower</sub>",
    xaxis_title="-log₁₀(p-value)",
    yaxis=dict(autorange="reversed"),
    height=600,
    template="plotly_white"
)
fig.show()
fig.write_html(VIZ_DIR / "01_ttest_significance.html")

In [None]:
# Chi-squared test of independence between each categorical feature and the
# churn label, with Cramér's V as the effect size, then a bar chart of V.
print("\n[3/7] Chi-squared tests on categorical features...")
cat_cols = ['International plan', 'Voice mail plan']
chi2_results = []

for col in cat_cols:
    # Columns absent from the loaded data are skipped rather than failing.
    if col not in df.columns:
        continue
    contingency = pd.crosstab(df[col], df['Churn'])
    chi2, p, _dof, _expected = chi2_contingency(contingency)
    n = contingency.sum().sum()
    # Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1))). When the feature
    # or the label has a single level the denominator is 0 and the statistic
    # is undefined; report 0.0 (no measurable association) instead of letting
    # a NaN/inf reach the table and the chart.
    min_dim = min(contingency.shape) - 1
    cramers_v = np.sqrt(chi2 / (n * min_dim)) if n > 0 and min_dim > 0 else 0.0

    chi2_results.append({
        'Feature': col,
        'chi2': chi2,
        'p_value': p,
        'cramers_v': cramers_v
    })

chi2_df = pd.DataFrame(chi2_results)
print(chi2_df)

fig = px.bar(chi2_df, x='Feature', y='cramers_v', color='Feature',
             color_discrete_map={'International plan': PEACH_DARK, 'Voice mail plan': SAGE_DARK},
             text='cramers_v', text_auto='.3f')
fig.update_layout(title="Cramér's V Effect Size (Categorical Features)",
                  yaxis_title="Cramér's V", template="plotly_white", height=500)
fig.show()
fig.write_html(VIZ_DIR / "02_chi2_effect_size.html")

In [None]:
# Separate target from predictors, run the project's feature-engineering
# helper on the predictors, and report which columns it added.
print("\n[4/7] Applying feature engineering from src/utils/helpers.py...")
y = df['Churn']
X_raw = df.drop(columns='Churn')

X_engineered = engineer_features(X_raw, is_training=True)
print(f"Features: {X_raw.shape[1]} → {X_engineered.shape[1]} (+{X_engineered.shape[1] - X_raw.shape[1]})")

# Columns present after engineering that were not in the raw predictors.
new_features = set(X_engineered.columns).difference(X_raw.columns)
print("New engineered features:", ", ".join(sorted(new_features)))

In [None]:
# Fit a random forest on the engineered features and chart the 20 features
# with the highest impurity-based importance.
print("\n[5/7] Training Random Forest for feature importance...")
rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
rf = rf.fit(X_engineered, y)  # fit() returns the estimator itself

# One row per feature, most important first.
importance_df = pd.DataFrame(
    {'Feature': X_engineered.columns, 'Importance': rf.feature_importances_}
)
importance_df = importance_df.sort_values('Importance', ascending=False)

top_importances = importance_df.head(20)
fig = px.bar(
    top_importances,
    x='Importance',
    y='Feature',
    orientation='h',
    color='Importance',
    color_continuous_scale=[[0, SAGE_DARK], [1, PEACH_DARK]],
    text='Importance',
    text_auto='.4f',
)
fig.update_layout(
    title="Top 20 Features – Random Forest Importance",
    yaxis={'autorange': "reversed"},  # keep the top-ranked feature at the top
    height=700,
    template="plotly_white",
)
fig.show()
fig.write_html(VIZ_DIR / "03_rf_importance.html")

In [None]:
# Recursive feature elimination with a random-forest estimator: drop one
# feature per step until 20 remain, then list and chart the outcome.
print("\n[6/7] Running RFE (same as training pipeline)...")
rfe_estimator = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rfe = RFE(estimator=rfe_estimator, n_features_to_select=20, step=1)
rfe.fit(X_engineered, y)

# support_ is a boolean mask over the columns; ranking_ is 1 for kept features.
selected = [name for name, kept in zip(X_engineered.columns, rfe.support_) if kept]
ranking = pd.DataFrame(
    {'Feature': X_engineered.columns, 'Ranking': rfe.ranking_}
).sort_values('Ranking')

print(f"Selected {len(selected)} features:")
for i, f in enumerate(selected, 1):
    print(f"  {i:2d}. {f}")

fig = px.bar(
    ranking.head(25),
    y='Feature',
    x='Ranking',
    orientation='h',
    color='Ranking',
    color_continuous_scale='Peach',
    text='Ranking',
)
fig.update_layout(
    title="RFE Feature Ranking (Lower = Better)",
    yaxis={'autorange': "reversed"},
    height=700,
    template="plotly_white",
)
fig.show()
fig.write_html(VIZ_DIR / "04_rfe_ranking.html")

In [None]:
# Bucket customers by usage and by number of service calls, compute the churn
# rate per bucket (and per bucket pair), and draw a three-panel dashboard.
print("\n[7/7] Building customer segmentation analysis...")
df_seg = X_engineered.copy()
df_seg['Churn'] = y

# Usage segment
# NOTE(review): when 'Total_Minutes' is missing this silently falls back to
# the first column, whatever it is - confirm that is intended.
# include_lowest=True closes the first bin on the left so a value of exactly
# 0 lands in 'Low' instead of getting no segment (the service-call bins below
# achieve the same by starting at -1).
df_seg['Usage_Segment'] = pd.cut(df_seg.get('Total_Minutes', df_seg.iloc[:, 0]),
                                 bins=[0, 500, 900, float('inf')],
                                 labels=['Low', 'Medium', 'High'],
                                 include_lowest=True)

# Service calls segment
df_seg['Service_Segment'] = pd.cut(df_seg['Customer service calls'],
                                   bins=[-1, 1, 3, float('inf')],
                                   labels=['Low (0-1)', 'Medium (2-3)', 'High (4+)'])

# Churn by segment
# observed=False is passed explicitly: every label must appear (even if no
# customer falls in it) so the three-colour lists and the 3x3 heatmap below
# stay aligned regardless of the pandas default for categorical groupers.
seg1 = df_seg.groupby('Usage_Segment', observed=False)['Churn'].mean().reset_index()
seg2 = df_seg.groupby('Service_Segment', observed=False)['Churn'].mean().reset_index()
pivot = df_seg.groupby(['Usage_Segment', 'Service_Segment'], observed=False)['Churn'].mean().unstack() * 100

fig = make_subplots(rows=1, cols=3, subplot_titles=(
    "Churn by Total Usage", "Churn by Service Calls", "High-Risk Heatmap"))
fig.add_trace(go.Bar(x=seg1['Usage_Segment'], y=seg1['Churn']*100, marker_color=[SAGE, PEACH, PEACH_DARK],
                     text=(seg1['Churn']*100).round(1), texttemplate='%{text}%'), row=1, col=1)
fig.add_trace(go.Bar(x=seg2['Service_Segment'], y=seg2['Churn']*100, marker_color=[SAGE_DARK, PEACH, PEACH_DARK],
                     text=(seg2['Churn']*100).round(1), texttemplate='%{text}%'), row=1, col=2)
fig.add_trace(go.Heatmap(z=pivot.values, x=pivot.columns, y=pivot.index,
                         colorscale=[[0, SAGE_DARK], [0.5, NEUTRAL], [1, PEACH_DARK]],
                         text=pivot.values.round(1), texttemplate='%{text}%'), row=1, col=3)
fig.update_layout(title_text="Customer Segmentation: Where Churn Lives", height=500, template="plotly_white")
fig.show()
fig.write_html(VIZ_DIR / "05_segmentation_dashboard.html")

In [None]:
import joblib
from sklearn.metrics import roc_curve, roc_auc_score
import shap
shap.initjs()

print("\n[8/8] Building ROC Curve (Before vs After) + SHAP Bar Chart...")

# Persisted artefacts: the held-out test split plus the two fitted models
# being compared (baseline logistic regression vs. the final best model).
MODELS_DIR = PROJECT_ROOT.joinpath("models", "trained_models")
ARTIFACTS_DIR = PROJECT_ROOT.joinpath("models", "artifacts")

X_test, y_test = joblib.load(ARTIFACTS_DIR / "test_data.pkl")

baseline_model = joblib.load(MODELS_DIR / "logistic_regression.pkl")
best_model = joblib.load(ARTIFACTS_DIR / "best_model_final.pkl")

# Baseline ("before"): column 1 of predict_proba, then its ROC points and AUC.
y_prob_before = baseline_model.predict_proba(X_test)[:, 1]
fpr_b, tpr_b, _ = roc_curve(y_test, y_prob_before)
auc_before = roc_auc_score(y_test, y_prob_before)

# Best model ("after"): same quantities.
y_prob_after = best_model.predict_proba(X_test)[:, 1]
fpr_a, tpr_a, _ = roc_curve(y_test, y_prob_after)
auc_after = roc_auc_score(y_test, y_prob_after)

# Both curves plus the diagonal chance line, in drawing order.
fig_roc = go.Figure(data=[
    go.Scatter(
        x=fpr_b, y=tpr_b,
        mode="lines",
        name=f"Before (AUC = {auc_before:.3f})",
        line={"color": PEACH_DARK, "width": 3},
    ),
    go.Scatter(
        x=fpr_a, y=tpr_a,
        mode="lines",
        name=f"After (AUC = {auc_after:.3f})",
        line={"color": SAGE_DARK, "width": 3},
    ),
    go.Scatter(
        x=[0, 1], y=[0, 1],
        mode="lines",
        line={"dash": "dash", "color": "#888"},
        name="Random",
    ),
])

fig_roc.update_layout(
    title="Before vs After ROC Curve",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    template="plotly_white",
    height=500,
)

fig_roc.show()
fig_roc.write_html(VIZ_DIR / "06_before_vs_after_roc.html")

In [None]:
# -------------------------------------
# SHAP Feature Importance (Top 8)
# -------------------------------------
# XGBoost or forest models → TreeExplainer; anything else → KernelExplainer.
if "XGBClassifier" in str(type(best_model)) or "Forest" in str(type(best_model)):
    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(X_test)
else:
    # Fallback for Logistic Regression etc.: model-agnostic explainer with a
    # 200-row background sample drawn from the test set.
    explainer = shap.KernelExplainer(best_model.predict_proba, shap.sample(X_test, 200))
    shap_values = explainer.shap_values(X_test)

# Normalise the explainer output to a single (n_samples, n_features) matrix
# for the positive class. Depending on the SHAP release and the model,
# shap_values() yields one of:
#   * a list with one 2-D array per class (older releases, classifiers),
#   * a 3-D array (n_samples, n_features, n_classes),
#   * a plain 2-D array (single-output models).
# A list has no `.ndim`, so it has to be handled before the array cases.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1] if len(shap_values) > 1 else shap_values[0]
else:
    shap_array = np.asarray(shap_values)
    positive_class_shap = shap_array[:, :, 1] if shap_array.ndim == 3 else shap_array

shap_df = pd.DataFrame(positive_class_shap, columns=X_test.columns)

# Global importance = mean absolute SHAP value per feature; keep the top 8.
importance = shap_df.abs().mean().sort_values(ascending=False).head(8)

# Reversed so the most important feature is drawn at the top of the chart.
fig_shap = go.Figure(go.Bar(
    x=importance.values[::-1],
    y=importance.index[::-1],
    orientation="h",
    marker_color=SAGE_DARK,
    text=[f"{v:.4f}" for v in importance.values[::-1]],
    textposition="outside"
))

fig_shap.update_layout(
    title="Top 8 SHAP Features<br><sup>Mean |SHAP| (Global Importance)</sup>",
    xaxis_title="Mean |SHAP Value|",
    template="plotly_white",
    height=500
)

fig_shap.show()
fig_shap.write_html(VIZ_DIR / "07_shap_top8.html")

print("✔ Visualizations saved to /visualizations/interactive/")