# =============================================================
# MILESTONE 1: Data Collection, Exploration, and Preprocessing
# =============================================================

In [None]:
%pip install seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [None]:
# --- Colour constants (hex) shared by the figures in this notebook ---
PEACH, PEACH_DARK = '#FFCBA4', '#FF9A76'
SAGE, SAGE_DARK = '#A8C686', '#7A9B57'
NEUTRAL = '#F5F5DC'
ACCENT = '#E07B39'

# Apply the seaborn theme first, then set the matplotlib defaults in one go.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
    'axes.titleweight': 'bold',
    'axes.titlesize': 16,
})

# Two-colour schemes for the churn flag: first entry / False -> SAGE_DARK,
# second entry / True -> PEACH_DARK.
custom_palette = [SAGE_DARK, PEACH_DARK]
churn_colors = dict(zip((False, True), custom_palette))

In [None]:
# Resolve the project root. If the notebook is started from inside the
# `notebooks/` folder the root is one level up; otherwise the working
# directory itself is used. The check compares the directory *name* rather
# than searching the whole path string, so an unrelated ancestor such as
# `/home/me/notebooks_archive/project` no longer shifts the root by mistake.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd

# Input data set and the output folder for static figures.
RAW_PATH = PROJECT_ROOT / "data" / "raw" / "merged_churn_data.csv"
VIZ_DIR = PROJECT_ROOT / "visualizations" / "static"
# Create the figure folder (and any missing parents); a no-op if it exists.
VIZ_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the raw data set from RAW_PATH into a DataFrame.
df = pd.read_csv(RAW_PATH)
# The multiplication sign is written as an escape (\u00d7) so the message
# survives a wrong file encoding; the previous literal had been mangled
# into mojibake.
print(f"Loaded {df.shape[0]:,} rows \u00d7 {df.shape[1]} columns")

In [None]:
# Display the (rows, columns) tuple of the loaded frame.
df.shape

In [None]:
# Show every column label as a plain Python list.
df.columns.tolist()

In [None]:
# Target encoding check: make sure `Churn` is stored as 0/1 integers.
#
# * boolean column  -> cast directly
# * object / string -> compare the normalised text against known "positive"
#   spellings. The earlier `== True` comparison is False for every string
#   (e.g. 'True', 'Yes'), which silently turned the whole target into zeros.
# * numeric column  -> left untouched
_churn_raw = df['Churn']
if pd.api.types.is_bool_dtype(_churn_raw):
    df['Churn'] = _churn_raw.astype(int)
elif pd.api.types.is_object_dtype(_churn_raw) or pd.api.types.is_string_dtype(_churn_raw):
    # Python bools held in an object column stringify to 'True'/'False', so
    # they are still handled; missing values become 'nan' and map to 0, as
    # they did before.
    _positive = {'true', 'yes', 'y', 't', '1', '1.0'}
    _normalised = _churn_raw.astype(str).str.strip().str.lower()
    df['Churn'] = _normalised.isin(_positive).astype(int)

In [None]:
# Null count per column and the same figure as a percentage of all rows.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# Combine both into one table, worst columns first.
missing_df = pd.DataFrame({"Missing": missing, "%": missing_pct})
missing_df = missing_df.sort_values(by="%", ascending=False)

# Report only the columns that actually contain nulls.
has_missing = missing_df["Missing"].gt(0)
print("Missing Values:\n", missing_df.loc[has_missing])

In [None]:
# Count rows that repeat an earlier row exactly, and express that count as a
# percentage of the data set.
duplicates = df.duplicated().sum()
duplicate_pct = duplicates / len(df) * 100
print(f"Duplicate rows: {duplicates} ({duplicate_pct:.2f}%)")

In [None]:
# Mean of the churn flag, in percent (assumes `Churn` is 0/1 or boolean).
churn_rate = df['Churn'].mean() * 100

# Decide the label separately from the message. Previously the conditional
# expression wrapped the whole print argument, so the "balanced" branch
# printed only the bare word and dropped the rate.
# Balance is judged on the minority class, so a very high churn rate is
# reported as imbalanced too.
minority_share = min(churn_rate, 100 - churn_rate)
balance_label = "imbalanced" if minority_share < 30 else "balanced"
print(f"Churn rate: {churn_rate:.2f}% ({balance_label})")

In [None]:
# Bar chart of the class counts; every bar is annotated with its count and
# its percentage of all rows.
fig, ax = plt.subplots()
sns.countplot(data=df, x='Churn', palette=[SAGE_DARK, PEACH_DARK], ax=ax)
ax.set(title='Customer Churn Distribution',
       xlabel='Churn (1 = Yes)',
       ylabel='Count')

for bar in ax.patches:
    height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    share = height / len(df) * 100
    # Label sits 50 count-units above the bar top.
    ax.text(bar_center, height + 50, f'{int(height)}\n({share:.1f}%)',
            ha="center", fontsize=12)

plt.tight_layout()
plt.savefig(VIZ_DIR / "01_churn_distribution.png", dpi=200, bbox_inches='tight')
plt.show()

In [None]:
# Numerical features whose distributions are compared between churn classes.
num_cols = ['Account length', 'Total day minutes', 'Total eve minutes',
            'Total night minutes', 'Total intl minutes', 'Customer service calls',
            'Number vmail messages']
# 'Total day calls' is plotted as well but deliberately kept out of num_cols.
hist_cols = num_cols + ['Total day calls']

# Seaborn's automatic legend shows the raw hue values; map them to readable
# names. Relabelling by value (instead of calling `legend([...])` with a bare
# label list) keeps each label attached to the right colour: a labels-only
# legend call pairs labels with artists in draw order, and seaborn draws
# layered hue levels in reverse, which can swap the two names.
churn_labels = {'0': 'No Churn', '1': 'Churn',
                'False': 'No Churn', 'True': 'Churn'}

fig, axes = plt.subplots(3, 3, figsize=(16, 14))
axes = axes.ravel()
# zip() stops at the shorter sequence, so it can never index past the grid.
for ax, col in zip(axes, hist_cols):
    sns.histplot(data=df, x=col, hue='Churn', kde=True, palette=churn_colors,
                 ax=ax, alpha=0.7, bins=40, stat="density")
    ax.set_title(f'Distribution of {col}')
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('Churn')
        for entry in legend.get_texts():
            entry.set_text(churn_labels.get(entry.get_text(), entry.get_text()))

# Hide grid cells that received no plot (the 3x3 grid has one spare slot).
for ax in axes[len(hist_cols):]:
    ax.set_visible(False)

plt.suptitle('Numerical Feature Distributions by Churn', fontsize=18, y=0.98)
plt.tight_layout()
plt.savefig(VIZ_DIR / "02_numerical_distributions.png", dpi=200, bbox_inches='tight')
plt.show()

In [None]:
# Categorical features shown as 100%-stacked bars split by churn outcome.
cat_cols = ['International plan', 'Voice mail plan']

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, col in zip(axes, cat_cols):
    # A column that is absent from the data leaves its panel empty.
    if col in df.columns:
        # Row-normalised crosstab: each category's rows sum to 100 %.
        cross = pd.crosstab(df[col], df['Churn'], normalize='index') * 100
        cross.plot(kind='bar', stacked=True, ax=ax, color=[SAGE_DARK, PEACH_DARK])
        ax.set_title(f'Churn Rate by {col}')
        ax.set_ylabel('Percentage (%)')
        ax.legend(['No Churn', 'Churn'])
        # Print the percentage on every bar segment.
        for segment_group in ax.containers:
            ax.bar_label(segment_group, fmt='%.1f%%', fontsize=10)

plt.suptitle('Churn Rate by Categorical Features', fontsize=16)
plt.tight_layout()
plt.savefig(VIZ_DIR / "03_categorical_churn_rates.png", dpi=200, bbox_inches='tight')
plt.show()

In [None]:
# Grouped bar chart: customers per number of service calls, split by churn.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Customer service calls', hue='Churn',
              palette=churn_colors, ax=ax)
ax.set_title('Customer Service Calls vs Churn (Strong Predictor)')
ax.set_xlabel('Number of Customer Service Calls')
ax.set_ylabel('Count')
ax.legend(['No Churn', 'Churn'])

# Write the count above each bar, skipping bars without a positive height.
for bar in ax.patches:
    height = bar.get_height()
    if not height > 0:
        continue
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, height + 20, f'{int(height)}', ha="center", fontsize=10)

plt.tight_layout()
plt.savefig(VIZ_DIR / "04_service_calls_vs_churn.png", dpi=200, bbox_inches='tight')
plt.show()

In [None]:
# Work on a copy so the plan columns can be recoded without touching `df`.
df_corr = df.copy()
plan_cols = [c for c in ('International plan', 'Voice mail plan')
             if c in df_corr.columns and df_corr[c].dtype == 'object']
for plan in plan_cols:
    # 'Yes' -> 1, anything else -> 0
    df_corr[plan] = df_corr[plan].eq('Yes').astype(int)

# Pairwise correlations over the numeric columns only.
df_numeric = df_corr.select_dtypes(include=[np.number])
corr = df_numeric.corr()

# Mask the upper triangle (diagonal included) so each pair appears once.
mask = np.triu(np.ones_like(corr, dtype=bool))
heatmap_options = dict(mask=mask, annot=True, fmt='.2f', cmap='RdYlGn',
                       center=0, square=True, linewidths=0.5,
                       cbar_kws={"shrink": .7})

plt.figure(figsize=(14, 11))
sns.heatmap(corr, **heatmap_options)
plt.title('Feature Correlation Matrix', pad=20)
plt.tight_layout()
plt.savefig(VIZ_DIR / "05_correlation_heatmap.png", dpi=200, bbox_inches='tight')
plt.show()

# Ten features with the largest absolute correlation to Churn
# (the sign is discarded by .abs()).
churn_corr = corr['Churn'].drop('Churn')
top_corr = churn_corr.abs().sort_values(ascending=False).head(10)
print("\nTop 10 Features Correlated with Churn:")
print(top_corr)

In [None]:
# Features screened for outliers.
key_features = ['Total day minutes', 'Total eve minutes', 'Total night minutes',
                'Total intl minutes', 'Customer service calls']


def _iqr_outlier_row(frame, feature):
    """Return the outlier count and percentage of *feature* in *frame*.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile.
    """
    q1 = frame[feature].quantile(0.25)
    q3 = frame[feature].quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    flagged = (frame[feature] < low_fence) | (frame[feature] > high_fence)
    n_flagged = int(flagged.sum())
    return {
        'Feature': feature,
        'Outliers': n_flagged,
        '%': n_flagged / len(frame) * 100,
    }


outliers_summary = [_iqr_outlier_row(df, col) for col in key_features]

outlier_df = pd.DataFrame(outliers_summary)
print("\nOutlier Summary (IQR Method):")
print(outlier_df)

In [None]:
# Density plots: one figure per feature, overlaying the KDE of customers with
# Churn == 0 and the KDE of customers with Churn == 1.
# Each entry is (column, figure title, output file name).
density_specs = [
    ('Account length', 'Account Length Density by Churn',
     "02_account_length_density.png"),
    ('Total day minutes', 'Total Day Minutes Density by Churn',
     "03_day_minutes_density.png"),
    ('Total eve minutes', 'Total Eve Minutes Density by Churn',
     "04_eve_minutes_density.png"),
    ('Total night minutes', 'Total Night Minutes Density by Churn',
     "05_night_minutes_density.png"),
    ('Total intl minutes', 'Total International Minutes Density by Churn',
     "06_intl_minutes_density.png"),
    ('Customer service calls', 'Customer Service Calls Density by Churn',
     "08_svc_calls_density.png"),
]

# Split once; the same two subsets feed every figure.
no_churn_rows = df[df['Churn'] == 0]
churn_rows = df[df['Churn'] == 1]

for feature, title, filename in density_specs:
    plt.figure()
    sns.kdeplot(data=no_churn_rows, x=feature, fill=True,
                color=SAGE_DARK, label='No Churn')
    sns.kdeplot(data=churn_rows, x=feature, fill=True,
                color=PEACH_DARK, label='Churn')
    plt.title(title, fontsize=14, fontweight='bold')
    plt.legend()
    plt.savefig(VIZ_DIR / filename, dpi=200, bbox_inches='tight')
    plt.show()

In [None]:
# Count plots of the two plan flags split by churn outcome (figures 9 and 10).
# Each entry is (column, figure title, output file name).
plan_specs = [
    ('International plan', 'International Plan vs Customer Churn',
     "09_international_plan_churn.png"),
    ('Voice mail plan', 'Voice Mail Plan vs Customer Churn',
     "10_voice_mail_plan_churn.png"),
]

for plan_col, title, filename in plan_specs:
    plt.figure()
    sns.countplot(data=df, x=plan_col, hue='Churn',
                  palette=[SAGE_DARK, PEACH_DARK])
    plt.title(title, fontsize=14, fontweight='bold')
    plt.legend(['No Churn', 'Churn'])
    plt.savefig(VIZ_DIR / filename, dpi=200, bbox_inches='tight')
    plt.show()

In [None]:
# 11. Final Correlation Heatmap (classic style)
# Recode the plan flags on a copy so `df` itself stays untouched.
df_corr = df.copy()
for col in ['International plan', 'Voice mail plan']:
    if col not in df_corr.columns:
        continue  # tolerate a missing column instead of raising KeyError
    if df_corr[col].dtype == 'object':
        # 'Yes' -> 1, anything else -> 0
        df_corr[col] = (df_corr[col] == 'Yes').astype(int)
    elif df_corr[col].dtype == bool:
        # Booleans are not picked up by select_dtypes(np.number); cast them.
        df_corr[col] = df_corr[col].astype(int)
    # Columns that are already numeric are left alone. Comparing them with
    # 'Yes' unconditionally (as before) would overwrite them with zeros.

corr = df_corr.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(14, 11))
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": .7})
plt.title('Feature Correlation Matrix', fontsize=16, pad=20, fontweight='bold')
plt.savefig(VIZ_DIR / "11_correlation_heatmap.png", dpi=200, bbox_inches='tight')
plt.show()