# =============================================================
# MILESTONE 2: Advanced Data Analysis and Feature Engineering
# =============================================================

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from scipy import stats
from scipy.stats import chi2_contingency

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [22]:
# Set plotting style
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

In [23]:
url = "../data/interim/initial_cleaned_data.csv"
df = pd.read_csv(url)

print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")


Dataset loaded: 3333 rows, 67 columns


# ========================
# 1. Statistical Tests
# ========================

In [24]:
# T-test for numerical features
print("\n--- T-test Results for Numerical Features ---")
numerical_cols = df.select_dtypes(include=[np.number]).columns.drop(['Churn', 'High_Customer_Service']).tolist()
t_test_results = []
for col in numerical_cols:
    group_churn = df[df['Churn'] == True][col]
    group_no_churn = df[df['Churn'] == False][col]
    t_stat, p_value = stats.ttest_ind(group_no_churn, group_churn, equal_var=False)
    significance = 'Significant' if p_value < 0.2 else 'Not Significant'  # Relaxed threshold
    print(f"{col:<25}: t-stat={t_stat:.3f}, p-value={p_value:.5f} -> {significance}")
    t_test_results.append({'Feature': col, 't-stat': t_stat, 'p-value': p_value, 'Significance': significance})


--- T-test Results for Numerical Features ---
Account length           : t-stat=-0.948, p-value=0.34333 -> Not Significant
Number vmail messages    : t-stat=5.821, p-value=0.00000 -> Significant
Total day minutes        : t-stat=-9.697, p-value=0.00000 -> Significant
Total day calls          : t-stat=-1.087, p-value=0.27731 -> Not Significant
Total eve minutes        : t-stat=-5.234, p-value=0.00000 -> Significant
Total eve calls          : t-stat=-0.472, p-value=0.63733 -> Not Significant
Total night minutes      : t-stat=-2.205, p-value=0.02780 -> Significant
Total night calls        : t-stat=-0.346, p-value=0.72950 -> Not Significant
Total intl minutes       : t-stat=-3.793, p-value=0.00016 -> Significant
Total intl calls         : t-stat=3.402, p-value=0.00071 -> Significant
Customer service calls   : t-stat=-7.589, p-value=0.00000 -> Significant


In [25]:
# Chi-squared tests for categorical features
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
print("\n--- Chi-squared Test Results ---")
chi2_results = []
for col in categorical_cols:
    contingency_table = pd.crosstab(df[col], df['Churn'])
    chi2, p_value, _, _ = chi2_contingency(contingency_table)
    significance = 'Significant' if p_value < 0.2 else 'Not Significant'  # Relaxed threshold
    chi2_results.append({'Feature': col, 'Chi2': chi2, 'p-value': p_value, 'Significance': significance})

chi2_df = pd.DataFrame(chi2_results).sort_values(by='p-value')
print(chi2_df)


--- Chi-squared Test Results ---
                   Feature        Chi2       p-value     Significance
52  International plan_Yes  222.565757  2.493108e-50      Significant
53     Voice mail plan_Yes   34.131660  5.150640e-09      Significant
30                State_NJ    7.082219  7.785253e-03      Significant
42                State_TX    5.720015  1.677259e-02      Significant
19                State_MD    4.757345  2.917352e-02      Significant
44                State_VA    3.435014  6.382803e-02      Significant
39                State_SC    3.162475  7.534888e-02      Significant
3                 State_CA    3.061232  8.018087e-02      Significant
2                 State_AZ    2.930757  8.690672e-02      Significant
21                State_MI    2.737304  9.803006e-02      Significant
10                State_HI    2.704046  1.000940e-01      Significant
24                State_MS    2.108423  1.464902e-01      Significant
32                State_NV    1.932201  1.645178e-01    

# ========================
# 2. Feature Engineering 
# ========================

In [26]:
# Create new features
# Customer tenure (normalize Account length)
df['Tenure'] = df['Account length'] / df['Account length'].max()

# Usage patterns: Total minutes and calls across all periods
df['Total_minutes'] = df['Total day minutes'] + df['Total eve minutes'] + df['Total night minutes'] + df['Total intl minutes']
df['Total_calls'] = df['Total day calls'] + df['Total eve calls'] + df['Total night calls'] + df['Total intl calls']

# Usage ratios
df['Day_minutes_per_call'] = df['Total day minutes'] / (df['Total day calls'] + 1)  # Add 1 to avoid division by zero
df['Eve_minutes_per_call'] = df['Total eve minutes'] / (df['Total eve calls'] + 1)
df['Night_minutes_per_call'] = df['Total night minutes'] / (df['Total night calls'] + 1)

# Interaction term: Customer service calls * International plan
df['Intl_plan_service_interaction'] = df['Customer service calls'] * df['International plan_Yes']

In [27]:
# Update numerical columns with new features
numerical_cols += ['Tenure', 'Total_minutes', 'Total_calls', 'Day_minutes_per_call', 
                   'Eve_minutes_per_call', 'Night_minutes_per_call', 'Intl_plan_service_interaction']

# ========================
# 3. Feature Selection
# ========================

In [28]:
# Select significant features (p < 0.2 + domain knowledge)
significant_features = [
    col for col in numerical_cols if col not in ['Account length'] or t_test_results[numerical_cols.index(col)]['p-value'] < 0.2
] + [
    col for col in categorical_cols if chi2_df[chi2_df['Feature'] == col]['p-value'].iloc[0] < 0.2
] + ['High_Customer_Service']
domain_features = ['Account length', 'Total day calls', 'Total eve calls', 'Total night calls']
significant_features = list(set(significant_features + domain_features))
print(f"\nSignificant features (p < 0.2 + domain): {len(significant_features)} features - {significant_features}")



Significant features (p < 0.2 + domain): 34 features - ['Total intl calls', 'State_MS', 'Eve_minutes_per_call', 'State_WA', 'Total day minutes', 'Number vmail messages', 'Total_calls', 'State_MD', 'Night_minutes_per_call', 'Total_minutes', 'High_Customer_Service', 'State_TX', 'International plan_Yes', 'Total intl minutes', 'Total eve calls', 'State_NV', 'Tenure', 'State_CA', 'Voice mail plan_Yes', 'State_VA', 'Total eve minutes', 'Total day calls', 'Total night calls', 'State_AZ', 'State_HI', 'State_SC', 'Account length', 'Customer service calls', 'State_MI', 'Total night minutes', 'Day_minutes_per_call', 'State_WV', 'Intl_plan_service_interaction', 'State_NJ']


In [29]:
# Remove highly correlated features (threshold 0.95)
corr_matrix = df[significant_features].corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
print(f"\nDropping highly correlated features: {to_drop}")
df = df.drop(to_drop, axis=1)
significant_features = [f for f in significant_features if f not in to_drop]


Dropping highly correlated features: ['Voice mail plan_Yes', 'Account length']


In [30]:
# Scale numerical features
scaler = StandardScaler()
numerical_cols = [col for col in significant_features if col in df.select_dtypes(include=[np.number]).columns]
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [31]:
# RFE with Random Forest
X = df[significant_features]
y = df['Churn']
model = RandomForestClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=15)  
rfe.fit(X, y)
selected_features = X.columns[rfe.support_].tolist()
print(f"\nSelected features by RFE (Random Forest): {selected_features}")


Selected features by RFE (Random Forest): ['Total intl calls', 'Eve_minutes_per_call', 'Total day minutes', 'Number vmail messages', 'Night_minutes_per_call', 'Total_minutes', 'High_Customer_Service', 'International plan_Yes', 'Total intl minutes', 'Tenure', 'Total eve minutes', 'Total day calls', 'Customer service calls', 'Total night minutes', 'Day_minutes_per_call']


In [32]:
# Feature importance for validation
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X, y)
importances = pd.Series(model_rf.feature_importances_, index=X.columns)
print("\nFeature Importances (Random Forest):")
print(importances.sort_values(ascending=False))


Feature Importances (Random Forest):
Total day minutes                0.144338
Total_minutes                    0.130230
Day_minutes_per_call             0.062577
International plan_Yes           0.059450
Customer service calls           0.057966
Total intl minutes               0.056727
High_Customer_Service            0.056603
Total eve minutes                0.050086
Total intl calls                 0.047459
Number vmail messages            0.038168
Eve_minutes_per_call             0.036510
Total night minutes              0.035782
Total day calls                  0.034838
Night_minutes_per_call           0.032644
Total_calls                      0.030745
Tenure                           0.027771
Total night calls                0.027098
Total eve calls                  0.026928
Intl_plan_service_interaction    0.020525
State_TX                         0.002930
State_NJ                         0.002912
State_MS                         0.002744
State_MI                         0.002

In [33]:
# Save processed dataset
output_dir = "../data/processed"
os.makedirs(output_dir, exist_ok=True)
df_selected = df[selected_features + ['Churn']]
df_selected.to_csv(os.path.join(output_dir, "processed_data.csv"), index=False)
print(f"\nProcessed dataset saved: {os.path.join(output_dir, 'processed_data.csv')}")


Processed dataset saved: ../data/processed\processed_data.csv


In [34]:
df_selected.shape

(3333, 16)

# ========================
# 3. Data Visualization
# ========================

In [35]:
# Box plot: Numerical features by Churn
if numerical_cols:
    top_n_features = importances[numerical_cols].sort_values(ascending=False).index[:3].tolist()  # Top 3 numerical features
    box_fig = make_subplots(rows=1, cols=len(top_n_features), subplot_titles=top_n_features)
    for i, feature in enumerate(top_n_features, 1):
        box_fig.add_trace(
            go.Box(x=df_selected['Churn'], y=df_selected[feature], marker_color=colors[True], showlegend=False),
            row=1, col=i
        )
    box_fig.update_layout(
        title_text="Box Plots of Top Numerical Features by Churn", title_x=0.5,
        template="plotly_white", height=500, width=1200
    )
    box_fig.show()
    box_fig.write_html(os.path.join(output_dir_viz, "box_plots.html"))
    print(f"Box plots saved: {os.path.join(output_dir_viz, 'box_plots.html')}")

# Correlation heatmap for numerical features
if numerical_cols:
    corr_matrix = df_selected[numerical_cols].corr()
    heatmap_fig = px.imshow(
        corr_matrix, text_auto=".2f", color_continuous_scale="RdBu_r",
        title="Correlation Heatmap of Numerical Features"
    )
    heatmap_fig.update_layout(height=600, width=800, title_x=0.5, template="plotly_white")
    heatmap_fig.show()
    heatmap_fig.write_html(os.path.join(output_dir_viz, "correlation_heatmap.html"))
    print(f"Correlation heatmap saved: {os.path.join(output_dir_viz, 'correlation_heatmap.html')}")

# Scatter plot: Top two numerical features by Churn
if len(numerical_cols) >= 2:
    top_two_features = importances[numerical_cols].sort_values(ascending=False).index[:2].tolist()
    scatter_fig = px.scatter(
        df_selected, x=top_two_features[0], y=top_two_features[1], color="Churn",
        color_discrete_map=colors, title=f"Scatter Plot: {top_two_features[0]} vs {top_two_features[1]} by Churn",
        opacity=0.6
    )
    scatter_fig.update_layout(template="plotly_white", height=600, width=800, title_x=0.5)
    scatter_fig.show()
    scatter_fig.write_html(os.path.join(output_dir_viz, "scatter_plot.html"))
    print(f"Scatter plot saved: {os.path.join(output_dir_viz, 'scatter_plot.html')}")
else:
    print("Not enough numerical features for scatter plot.")


Box plots saved: ../visualizations/interactive\box_plots.html


KeyError: "['Total_calls', 'Total eve calls', 'Total night calls', 'Intl_plan_service_interaction'] not in index"

# -------------------------------------
# 3.2.dashboard
# -------------------------------------

In [None]:
# Color scheme
colors = {False: "#1f77b4", True: "#ff4040"}

# Pie chart: Churn distribution
fig_pie = px.pie(
    df_selected, names="Churn", color="Churn",
    color_discrete_map=colors, hole=0.4,
    title="Churn Distribution"
)
fig_pie.update_traces(
    textinfo="percent+label", pull=[0, 0.05], 
    marker=dict(line=dict(color="#ffffff", width=2)),
    hovertemplate="Churn: %{label}<br>Percentage: %{percent:.1%}<extra></extra>"
)

# Bar chart: Significant categorical features
sig_cat_cols = [col for col in selected_features if col in df.select_dtypes(exclude=[np.number]).columns]
if sig_cat_cols:
    churn_by_cat = df_selected[sig_cat_cols + ['Churn']].melt(id_vars='Churn', var_name='Feature', value_name='Value')
    churn_by_cat = churn_by_cat[churn_by_cat['Value'] == 1].groupby(['Feature', 'Churn']).size().reset_index(name='Count')
    fig_bar = px.bar(
        churn_by_cat, x='Feature', y='Count', color='Churn',
        color_discrete_map=colors, barmode='group',
        title="Churn by Categorical Features"
    )
    fig_bar.update_layout(xaxis_tickangle=-45)
    fig_bar.update_traces(marker=dict(line=dict(color="#ffffff", width=1.5)))
else:
    fig_bar = px.bar(title="No significant categorical features selected")

# Histogram: Top numerical feature
numerical_cols = [col for col in selected_features if col in df.select_dtypes(include=[np.number]).columns]
if numerical_cols:
    top_num_feature = importances[numerical_cols].idxmax()
    fig_hist = px.histogram(
        df_selected, x=top_num_feature, color="Churn", nbins=40, barmode="overlay",
        color_discrete_map=colors, title=f"Distribution of {top_num_feature}"
    )
    fig_hist.update_traces(opacity=0.75, marker=dict(line=dict(color="#ffffff", width=1)))
else:
    fig_hist = px.histogram(title="No numerical features selected")

# Box plot: Second most important numerical feature
if len(numerical_cols) > 1:
    second_num_feature = importances[numerical_cols].sort_values(ascending=False).index[1]
    fig_box = px.box(
        df_selected, x="Churn", y=second_num_feature, color="Churn",
        color_discrete_map=colors, title=f"{second_num_feature} by Churn"
    )
    fig_box.update_traces(marker=dict(line=dict(color="#ffffff", width=1)))
else:
    fig_box = px.box(title="Not enough numerical features for box plot")

# Combine into dashboard
dashboard = make_subplots(
    rows=2, cols=2,
    specs=[[{'type': 'domain'}, {'type': 'xy'}], [{'type': 'xy'}, {'type': 'xy'}]],
    subplot_titles=("Churn Distribution", "Categorical Features", f"{top_num_feature} Distribution", f"{second_num_feature if len(numerical_cols) > 1 else 'No'} Box Plot"),
    horizontal_spacing=0.1, vertical_spacing=0.15
)

for trace in fig_pie.data:
    dashboard.add_trace(trace, row=1, col=1)
for trace in fig_bar.data:
    dashboard.add_trace(trace, row=1, col=2)
for trace in fig_hist.data:
    dashboard.add_trace(trace, row=2, col=1)
for trace in fig_box.data:
    dashboard.add_trace(trace, row=2, col=2)

# Update layout for visual appeal
dashboard.update_layout(
    height=900, width=1400, title_text="Enhanced Customer Churn Dashboard", title_x=0.5, title_font=dict(size=24, family="Arial", color="#333333"),
    template="plotly_white", 
    legend=dict(orientation="h", y=-0.1, x=0.5, xanchor="center", font=dict(size=12)),
    margin=dict(l=60, r=60, t=150, b=80),
    plot_bgcolor="#f8f9fa", paper_bgcolor="#f8f9fa",
    font=dict(family="Arial", color="#333333")
)

# Update axes for consistency
dashboard.update_xaxes(showgrid=True, gridcolor="#e9ecef", zeroline=False)
dashboard.update_yaxes(showgrid=True, gridcolor="#e9ecef", zeroline=False)

# Show and save dashboard
dashboard.show()
output_dir_viz = "../visualizations/interactive"
os.makedirs(output_dir_viz, exist_ok=True)
dashboard.write_html(os.path.join(output_dir_viz, "churn_dashboard.html"))
print(f"\nDashboard saved: {os.path.join(output_dir_viz, 'churn_dashboard.html')}")


Dashboard saved: ../visualizations/interactive\churn_dashboard.html
