In [None]:
# --- Statistical Test ---
import pandas as pd
from scipy.stats import chi2_contingency
import plotly.express as px

# Load data

try:
    df = pd.read_csv("/mnt/data/cleaned_bank_churn.csv")
except FileNotFoundError:
    df = pd.read_csv("../data/cleaned_bank_churn.csv")  # code refactored to handle file not found error using Copilot

# Recreate Income Category if one-hot encoded
income_cols = [c for c in df.columns if c.startswith('Income_Category_')]
if income_cols:
    df['Income_Category'] = df[income_cols].idxmax(axis=1).str.replace('Income_Category_', '')

# Map churn column
df['Attrition_Flag'] = df['Churn'].map({1: 'Attrited Customer', 0: 'Existing Customer'})

# Chi-square test
cont_table = pd.crosstab(df['Income_Category'], df['Attrition_Flag'])
chi2, p, dof, exp = chi2_contingency(cont_table)

print("Chi-square test for Income Category vs Churn")
print(f"Chi2 = {chi2:.3f}, p = {p:.6f}, dof = {dof}")
print("Contingency Table:")
print(cont_table)


Chi-square test for Income Category vs Churn
Chi2 = 10.990, p = 0.026673, dof = 4
Contingency Table:
Attrition_Flag   Attrited Customer  Existing Customer
Income_Category                                      
$40K - $60K                    397               2120
$60K - $80K                    189               1213
$80K - $120K                   242               1293
Less than $40K                 612               2949
Unknown                        187                925


In [3]:
# --- Plotly Visual ---
plot_df = df.groupby(['Income_Category','Attrition_Flag']).size().reset_index(name='count')
fig = px.bar(plot_df, x='Income_Category', y='count', color='Attrition_Flag',
             title='Churn by Income Category', barmode='stack')
fig.update_layout(xaxis_title="Income Category", yaxis_title="Customer Count", xaxis_tickangle=-45)
fig.show()


In [None]:
# save plotly visual to outputs folder

import os

os.makedirs("data/outputs", exist_ok=True)
fig.write_html("data/outputs/income_category_vs_churn.html")