In [71]:
df[["Satisfaction Score", "Monthly Charge", "Tenure in Months"]].dtypes


Satisfaction Score      int64
Monthly Charge        float64
Tenure in Months        int64
dtype: object

In [72]:
num_cols = ["Satisfaction Score", "Monthly Charge", "Tenure in Months"]

for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")


In [73]:
df[num_cols].isnull().sum()


Satisfaction Score    0
Monthly Charge        0
Tenure in Months      0
dtype: int64

In [74]:
df[num_cols] = df[num_cols].fillna(df[num_cols].median())


In [75]:
df[num_cols].describe()


Unnamed: 0,Satisfaction Score,Monthly Charge,Tenure in Months
count,7043.0,7043.0,7043.0
mean,3.244924,64.761692,32.386767
std,1.201657,30.090047,24.542061
min,1.0,18.25,1.0
25%,3.0,35.5,9.0
50%,3.0,70.35,29.0
75%,4.0,89.85,55.0
max,5.0,118.75,72.0


In [None]:
'''
Note:
Several numerical columns were initially stored as object types due to data
quality issues in the raw dataset. These columns were explicitly converted
to numeric formats before statistical analysis.
'''

In [None]:
# Statistical Hypothesis Testing for Customer Churn Analysis

#Purpose:
#To statistically validate whether the churn patterns observed during EDA are significant and not due to random chance.

#I've used a 5% significance level (α = 0.05) for all tests.

In [76]:
from scipy import stats
import numpy as np


In [None]:
# Hypothesis 1: Contract Type vs Churn
'''
Do customers on month-to-month contracts churn more than customers on long-term contracts?

Hypotheses-
H₀ (Null): Contract type and churn are independent
H₁ (Alternative): Contract type and churn are associated

Statistical Test-
Chi-Square Test of Independence
'''

In [77]:
contingency_contract = pd.crosstab(df["Contract"], df["Churn Label"])

chi2, p_value, dof, expected = stats.chi2_contingency(contingency_contract)

chi2, p_value


(1445.2932428377585, 0.0)

In [78]:
n = contingency_contract.sum().sum()
cramers_v = np.sqrt(chi2 / (n * (min(contingency_contract.shape)-1)))
cramers_v


0.4530009803905603

In [None]:
''' Since p < 0.05, we reject the null hypothesis.
Contract type has a statistically significant relationship with churn.
Month-to-month customers are at significantly higher churn risk, suggesting
that long-term contract incentives can reduce churn.
'''



In [None]:
# The Effect Size shows there is a moderate, meaningful difference between the two groups.
#Although not extremely large, the effect is practically meaningful and indicates a noticeable difference between groups

In [None]:
# Hypothesis 2: Satisfaction Score vs Churn
'''
Do churned customers have lower satisfaction scores?

Hypotheses-
H₀: Mean satisfaction score is the same for churned and retained customers
H₁: Mean satisfaction score differs between the two groups

Statistical Test-
Independent Samples t-test
'''

In [79]:
churned = df[df["Churn Label"] == 1]["Satisfaction Score"]
retained = df[df["Churn Label"] == 0]["Satisfaction Score"]

t_stat, p_value = stats.ttest_ind(churned, retained, equal_var=False)

t_stat, p_value


(-94.96694169823348, 0.0)

In [80]:
stats.ttest_ind(
    churned,
    retained,
    equal_var=False,
    nan_policy="omit"
)


TtestResult(statistic=-94.96694169823348, pvalue=0.0, df=3208.609473496411)

In [81]:
def cohens_d(x, y):
    nx, ny = len(x), len(y)
    pooled_std = np.sqrt(((nx-1)*x.var() + (ny-1)*y.var()) / (nx+ny-2))
    return (x.mean() - y.mean()) / pooled_std

cohens_d(churned, retained)


-2.6045587052962307

In [None]:
''This hypothesis showed an exceptionally large effect size, meaning the difference between groups is not only statistically significant but also 
highly impactful from a business perspective

In [None]:
'''The difference in satisfaction scores is statistically significant (p < 0.05).
Churned customers report meaningfully lower satisfaction, making satisfaction
score a strong churn predictor.'''


In [None]:
# Hypothesis 3: Monthly Charges vs Churn
'''
Do higher monthly charges increase churn likelihood?

Hypotheses-
H₀: Average monthly charge is the same for churned and retained customers
H₁: Average monthly charge differs between the two groups

Statistical Test-
Independent Samples t-test
'''

In [82]:
churned_charge = df[df["Churn Label"] == 1]["Monthly Charge"]
retained_charge = df[df["Churn Label"] == 0]["Monthly Charge"]

t_stat, p_value = stats.ttest_ind(churned_charge, retained_charge, equal_var=False)

t_stat, p_value


(18.407526676414655, 8.592449331549745e-73)

In [None]:
'''
Note:
Extremely small p-values are expected given the large dataset size; therefore, effect size metrics were also evaluated to assess practical
significance.
'''

In [83]:
cohens_d(churned_charge, retained_charge)


0.4462834969632656

In [None]:
# The effect size suggests a moderate and consistent difference, reinforcing the reliability of the observed pattern

In [None]:
'''Monthly charges differ significantly between churned and retained customers.
Higher pricing increases churn risk, indicating price sensitivity among customers.'''


In [None]:
# Hypothesis 4: Tenure vs Churn
'''
Do customers with shorter tenure churn more?

Hypotheses-
H₀: Tenure distribution is the same for churned and retained customers
H₁: Tenure distribution differs

Statistical Test-
Mann-Whitney U Test
'''

In [84]:
churned_tenure = df[df["Churn Label"] == 1]["Tenure in Months"]
retained_tenure = df[df["Churn Label"] == 0]["Tenure in Months"]

stats.mannwhitneyu(churned_tenure, retained_tenure, alternative="two-sided")


MannwhitneyuResult(statistic=2505137.5, pvalue=3.386768917118873e-210)

In [None]:
'''
The p-value is effectively zero, providing strong statistical evidence that tenure and satisfaction significantly differ between churned and retained 
customers.
    '''

In [None]:
'''Tenure differs significantly between churned and retained customers.
Early-tenure customers are significantly more likely to churn, highlighting
the importance of onboarding and early engagement.'''


In [None]:
# Hypothesis 5: Premium Tech Support vs Churn
'''
Does premium tech support reduce churn?

Hypotheses-
H₀: Tech support and churn are independent
H₁: Tech support impacts churn

Statistical Test-
Chi-Square Test
'''

In [85]:
contingency_support = pd.crosstab(
    df["Premium Tech Support"],
    df["Churn Label"]
)

chi2, p_value, _, _ = stats.chi2_contingency(contingency_support)

chi2, p_value


(190.16684201526067, 2.9235674453140758e-43)

In [13]:
n = contingency_support.sum().sum()
cramers_v = np.sqrt(chi2 / (n * (min(contingency_support.shape)-1)))
cramers_v


0.1643192912540774

In [None]:
# Although statistically significant, the small effect size suggests limited practical impact, which is important when making real business decisions.

In [None]:
'''Premium tech support has a statistically significant association with churn.
Customers without support are more likely to leave, suggesting bundling support
services as a retention strategy.
'''

In [None]:
## Summary of Statistical Findings
'''
- Contract type has a significant impact on churn
- Lower satisfaction scores strongly predict churn
- Higher monthly charges increase churn likelihood
- Early-tenure customers are significantly more vulnerable
- Premium tech support reduces churn risk

These results statistically validate the patterns observed during EDA and
provide strong evidence for data-driven retention strategies.
'''