# Brendon's Churn Analysis

# Telco Customer Churn — Data Analysis Notebook

This notebook examines a telco customer churn dataset to identify trends, correlations, and drivers of churn, and to produce a simple short-term forecast of churn rates by customer tenure. The dataset contains customer demographics, account information, services subscribed, and whether the customer churned. Key goals: understand which features most strongly relate to churn, build interpretable predictive models, visualize relationships, and produce a pragmatic short-term churn projection useful for a hiring portfolio.

Primary observations to explore:

- Basic distribution of churn and tenure.
- How contract type, monthly charges, total charges and services (streaming, internet, tech support) relate to churn.
- Multicollinearity and correlations between numerical features.
- Feature importance from logistic regression (coefficients) and a tree-based model.
- A simple forecasting exercise: compute churn rate by tenure month, fit a trend model, and project churn rate for the next 6 tenure-months as an illustrative forecast.


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
# Location of the raw churn CSV.
# NOTE(review): this is an absolute, environment-specific path — it must be
# adjusted (or made configurable) before the notebook can run elsewhere.
data_path = Path('/mnt/data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Load the file into `df`, the working frame that the following cells read and modify.
df = pd.read_csv(data_path)

In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

In [None]:
# Preview the first rows to eyeball column names and raw values.
df.head()

In [None]:
# Per-column dtype and non-null count; helps spot numeric columns that were parsed as text.
df.info()

In [None]:
# Parse TotalCharges as a number; anything that cannot be parsed becomes NaN
# (errors='coerce') instead of raising.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

# Store the SeniorCitizen flag as a plain integer column.
df['SeniorCitizen'] = df['SeniorCitizen'].astype(int)

# NaN count per column; show only the columns that have at least one.
missing = df.isna().sum()
missing.loc[missing > 0]

In [None]:
# Remove every row that still contains a missing value, renumber the index
# from 0, and add the binary target: 1 where Churn is 'Yes', otherwise 0.
df = (
    df.dropna()
    .reset_index(drop=True)
    .assign(ChurnFlag=lambda frame: frame['Churn'].eq('Yes').astype(int))
)

# Summary statistics for all columns, transposed so each column is one row.
df.describe(include='all').T

In [None]:
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 100

# Mean of the 0/1 flag is the share of customers who churned.
churn_rate = df['ChurnFlag'].mean()

bar_labels = ['Stayed', 'Churned']
bar_heights = [1 - churn_rate, churn_rate]

fig, ax = plt.subplots()
ax.bar(bar_labels, bar_heights)
ax.set(title='Overall Churn Rate', ylabel='Proportion')
plt.show()

In [None]:
# Continuous account features to correlate with each other and with the churn flag.
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

corr_inputs = df[[*num_cols, 'ChurnFlag']]
corr = corr_inputs.corr()
corr

In [None]:
# Draw the correlation table as a colour-coded grid with a colour scale.
fig, ax = plt.subplots()
cax = ax.matshow(corr)
fig.colorbar(cax)

# One tick per variable on each axis, labelled with the variable name.
n_rows, n_cols = corr.shape
ax.set_xticks(range(n_cols))
ax.set_yticks(range(n_rows))
ax.set_xticklabels(corr.columns, rotation=45, ha='left')
ax.set_yticklabels(corr.index)

ax.set_title('Correlation matrix (numeric)')
plt.show()

In [None]:
# Average of the 0/1 churn flag within each contract type, highest rate first.
group = (
    df.groupby('Contract')['ChurnFlag']
    .mean()
    .sort_values(ascending=False)
)
group

In [None]:
# One bar per contract type, in the order produced by the sorted `group` series.
fig, ax = plt.subplots()
ax.bar(group.index, group.values)
ax.set(ylabel='Churn Rate', title='Churn Rate by Contract Type')
plt.show()

In [None]:
# Object-dtype columns, leaving out the customer identifier and the raw target label.
excluded_cols = ['customerID', 'Churn']
cat_cols = [
    c
    for c in df.columns
    if c not in excluded_cols and df[c].dtype == 'object'
]
cat_cols[:10]

In [None]:
# Drop the identifier and the text target, then one-hot encode what is left.
# drop_first=True omits one level per categorical column.
model_frame = df.drop(columns=['customerID','Churn'])
df_encoded = pd.get_dummies(model_frame, drop_first=True)
df_encoded.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report

# Target is the 0/1 churn flag; features are every other encoded column.
y = df_encoded['ChurnFlag']
X = df_encoded.drop(columns=['ChurnFlag'])

# 75/25 split, stratified on the target so both parts keep the same churn share.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Logistic regression baseline.
lr = LogisticRegression(max_iter=1000, solver='liblinear')
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
y_proba_lr = lr.predict_proba(X_test)[:, 1]  # column 1 = probability of class 1 (churn)

# Random forest with a fixed seed for repeatable results.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

# Accuracy uses the hard predictions; ROC AUC uses the predicted churn probabilities.
metrics = {
    'logistic_accuracy': accuracy_score(y_test, y_pred_lr),
    'logistic_roc_auc': roc_auc_score(y_test, y_proba_lr),
    'rf_accuracy': accuracy_score(y_test, y_pred_rf),
    'rf_roc_auc': roc_auc_score(y_test, y_proba_rf),
}
metrics

In [None]:
from sklearn.preprocessing import StandardScaler
# Continuous features to standardise. Without this, their coefficients are in
# raw units (months, currency) and cannot be ranked against the 0/1 dummy
# coefficients by magnitude.
scale_cols = ['tenure','MonthlyCharges','TotalCharges']
num_ix = [i for i, c in enumerate(X.columns) if c in scale_cols]  # positions of the scaled columns in X

# Learn mean/std from the training rows only, so no test-set information
# reaches the model, then apply the same transform to every row.
scaler = StandardScaler()
scaler.fit(X_train[scale_cols])
X_scaled = X.copy()
X_scaled[scale_cols] = scaler.transform(X[scale_cols])

# Refit a logistic regression on the standardised training rows. This is a
# separate model: `lr` is left untouched, so the metrics reported for it still
# describe the model that produced them.
lr_scaled = LogisticRegression(max_iter=1000, solver='liblinear')
lr_scaled.fit(X_scaled.loc[X_train.index], y_train)

# Twenty largest coefficients by absolute value, sign preserved.
coef_df = (
    pd.Series(lr_scaled.coef_[0], index=X_scaled.columns)
    .sort_values(key=abs, ascending=False)
    .head(20)
)
coef_df

In [None]:
# Random-forest importance per feature, labelled by column name;
# keep the twenty highest.
all_importances = pd.Series(rf.feature_importances_, index=X.columns)
imp = all_importances.sort_values(ascending=False).iloc[:20]
imp

In [None]:
# Bar chart of the coefficients selected in `coef_df` (signs preserved).
fig, ax = plt.subplots(figsize=(8, 6))
coef_df.plot(kind='bar', ax=ax)
ax.set_title('Top 20 logistic regression coefficients (abs sorted)')
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of the importances selected in `imp`, largest first.
fig, ax = plt.subplots(figsize=(8, 6))
imp.plot(kind='bar', ax=ax)
ax.set_title('Top 20 Random Forest feature importances')
fig.tight_layout()
plt.show()

In [None]:
# Observed churn rate (mean of the 0/1 flag) for each tenure value,
# as a two-column frame ordered by tenure.
by_tenure = (
    df.groupby('tenure')['ChurnFlag']
    .mean()
    .reset_index()
    .sort_values('tenure')
)
by_tenure.head()

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a straight line to the observed churn rate as a function of tenure.
X_t = by_tenure[['tenure']].values  # 2-D, as the estimator expects
y_t = by_tenure['ChurnFlag'].values
model_trend = LinearRegression()
model_trend.fit(X_t, y_t)

# The six tenure values immediately after the largest observed one.
last_tenure = by_tenure['tenure'].max()
future_tenure = np.arange(last_tenure + 1, last_tenure + 7).reshape(-1, 1)

# A straight line is unbounded, so extrapolating it can produce values below 0
# or above 1. Clip so every projected value is a valid rate.
proj = np.clip(model_trend.predict(future_tenure), 0.0, 1.0)

fig, ax = plt.subplots()
ax.plot(by_tenure['tenure'], by_tenure['ChurnFlag'], label='Observed churn rate by tenure')
ax.plot(future_tenure.flatten(), proj, linestyle='--', label='Projected churn rate')
ax.set_xlabel('Tenure (months)')
ax.set_ylabel('Churn rate')
ax.set_title('Churn rate by tenure with linear projection')
ax.legend()
plt.show()

# Projected values as a table, one row per future tenure value.
pd.DataFrame({'tenure': future_tenure.flatten(), 'projected_churn_rate': proj})

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the random forest's hard predictions on the held-out set.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)
cm