# Telecom Customer Churn Analysis Notebook

## --- 1. Import Libraries ---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

## --- 2. Load Data ---

In [None]:
# Read the raw churn extract; the path is relative to the notebook's working directory.
data_path = "../raw_data/telecom_churn.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Print the (rows, columns) tuple, then leave a preview as the cell's output.
n_preview_rows = 5  # same as DataFrame.head's default
print("Shape of dataset:", df.shape)
df.head(n_preview_rows)

## --- 3. Data Cleaning ---

In [None]:
# Count the missing entries in each column and print the per-column totals.
missing_counts = df.isna().sum()
print(missing_counts)

In [None]:
# Remove rows that duplicate an earlier row across all columns, keeping the
# first occurrence; df is modified in place.
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [None]:
# Handle missing values: numeric columns get their median, text columns their
# most frequent value.
# 'number' covers every numeric dtype (int32, float32, ...), not only the
# 64-bit ones.
for col in df.select_dtypes(include='number').columns:
    df[col] = df[col].fillna(df[col].median())

for col in df.select_dtypes(include=['object']).columns:
    modes = df[col].mode()
    # mode() is empty when the whole column is missing; there is nothing to
    # fill with in that case, so leave the column untouched instead of raising.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# NOTE: df is deliberately NOT one-hot encoded here. The EDA cells and the
# prediction export below read the original 'Churn', 'Contract' and
# 'customerID' columns, which get_dummies would replace with indicator
# columns. Encoding happens in the Feature Engineering section, into
# df_encoded.

## --- 4. Exploratory Data Analysis (EDA) ---

In [None]:
# Bar chart of how many rows fall into each value of the Churn column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Churn', data=df, ax=ax)
ax.set_title("Churn Distribution")
plt.show()

In [None]:
# Row counts per Contract value, split into one bar per Churn value.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Contract', hue='Churn', data=df, ax=ax)
ax.set_title("Churn by Contract Type")
plt.show()


In [None]:
# Correlation heatmap (numeric features)
# Select numeric/boolean columns explicitly: DataFrame.corr() raises on string
# columns in pandas >= 2.0 instead of silently skipping them.
numeric_df = df.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(10,6))
sns.heatmap(numeric_df.corr(), annot=False, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()


## --- 5. Feature Engineering ---

In [None]:
# Convert categorical to dummy variables.
# The customer identifier is dropped first: it labels a row rather than
# describing it (presumably unique per customer), so encoding it would add a
# flood of indicator columns that carry no generalizable signal.
# errors='ignore' makes the drop a no-op when the column is absent.
df_encoded = pd.get_dummies(
    df.drop(columns=['customerID'], errors='ignore'),
    drop_first=True,
)


In [None]:
# Separate the target indicator from the predictor columns.
target_col = 'Churn_Yes'
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])


In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Standardize every feature column. The scaler is fitted on the training split
# only and then applied to both splits, so no test-set statistics leak into
# training. X_train / X_test are rebound to the transformed output.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


## --- 6. Model Training ---

In [None]:
# Logistic Regression: fit on the scaled training split (max_iter raised to
# 1000), then predict labels for the test split.
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
lr_report = classification_report(y_test, y_pred_lr)
print("Logistic Regression Report:")
print(lr_report)

In [None]:
# Random Forest Classifier: 100 trees with a fixed seed for repeatability,
# fitted on the same scaled training split as the logistic regression.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Report:")
print(rf_report)

## --- 7. Evaluation ---

In [None]:
# Confusion matrix for the Random Forest test predictions, drawn as an
# annotated heatmap with integer cell labels.
cm = confusion_matrix(y_test, y_pred_rf)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_title("Confusion Matrix - Random Forest")
plt.show()

In [None]:
# ROC curve and ROC-AUC for the Random Forest.
# Score the test split once and reuse the second predict_proba column for both
# the curve and the AUC, instead of running the forest twice.
rf_test_proba = rf.predict_proba(X_test)[:, 1]

# The thresholds are not used; a named throwaway avoids rebinding `_`, which
# IPython uses for the last cell result.
fpr, tpr, _thresholds = roc_curve(y_test, rf_test_proba)
plt.plot(fpr, tpr, label='Random Forest')
plt.plot([0,1],[0,1],'k--')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

print("Random Forest ROC-AUC:", roc_auc_score(y_test, rf_test_proba))

## --- 8. Insights & Recommendations ---

In [None]:
# - Customers on month-to-month contracts churn more than those on longer-term contracts.
# - Longer tenure is associated with lower churn risk.
# - Customers with high monthly charges are more likely to churn.
# - Recommendations: loyalty programs, bundle discounts, and personalized offers.

## --- 9. Next Steps ---

In [None]:
# Export model predictions for Power BI dashboard
import os

# The forest was fitted on standardized features, so the full feature matrix
# must go through the same fitted scaler before scoring. Passing raw X would
# compare unscaled values against split thresholds learned in scaled space.
X_scaled = scaler.transform(X)

# NOTE: this scores every row, including those the model was trained on, so
# the exported file is for reporting and not for measuring model accuracy.
predictions = pd.DataFrame({
    'CustomerID': df['customerID'],
    'ActualChurn': y,
    'PredictedChurn_RF': rf.predict(X_scaled),
})

output_path = "../data/processed/churn_predictions.csv"
# to_csv does not create missing parent directories, so create them first.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
predictions.to_csv(output_path, index=False)

print(f"Predictions exported to {output_path}")