# Customer Churn Prediction Analysis

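This notebook contains the complete pipeline for predicting customer churn for a telecom provider: environment setup, data loading and cleaning, preprocessing and feature engineering, model training with hyperparameter tuning, evaluation, and saving the artifacts used by the dashboard.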

## 1. Environment Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, f1_score

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'seaborn-v0_8' matplotlib style sheet to every figure drawn below.
plt.style.use('seaborn-v0_8')
# Location of the Telco churn CSV read by the data-loading cell. The path is
# relative, so it is resolved against the kernel's current working directory.
DATA_PATH = '../data/WA_Fn-UseC_-Telco-Customer-Churn.csv'

## 2. Data Loading & Cleaning

In [None]:
# Read the raw churn CSV and report its dimensions before any column is removed.
df = pd.read_csv(DATA_PATH)
print(f"Initial Shape: {df.shape}")

# Remove the columns that are not used as model inputs:
#   - customerID:   dropped from the feature set
#   - TotalCharges: dropped from the feature set
# NOTE(review): the reason for discarding TotalCharges is not stated in this
# notebook -- confirm it is intentional rather than a workaround.
df = df.drop(columns=['customerID', 'TotalCharges'])

# Preview the first rows of the cleaned frame (rich display as the cell output).
df.head()

## 3. Preprocessing & Feature Engineering

In [None]:
# Preprocessing pipeline: encode the target, one-hot encode the categorical
# columns, split into train/test, and only then standardise the numeric columns.
#
# The scaler is fitted on the training split ONLY. Fitting it on the full
# dataset before splitting would let test-set means/variances influence the
# training features (data leakage) and bias the held-out evaluation.

# Encode the Churn target as integer class codes.
le = LabelEncoder()
df['Churn'] = le.fit_transform(df['Churn'])

numeric_cols = ['tenure', 'MonthlyCharges']
# Collected after Churn has been encoded, so the target is not picked up here.
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# One-hot encode the categoricals; drop_first=True omits one level per column.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
feature_names = df.drop('Churn', axis=1).columns.tolist()

# X and y hold the encoded but still UNSCALED data; scaling is applied per split.
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Work on explicit copies so the scaled values are not written into slices of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn the scaling statistics from the training rows, then apply the same
# transformation to the test rows.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

print(f"Feature Count: {len(feature_names)}")

## 4. Model Training & Optimization
Hyperparameter tuning via 5-fold cross-validated grid search (optimizing the F1 score) for both the Logistic Regression and Decision Tree models.

In [None]:
# --- Logistic Regression: search over the inverse regularisation strength C ---
lr_param_grid = {'C': [0.01, 0.1, 1, 10]}
lr_grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=lr_param_grid,
    cv=5,
    scoring='f1',
)
lr_grid.fit(X_train, y_train)
# Keep the estimator refitted with the best-scoring parameter combination.
lr_model = lr_grid.best_estimator_

# --- Decision Tree: search over tree depth and the minimum split size ---
dt_param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
}
dt_grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=dt_param_grid,
    cv=5,
    scoring='f1',
)
dt_grid.fit(X_train, y_train)
dt_model = dt_grid.best_estimator_

print("Training Complete.")

## 5. Evaluation & Results

In [None]:
# Evaluate each tuned model on the held-out test split.
models = {
    'Logistic Regression': lr_model,
    'Decision Tree': dt_model,
}
for name, model in models.items():
    # Hard class predictions feed the per-class precision/recall/F1 report.
    y_pred = model.predict(X_test)
    print(f"\n--- {name} ---")
    print(classification_report(y_test, y_pred))

    # ROC-AUC is computed from the predicted probability in column 1 of
    # predict_proba, i.e. the probability of the class encoded as 1.
    positive_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, positive_proba)
    print(f"ROC-AUC: {auc:.4f}")

## 6. Saving Artifacts for Dashboard

In [None]:
# Persist the fitted models and preprocessing objects with joblib.
# The target directory is relative to the kernel's current working directory.
os.makedirs('models', exist_ok=True)

# Destination path -> object to serialise; written in this order.
artifacts = {
    'models/logistic_regression.pkl': lr_model,
    'models/decision_tree.pkl': dt_model,
    'models/scaler.pkl': scaler,
    'models/feature_names.pkl': feature_names,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

# NOTE(review): the message below names 'notebooks/models/', which is only
# accurate when the working directory is 'notebooks/' -- confirm.
print("Artifacts saved in notebooks/models/")