
# Customer Churn Prediction — Live Project (Shiksha Skills Institute)

**Goal:** Build an end-to-end machine learning pipeline to predict customer churn for a telecom company and deploy a simple web app.

**You will:**
1. Load and clean the Telco Customer Churn dataset.
2. Explore data and visualize key patterns.
3. Build and evaluate ML models (Logistic Regression, Random Forest).
4. Create and persist a production-ready pipeline.
5. (Optional) Run a Streamlit app for live predictions.

---

## 0. Setup
> Before running:
> - Place the dataset CSV in this folder.
> - Common filenames (from Kaggle): `Telco-Customer-Churn.csv` OR `WA_Fn-UseC_-Telco-Customer-Churn.csv`.
> - Install dependencies from `requirements.txt` if needed.


In [None]:

# Core libs
import os
import numpy as np
import pandas as pd

# Visualization (No seaborn, per project constraints)
import matplotlib.pyplot as plt

# Modeling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import joblib

# Notebook display tweak: let pandas show up to 100 columns before it starts
# eliding the middle of wide frames.
pd.options.display.max_columns = 100
print("Setup complete.")



## 1. Load Data
We'll try to load the dataset from one of two common filenames. If neither file is found, the cell raises a `FileNotFoundError` that explains where to place the CSV.


In [None]:

# Filenames under which the Telco churn CSV is commonly distributed,
# listed in the order they are tried.
candidate_files = [
    "Telco-Customer-Churn.csv",
    "WA_Fn-UseC_-Telco-Customer-Churn.csv"
]

# First candidate present in the working directory, or None when neither exists.
csv_path = next(
    (name for name in candidate_files if os.path.exists(name)),
    None,
)

# Fail early with instructions instead of letting read_csv raise a bare path error.
if csv_path is None:
    raise FileNotFoundError(
        "Dataset not found. Please place 'Telco-Customer-Churn.csv' or "
        "'WA_Fn-UseC_-Telco-Customer-Churn.csv' in this folder and rerun."
    )

df = pd.read_csv(csv_path)
print(f"Loaded: {csv_path}, shape={df.shape}")

# Last expression of the cell: the notebook renders the first rows as a preview.
df.head()



## 2. Quick Data Audit & Cleaning
- Fix types (e.g., `TotalCharges` sometimes comes as string).
- Handle missing values.
- Strip whitespace in column names and string entries.


In [None]:

# Standardize column names: trim surrounding whitespace, spaces -> underscores
df.columns = [c.strip().replace(' ', '_') for c in df.columns]

# Strip whitespace in text columns, leaving missing cells untouched.
# A blanket astype(str) would turn NaN into the literal string 'nan', which
# hides those cells from the missing-value report below and from imputation.
for c in df.select_dtypes(include=['object']).columns:
    stripped = df[c].astype(str).str.strip()
    df[c] = df[c].where(df[c].isna(), stripped)

# Convert TotalCharges to numeric if present.
# errors='coerce' turns unparseable entries (e.g. blank strings) into NaN so
# they show up in the missing-value report instead of raising.
if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Basic info
# df.info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() would render a stray "None".
df.info()
display(df.describe(include='all'))

# Missing values: per-column NaN counts, largest first, showing only columns
# that actually have gaps
na_counts = df.isna().sum().sort_values(ascending=False)
print("Missing values per column:\n", na_counts[na_counts>0])


In [None]:

# Simple imputation strategies:
# - For numeric columns: fill with median
# - For categorical columns: fill with most frequent (mode)
# - The label column is never imputed: filling a missing target with the
#   majority class would fabricate training labels.
#
# NOTE(review): statistics are computed on the full dataset, i.e. before the
# train/test split, so a little test-set information leaks into training.
# Moving imputation into the modeling pipeline would avoid that.

label_cols = [c for c in df.columns if c.lower() == 'churn']

num_cols = [
    c for c in df.select_dtypes(include=[np.number]).columns
    if c not in label_cols
]
cat_cols = [
    c for c in df.select_dtypes(include=['object']).columns
    if c not in label_cols
]

for c in num_cols:
    df[c] = df[c].fillna(df[c].median())

for c in cat_cols:
    modes = df[c].mode()
    if modes.empty:
        # Column has no observed values at all, so there is no mode to fill
        # with; leave it as-is instead of failing on modes[0].
        continue
    df[c] = df[c].fillna(modes.iloc[0])

print("After imputation:", df.isna().sum().sum(), "missing values remaining.")



## 3. Exploratory Data Analysis (EDA)
We'll look at churn distribution and key drivers.


In [None]:

# Locate the target column case-insensitively ('Churn', 'churn', ...)
target_col = next((c for c in df.columns if c.lower() == 'churn'), None)

if target_col is None:
    raise KeyError("Target column 'Churn' not found. Please ensure the dataset has a 'Churn' column.")

# Map Yes/No to 1/0 if needed
if df[target_col].dtype == 'object':
    raw_labels = df[target_col]
    encoded = raw_labels.map({'Yes': 1, 'No': 0})
    # Labels already stored as numeric text (e.g. '1' / '0') are accepted too.
    encoded = encoded.fillna(pd.to_numeric(raw_labels, errors='coerce'))

    # Anything that is still not 0/1 (missing, 'nan', typos, other classes)
    # is reported explicitly rather than surfacing as a cryptic int() error.
    unrecognized = ~encoded.isin([0, 1])
    if unrecognized.any():
        bad_values = sorted(raw_labels[unrecognized].astype(str).unique())
        raise ValueError(
            f"Unrecognized values in target column '{target_col}': {bad_values}. "
            "Expected 'Yes'/'No' (or 0/1)."
        )
    df[target_col] = encoded.astype(int)

# Basic churn rate: with a 0/1 target the mean is the share of churners
churn_rate = df[target_col].mean()
print(f"Churn rate: {churn_rate:.3f}")

# Plot churn count (sort_index keeps the bars in 0, 1 order)
plt.figure()
df[target_col].value_counts().sort_index().plot(kind='bar')
plt.title('Churn Count (0=No, 1=Yes)')
plt.xlabel('Churn')
plt.ylabel('Count')
plt.show()


In [None]:

# Example: relationship between Contract and Churn (if Contract exists)
if 'Contract' in df.columns:
    # normalize='index' makes each contract type's row sum to 1, so the cells
    # are within-contract proportions of churn=0 / churn=1.
    cross = pd.crosstab(df['Contract'], df[target_col], normalize='index')
    display(cross)

    # DataFrame.plot opens its own figure unless it is handed an Axes, so a bare
    # plt.figure() beforehand would leave an empty extra figure in the output.
    fig, ax = plt.subplots()
    cross.plot(kind='bar', rot=45, ax=ax)
    ax.set_title('Churn Rate by Contract Type')
    ax.set_xlabel('Contract Type')
    ax.set_ylabel('Proportion')
    ax.legend(title='Churn')
    plt.show()

# Example: tenure distribution by churn (if the 'tenure' column exists)
if 'tenure' in df.columns:
    fig, ax = plt.subplots()
    # Both histograms share one Axes; labels + legend make the two overlaid
    # groups distinguishable.
    df.loc[df[target_col] == 0, 'tenure'].plot(
        kind='hist', alpha=0.5, bins=30, ax=ax, label='Churn = 0 (No)'
    )
    df.loc[df[target_col] == 1, 'tenure'].plot(
        kind='hist', alpha=0.5, bins=30, ax=ax, label='Churn = 1 (Yes)'
    )
    ax.set_title('Tenure Distribution by Churn')
    ax.set_xlabel('Tenure (months)')
    ax.set_ylabel('Frequency')
    ax.legend()
    plt.show()



## 4. Train/Test Split & Preprocessing
We will:
- Split the data
- One-hot encode categorical features
- Scale numeric features (for linear models)
- Build pipelines for Logistic Regression and Random Forest


In [None]:

# Define X, y
X = df.drop(columns=[target_col])
y = df[target_col]

# Row identifiers (e.g. customerID) are unique per customer. One-hot encoding
# them would add one nearly-empty column per training row and let the models
# key on individual customers instead of real signal. They stay in X (handy for
# tracing a prediction back to a customer) but are left out of the feature
# lists, so the ColumnTransformer below discards them (remainder='drop' is its
# default).
id_cols = [c for c in X.columns if c.lower() in ('customerid', 'customer_id')]

# Identify column types (identifiers excluded)
num_cols = [
    c for c in X.select_dtypes(include=[np.number]).columns
    if c not in id_cols
]
cat_cols = [
    c for c in X.select_dtypes(exclude=[np.number]).columns
    if c not in id_cols
]

# stratify=y keeps the churn ratio the same in the train and test parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


def _make_preprocess():
    """Return a fresh, unfitted preprocessor: scale numerics, one-hot encode categoricals."""
    return ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_cols),
            # handle_unknown='ignore': categories unseen during fit encode as
            # all zeros at predict time instead of raising
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
        ]
    )


# Preprocess: OneHot for categoricals, Scale numerics (benefits LR)
preprocess = _make_preprocess()

# Pipelines
# Each pipeline gets its own preprocessor instance. Pipeline.fit fits its steps
# in place, so a shared instance would be refit (and overwritten) whenever the
# other pipeline is trained.
pipe_lr = Pipeline(steps=[
    ('preprocess', preprocess),
    ('clf', LogisticRegression(max_iter=200))
])

pipe_rf = Pipeline(steps=[
    ('preprocess', _make_preprocess()),
    ('clf', RandomForestClassifier(n_estimators=300, random_state=42))
])



## 5. Train & Evaluate Models
We'll compare Logistic Regression and Random Forest on:
- Accuracy, Precision, Recall, F1
- ROC AUC
- Confusion Matrix


In [None]:

def _show_confusion_matrix(matrix, name):
    """Draw `matrix` as a heat map with the count written in every cell."""
    plt.figure()
    plt.imshow(matrix, cmap='Blues')
    plt.title(f'Confusion Matrix — {name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    # ndenumerate yields ((row, col), value); text() takes x (= col) first.
    for (row, col), count in np.ndenumerate(matrix):
        plt.text(col, row, str(count), ha='center', va='center')
    plt.show()


def evaluate_model(pipe, name):
    """Fit `pipe` on the training split and report its test-split performance.

    Uses the notebook-level X_train / y_train / X_test / y_test. Prints a
    classification report, the ROC AUC plus ROC curve (only when the pipeline
    exposes predict_proba), and the confusion matrix as text and as a plot.

    Parameters:
        pipe: estimator/pipeline with fit and predict; fitted in place.
        name: label used in the printed header and the plot titles.

    Returns:
        The same `pipe` object, now fitted.
    """
    pipe.fit(X_train, y_train)
    predicted = pipe.predict(X_test)

    # Column 1 of predict_proba is used as the churn score when available.
    supports_proba = hasattr(pipe, "predict_proba")
    scores = pipe.predict_proba(X_test)[:, 1] if supports_proba else None

    print(f"\n=== {name} ===")
    print(classification_report(y_test, predicted, digits=3))

    if scores is not None:
        auc = roc_auc_score(y_test, scores)
        print(f"ROC AUC: {auc:.3f}")
        RocCurveDisplay.from_predictions(y_test, scores)
        plt.title(f'ROC Curve — {name}')
        plt.show()

    cm = confusion_matrix(y_test, predicted)
    print("Confusion Matrix:\n", cm)
    _show_confusion_matrix(cm, name)

    return pipe


# Train and evaluate both candidates on the same split
model_lr = evaluate_model(pipe_lr, "Logistic Regression")
model_rf = evaluate_model(pipe_rf, "Random Forest")



## 6. Select & Persist the Best Pipeline
For simplicity, we will persist the Random Forest pipeline (often stronger on tabular data).  
This saved file can be used directly by the Streamlit app.


In [None]:

# Output file for the serialized pipeline (relative to the working directory)
save_path = "churn_pipeline.pkl"

# The Random Forest pipeline is the one persisted here; assign model_lr instead
# if it scored better in the evaluation above.
best_pipeline = model_rf

# The whole fitted pipeline is dumped as one object, so loading this file gives
# back preprocessing and classifier together.
joblib.dump(best_pipeline, save_path)
print(f"Saved trained pipeline to {save_path}")



## 7. Try a Single Prediction (Sanity Check)
Create a single-row DataFrame mimicking a customer record and predict churn probability.


In [None]:

# Use the first held-out row as a one-row DataFrame so the record has exactly
# the columns the pipeline was trained on; copy() keeps X_test untouched if the
# sample is edited afterwards.
sample = X_test.head(1).copy()

# Predict: probability from column 1 of predict_proba, plus the hard label
class_probabilities = best_pipeline.predict_proba(sample)
proba = class_probabilities[0, 1]
pred = best_pipeline.predict(sample)[0]
print("Sample prediction:", pred, " | Probability of churn:", round(proba, 3))

# Last expression of the cell: the notebook renders the record that was scored.
sample



## 8. Next Steps
- Iterate on feature engineering (e.g., new features like tenure buckets, TotalSpend).
- Hyperparameter tuning (GridSearchCV) for further gains.
- Ship the Streamlit app using the `churn_pipeline.pkl` saved here.
