**Step 1 — Import Libraries**

In [None]:
import pandas as pd
import numpy as np

# Visualization (optional)
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Save model
import joblib

**Step 2 — Load Dataset**

In [None]:
# Read the Telco customer-churn CSV (expected in the working directory)
# into the DataFrame that every later cell operates on.
df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")

**Step 3 — Data Cleaning**

In [None]:
# Drop the customerID column in place (not useful for modelling).
df.drop(columns=["customerID"], inplace=True)

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed are coerced
# to NaN instead of raising.
df["TotalCharges"] = pd.to_numeric(arg=df["TotalCharges"], errors="coerce")

# Remove, in place, every row that contains a NaN in any column. This
# includes the rows whose TotalCharges could not be parsed above.
df.dropna(axis=0, how="any", inplace=True)

**Step 4 — Encode Target Variable**

In [None]:
# Binary-encode the target column. Values other than "Yes"/"No" have no
# entry in the mapping and therefore become NaN.
df["Churn"] = df["Churn"].map(
    {
        "No": 0,
        "Yes": 1,
    }
)

**Step 5 — Split Features & Target**

In [None]:
# The encoded Churn column is the target; all remaining columns are features.
y = df["Churn"]
X = df.drop(columns="Churn")

**Step 6 — Identify Column Types**

In [None]:
# Columns stored with object dtype are treated as categorical features;
# every other column is treated as numerical.
categorical_cols = X.select_dtypes(include="object").columns
numerical_cols = X.select_dtypes(exclude="object").columns

**Step 7 — Preprocessing Pipeline**

In [None]:
# One ColumnTransformer handles both column groups so that scaling and
# encoding are fitted as part of whichever pipeline embeds it:
#   "num" - standardize the numerical columns
#   "cat" - one-hot encode the categorical columns; categories not seen
#           during fit are ignored at transform time rather than raising
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
])

**Step 8 — Train Test Split**

In [None]:
# Hold out 20% of the rows for final evaluation.
# stratify=y keeps the proportion of churn / no-churn labels the same in the
# train and test partitions, so the hold-out metrics are not skewed by an
# unlucky class mix. random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

**Step 9 — Create Pipeline**

In [None]:
# Logistic-regression pipeline: the shared preprocessing step followed by
# the classifier. max_iter is raised to 1000 iterations.
log_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", LogisticRegression(max_iter=1000)),
])

**Step 10 — GridSearchCV (Hyperparameter Tuning)**

In [None]:
# Hyperparameter candidates for the logistic-regression step. The "model__"
# prefix routes each parameter to the pipeline step named "model".
param_grid_log = {
    "model__C": [0.01, 0.1, 1, 10],
    "model__solver": ["liblinear", "lbfgs"],
}

# Exhaustive search over the grid with 5-fold cross-validation, scored by
# accuracy, using all available cores.
grid_log = GridSearchCV(
    estimator=log_pipeline,
    param_grid=param_grid_log,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

# Fit on the training partition only; the test partition stays untouched.
grid_log.fit(X_train, y_train)

**Step 11 — Evaluation (Logistic Regression)**

In [None]:
# Predict the held-out rows with the tuned logistic-regression search object.
y_pred_log = grid_log.predict(X=X_test)

# Report which hyperparameter combination the search selected.
print("Best Params:", grid_log.best_params_)

In [None]:
# Fraction of held-out rows the logistic-regression model labelled correctly.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_log))

In [None]:
# Per-class text report for the logistic-regression predictions.
print(classification_report(y_true=y_test, y_pred=y_pred_log))

**Step 12 — Random Forest Pipeline**

In [None]:
# Random-forest pipeline: the same preprocessing step as the logistic
# pipeline, followed by a forest with a fixed seed.
rf_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", RandomForestClassifier(random_state=42)),
])

**Step 13 — GridSearch for Random Forest**

In [None]:
# Hyperparameter candidates for the random-forest step. The "model__" prefix
# routes each parameter to the pipeline step named "model".
param_grid_rf = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5],
}

# Exhaustive search over the grid with 5-fold cross-validation, scored by
# accuracy, using all available cores.
grid_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid_rf,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

# Fit on the training partition only; the test partition stays untouched.
grid_rf.fit(X_train, y_train)

**Step 14 — Evaluation (Random Forest)**

In [None]:
# Predict the held-out rows with the tuned random-forest search object.
y_pred_rf = grid_rf.predict(X=X_test)

# Report which hyperparameter combination the search selected.
print("Best Params:", grid_rf.best_params_)

In [None]:
# Fraction of held-out rows the random-forest model labelled correctly.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_rf))

In [None]:
# Per-class text report for the random-forest predictions.
print(classification_report(y_true=y_test, y_pred=y_pred_rf))

**Step 15 — Export Final Pipeline (Production Ready)**

In [None]:
# Persist the best random-forest pipeline (preprocessing + model) found by
# the grid search, so it can be reloaded and applied directly to raw feature
# frames.
joblib.dump(value=grid_rf.best_estimator_, filename="churn_pipeline.pkl")

**Step 16 — Load & Predict Later (Reuse)**

In [None]:
# Reload the persisted pipeline from disk.
# NOTE(review): joblib files are pickle-based; only load files you trust.
model = joblib.load("churn_pipeline.pkl")

# Smoke-test the reloaded pipeline on the first five held-out rows.
sample_prediction = model.predict(X_test.head(5))
print(sample_prediction)