In [3]:
# run once in your terminal / notebook cell
!pip install -q scikit-learn pandas joblib

In [4]:
# churn_pipeline.py  (PART 1)
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import warnings
# Installs a blanket "ignore" filter: no category or module is given, so every
# warning raised in this kernel from here on is suppressed.
warnings.filterwarnings("ignore")

In [5]:
# Absolute path of the raw Telco customer-churn CSV.
CSV_PATH = "/content/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Read the file into a DataFrame, report its (rows, columns), and end the cell
# on a preview of the first five records so the notebook renders them.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)
print("Shape:", df.shape)
df.head(n=5)

Shape: (7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# churn_pipeline.py  (PART 3)
# Clean the loaded frame. Each step assigns its result instead of mutating in
# place, so the cell can be re-run safely and does not rely on pandas'
# chained-assignment behaviour (an in-place fillna on `df["col"]` is not
# guaranteed to write back into `df`).

# The customer identifier is dropped; errors="ignore" keeps the cell
# re-runnable once the column is already gone.
df = df.drop(columns="customerID", errors="ignore")

# errors="coerce" turns any entry that cannot be parsed as a number into NaN;
# those gaps are then filled with the column median.
# NOTE(review): the median is computed on the full dataset, i.e. before the
# train/test split, so held-out rows contribute to the imputed value.
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges.fillna(total_charges.median())

# Encode target. The identity entries (1 -> 1, 0 -> 0) make the mapping
# idempotent: re-running the cell on the already encoded column leaves it
# unchanged instead of turning every value into NaN.
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0, 1: 1, 0: 0})

In [7]:
# churn_pipeline.py  (PART 4)
# Separate the Churn target from the remaining feature columns.
y = df["Churn"]
X = df.drop(columns=["Churn"])

# Train-test split: 20% of the rows are held out, stratified on the target,
# with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# churn_pipeline.py  (PART 5)
# Partition the feature columns by dtype: int64/float64 columns are treated as
# numeric, object/category columns as categorical.
numeric_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
categorical_cols = list(X.select_dtypes(include=["object", "category"]).columns)

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

In [17]:
# Fit on the training data
preprocessor.fit(X_train)

# Build list of final feature names. ColumnTransformer lays out its output in
# the order the transformers were declared ("num" first, then "cat"), so the
# numeric names come first, followed by the one-hot names.
ohe = preprocessor.named_transformers_["cat"]
ohe_names = ohe.get_feature_names_out(categorical_cols).tolist()
full_feature_names = numeric_cols + ohe_names

# Transform first 5 rows. ColumnTransformer may hand back a scipy sparse matrix
# instead of an ndarray when the stacked output is sparse enough (see its
# `sparse_threshold` parameter), so densify before building the DataFrame.
X_sample_enc = preprocessor.transform(X_train.iloc[:5])
if hasattr(X_sample_enc, "toarray"):
    X_sample_enc = X_sample_enc.toarray()

# Create a DataFrame
df_enc = pd.DataFrame(X_sample_enc, columns=full_feature_names)

print("\n🔍 First 5 rows AFTER preprocessing (numeric + one-hot):\n")
print(df_enc.head())


🔍 First 5 rows AFTER preprocessing (numeric + one-hot):

   SeniorCitizen    tenure  MonthlyCharges  TotalCharges  gender_Female  \
0      -0.441773  0.102371       -0.521976     -0.263289            0.0   
1      -0.441773 -0.711743        0.337478     -0.504814            0.0   
2      -0.441773 -0.793155       -0.809013     -0.751213            0.0   
3      -0.441773 -0.263980        0.284384     -0.173699            1.0   
4      -0.441773 -1.281624       -0.676279     -0.990851            0.0   

   gender_Male  Partner_No  Partner_Yes  Dependents_No  Dependents_Yes  ...  \
0          1.0         1.0          0.0            1.0             0.0  ...   
1          1.0         0.0          1.0            0.0             1.0  ...   
2          1.0         0.0          1.0            0.0             1.0  ...   
3          0.0         0.0          1.0            1.0             0.0  ...   
4          1.0         0.0          1.0            0.0             1.0  ...   

   StreamingMovi

In [9]:
# churn_pipeline.py  (PART 6)
# Candidate estimators.
# n_jobs is deliberately not passed to LogisticRegression: scikit-learn
# documents it as parallelising over classes for one-vs-rest fits and as ignored
# by the liblinear solver (which is part of the grid below), so it does nothing
# useful for a binary target.
log_reg = LogisticRegression(max_iter=500)
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Hyper-parameter grids. The "clf__" prefix addresses the parameters of a
# pipeline step named "clf".
log_reg_params = {
    "clf__C": [0.01, 0.1, 1, 10],
    "clf__penalty": ["l2"],
    "clf__solver": ["lbfgs", "liblinear"],
}

rf_params = {
    "clf__n_estimators": [100, 300],
    "clf__max_depth": [None, 5, 10],
    "clf__min_samples_split": [2, 5],
}

In [10]:
# churn_pipeline.py  (PART 7)
def build_and_tune(name, model, param_grid, *, cv=5, scoring="accuracy"):
    """Grid-search a preprocessing + classifier pipeline and report its scores.

    The pipeline chains the module-level ``preprocessor`` (step ``"prep"``) with
    ``model`` (step ``"clf"``). It is tuned on the module-level ``X_train`` /
    ``y_train`` and then evaluated on ``X_test`` / ``y_test``.

    Parameters
    ----------
    name : str
        Label used in the printed report.
    model : estimator
        Classifier placed in the ``"clf"`` step.
    param_grid : dict
        Grid passed to ``GridSearchCV``; keys address pipeline parameters
        (e.g. ``"clf__C"``).
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, default "accuracy"
        Metric forwarded to ``GridSearchCV`` and named in the printed
        cross-validation line.

    Returns
    -------
    The ``best_estimator_`` of the fitted search, i.e. the pipeline refitted
    with the best parameter combination.

    Notes
    -----
    The test-set lines always report accuracy and the classification report,
    independent of ``scoring``.
    """
    pipe = Pipeline(
        steps=[
            ("prep", preprocessor),
            ("clf", model),
        ]
    )
    gs = GridSearchCV(
        pipe,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=1,
    )
    gs.fit(X_train, y_train)

    # Cross-validated score of the best grid point and the parameters behind it.
    print(f"\n{name} — Best CV {scoring}: {gs.best_score_:.4f}")
    print("Best params:", gs.best_params_)

    # Held-out evaluation using the refitted best pipeline.
    y_pred = gs.predict(X_test)
    print("Test accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    return gs.best_estimator_

In [11]:
# churn_pipeline.py  (PART 8)
# Tune each candidate in turn; every call prints its own CV / test report and
# hands back the best refitted pipeline.
best_logreg = build_and_tune(
    name="Logistic Regression",
    model=log_reg,
    param_grid=log_reg_params,
)
best_rf = build_and_tune(
    name="Random Forest",
    model=rf,
    param_grid=rf_params,
)

# Decide which model to keep. Here we simply keep the RF; no scores are compared.
# NOTE(review): in the output recorded with this cell the logistic regression
# has the higher CV and test accuracy — confirm that keeping the RF is intended.
final_pipeline = best_rf

Fitting 5 folds for each of 8 candidates, totalling 40 fits

Logistic Regression — Best CV accuracy: 0.8049
Best params: {'clf__C': 10, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}
Test accuracy: 0.8055358410220014
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409

Fitting 5 folds for each of 12 candidates, totalling 60 fits

Random Forest — Best CV accuracy: 0.8000
Best params: {'clf__max_depth': 10, 'clf__min_samples_split': 5, 'clf__n_estimators': 300}
Test accuracy: 0.801277501774308
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.66      0.52      0.58       374

    accuracy                           0.80      1409
   macro avg

In [12]:
# churn_pipeline.py  (PART 9)
# Serialise the selected pipeline to a joblib file in the working directory and
# confirm the destination on stdout.
MODEL_FILE = "telco_churn_pipeline.joblib"
joblib.dump(value=final_pipeline, filename=MODEL_FILE)
print(f"\n✅ Full pipeline exported to {MODEL_FILE}")


✅ Full pipeline exported to telco_churn_pipeline.joblib
