In [2]:

import pandas as pd
import numpy as np
from google.colab import files

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Ask for the dataset through the Colab upload widget; the result maps each
# uploaded file name to its contents.
uploaded = files.upload()

# Fail fast with a clear message: without this guard an empty or cancelled
# upload leaves DATA_PATH unbound and the notebook only dies later, at
# pd.read_csv, with a confusing NameError.
if not uploaded:
    raise FileNotFoundError(
        "No file was uploaded; upload the churn CSV and re-run this cell."
    )

# If several files are uploaded, each one is reported and the last one wins.
for fn in uploaded.keys():
    print(f" Uploaded file: {fn}")
    DATA_PATH = "/content/" + fn

# LOAD DATASET

# Read the uploaded CSV and print a quick preview of its size and first rows.
df = pd.read_csv(DATA_PATH)
print("\n Dataset loaded successfully!")
print("Shape:", df.shape)
print(df.head())


# Coerce TotalCharges to numbers (entries that cannot be parsed become NaN),
# then fill those gaps with the column median.
# NOTE(review): the median is computed over the whole dataset, i.e. before the
# train/test split further down — confirm this small leak is acceptable.
if "TotalCharges" in df.columns:
    total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
    df["TotalCharges"] = total_charges.fillna(total_charges.median())


# FEATURE TARGET SPLIT

# customerID looks like a per-customer identifier rather than a feature. Left
# in X it is picked up as an object column and one-hot encoded, adding roughly
# one column per distinct ID that the models can only memorise. Drop it when
# present so the notebook still runs on files without that column.
id_cols = [col for col in ("customerID",) if col in df.columns]

X = df.drop(columns=["Churn"] + id_cols)
y = df["Churn"].map({"Yes": 1, "No": 0})

# .map() turns any label other than "Yes"/"No" into NaN; surface that here
# with the offending values instead of failing later inside the split or fit.
if y.isna().any():
    unexpected = sorted(df.loc[y.isna(), "Churn"].astype(str).unique())
    raise ValueError(
        f"Unexpected values in 'Churn' (expected 'Yes'/'No'): {unexpected}"
    )


# CATEGORICAL & NUMERIC COLUMNS

# Group feature columns by dtype: object columns are treated as categorical,
# int64/float64 columns as numeric. Columns of any other dtype end up in
# neither list.
categorical_frame = X.select_dtypes(include="object")
numeric_frame = X.select_dtypes(include=["int64", "float64"])

cat_cols = list(categorical_frame.columns)
num_cols = list(numeric_frame.columns)

print("\n Categorical columns:", cat_cols)
print(" Numeric columns:", num_cols)


# TRAIN-TEST SPLIT

# Hold out 20% of the rows for testing. The split is seeded for
# reproducibility and stratified on y so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


# PREPROCESSING

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fitting ignored at transform time.
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


# PIPELINES

# Both pipelines start with the same preprocessing step and differ only in
# the final estimator.

# Logistic Regression pipeline
logreg_steps = [
    ("preprocessor", preprocessor),
    ("logreg", LogisticRegression(max_iter=1000)),
]
pipeline_logreg = Pipeline(steps=logreg_steps)

# Random Forest pipeline (seeded so results are repeatable)
rf_steps = [
    ("preprocessor", preprocessor),
    ("rf", RandomForestClassifier(random_state=42)),
]
pipeline_rf = Pipeline(steps=rf_steps)

# HYPERPARAMETER GRIDS

# Keys follow the "<pipeline step>__<parameter>" convention so the grid search
# can route each value to the right estimator inside its pipeline.

# Logistic regression: regularisation strength and class weighting.
param_grid_logreg = dict(
    logreg__C=[0.1, 1.0, 10],
    logreg__class_weight=[None, "balanced"],
)

# Random forest: number of trees, depth limit and class weighting.
param_grid_rf = dict(
    rf__n_estimators=[100, 200],
    rf__max_depth=[None, 5, 10],
    rf__class_weight=[None, "balanced"],
)


# GRID SEARCH SETUP

# Settings shared by both searches: 5-fold cross-validation, accuracy as the
# selection metric, and all available cores.
search_options = {"cv": 5, "scoring": "accuracy", "n_jobs": -1}

grid_logreg = GridSearchCV(
    estimator=pipeline_logreg,
    param_grid=param_grid_logreg,
    **search_options,
)

grid_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    **search_options,
)

# Fit each search on the training split only; the test split stays untouched
# until evaluation.
print("\n Step 2: Training Logistic Regression...")
grid_logreg.fit(X_train, y_train)

print("\n Step 3: Training Random Forest...")
grid_rf.fit(X_train, y_train)


# EVALUATION

# Score each tuned model on the held-out test split and print its best
# hyperparameters, accuracy and per-class report.
print("\n Evaluation Results")

# Logistic Regression
y_pred_log = grid_logreg.predict(X_test)
acc_log = round(accuracy_score(y_test, y_pred_log), 4)
report_log = classification_report(y_test, y_pred_log)
print("\n Logistic Regression Best Params:", grid_logreg.best_params_)
print("Accuracy (LogReg):", acc_log)
print("\nClassification Report (LogReg):\n", report_log)

# Random Forest
y_pred_rf = grid_rf.predict(X_test)
acc_rf = round(accuracy_score(y_test, y_pred_rf), 4)
report_rf = classification_report(y_test, y_pred_rf)
print("\n Random Forest Best Params:", grid_rf.best_params_)
print(" Accuracy (Random Forest):", acc_rf)
print("\nClassification Report (Random Forest):\n", report_rf)


print("\n Saving best model pipeline...")

# Pick the winner by mean cross-validated accuracy (not the test score);
# a tie goes to logistic regression.
best_model = grid_rf if grid_rf.best_score_ > grid_logreg.best_score_ else grid_logreg

# Persist the refitted Pipeline itself rather than the whole GridSearchCV
# wrapper: the file then holds exactly what the messages promise, without the
# search bookkeeping, and still exposes predict / predict_proba.
# `best_model` stays bound to the search object for any later cells.
joblib.dump(best_model.best_estimator_, "telco_churn_pipeline.pkl")
print(" Pipeline exported as 'telco_churn_pipeline.pkl'")




Saving WA_Fn-UseC_-Telco-Customer-Churn.csv to WA_Fn-UseC_-Telco-Customer-Churn.csv
 Uploaded file: WA_Fn-UseC_-Telco-Customer-Churn.csv

 Dataset loaded successfully!
Shape: (7043, 21)
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service          