In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
import joblib

# Clean, working dataset URL (ASCII hyphen)
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"

# Load the dataset
df = pd.read_csv(url)

# Basic cleaning
df.drop("customerID", axis=1, inplace=True)
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"].replace(" ", np.nan), errors="coerce")
df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)

# Feature & target separation
X = df.drop("Churn", axis=1)
y = df["Churn"].map({"Yes": 1, "No": 0})

# Identify column types
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

# Preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Define models
models = {
    'logistic': LogisticRegression(max_iter=1000),
    'random_forest': RandomForestClassifier(random_state=42)
}

pipelines = {
    name: Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ]) for name, model in models.items()
}

# Hyperparameter search space
param_grids = {
    'logistic': {
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['liblinear']
    },
    'random_forest': {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [5, 10, None]
    }
}

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and tune
best_models = {}
for name in pipelines:
    print(f"\nTuning {name.upper()} model...")
    grid_search = GridSearchCV(pipelines[name], param_grids[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Validation Accuracy: {grid_search.best_score_:.4f}")

# Final test evaluation (choose best-performing model)
final_model = best_models['random_forest']  # or 'logistic'
y_pred = final_model.predict(X_test)

print("\nFinal Test Set Evaluation:")
print(classification_report(y_test, y_pred))
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Export the model pipeline
joblib.dump(final_model, "telco_churn_model.pkl")
print("\nModel pipeline saved as 'telco_churn_model.pkl'")



Tuning LOGISTIC model...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)


Best parameters for logistic: {'classifier__C': 1, 'classifier__solver': 'liblinear'}
Validation Accuracy: 0.8016

Tuning RANDOM_FOREST model...
Best parameters for random_forest: {'classifier__max_depth': 10, 'classifier__n_estimators': 50}
Validation Accuracy: 0.7980

Final Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1036
           1       0.66      0.52      0.58       373

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.73      1409
weighted avg       0.79      0.80      0.79      1409

Test Accuracy: 0.8027

Model pipeline saved as 'telco_churn_model.pkl'
