In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import joblib

# 1️⃣ Load Dataset from local directory
print("📥 Loading dataset...")
data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")  # ✅ Your local file

# Handle TotalCharges (some are blank strings)
data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors="coerce")
data["TotalCharges"].fillna(data["TotalCharges"].median(), inplace=True)

# Split features & target
X = data.drop(["customerID", "Churn"], axis=1)
y = data["Churn"].map({"No": 0, "Yes": 1})

# 2️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3️⃣ Define Preprocessing
numeric_features = ["tenure", "MonthlyCharges", "TotalCharges"]
categorical_features = [col for col in X.columns if col not in numeric_features]

numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features)
])

# 4️⃣ Build Pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])

# 5️⃣ Grid Search Setup
param_grid = [
    {"model": [LogisticRegression(max_iter=1000)],
     "model__C": [0.1, 1, 10],
     "model__solver": ["liblinear"]},
    {"model": [RandomForestClassifier()],
     "model__n_estimators": [50, 100],
     "model__max_depth": [5, 10, None]}
]

grid = GridSearchCV(pipeline, param_grid, cv=3, scoring="accuracy", verbose=2)

# 6️⃣ Train
print("🚀 Training model...")
grid.fit(X_train, y_train)

# 7️⃣ Best Model & Results
print(f"\n✅ Best Model: {grid.best_estimator_}")
print(f"🔍 Best Params: {grid.best_params_}")
print(f"🎯 Best Accuracy: {grid.best_score_:.4f}")

# 8️⃣ Save the Pipeline
joblib.dump(grid.best_estimator_, "churn_pipeline.pkl")
print("\n💾 Model pipeline saved as churn_pipeline.pkl")

# 9️⃣ Load Model & Predict Sample
loaded_model = joblib.load("churn_pipeline.pkl")
sample = X_test.iloc[[0]]
prediction = loaded_model.predict(sample)[0]
print("\n📊 Sample Prediction:", "Churn" if prediction == 1 else "No Churn")


📥 Loading dataset...
🚀 Training model...
Fitting 3 folds for each of 9 candidates, totalling 27 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["TotalCharges"].fillna(data["TotalCharges"].median(), inplace=True)


[CV] END model=LogisticRegression(max_iter=1000), model__C=0.1, model__solver=liblinear; total time=   0.1s
[CV] END model=LogisticRegression(max_iter=1000), model__C=0.1, model__solver=liblinear; total time=   0.1s
[CV] END model=LogisticRegression(max_iter=1000), model__C=0.1, model__solver=liblinear; total time=   0.0s
[CV] END model=LogisticRegression(max_iter=1000), model__C=1, model__solver=liblinear; total time=   0.0s
[CV] END model=LogisticRegression(max_iter=1000), model__C=1, model__solver=liblinear; total time=   0.0s
[CV] END model=LogisticRegression(max_iter=1000), model__C=1, model__solver=liblinear; total time=   0.0s
[CV] END model=LogisticRegression(max_iter=1000), model__C=10, model__solver=liblinear; total time=   0.0s
[CV] END model=LogisticRegression(max_iter=1000), model__C=10, model__solver=liblinear; total time=   0.0s
[CV] END model=LogisticRegression(max_iter=1000), model__C=10, model__solver=liblinear; total time=   0.0s
[CV] END model=RandomForestClassifier