In [None]:
# Step 1: Imports
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Step 2: Load Data (replace with your churn dataset path)
df = pd.read_csv("/content/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Step 3: Define Features & Target
X = df.drop("Churn", axis=1)   # Features
y = df["Churn"]                # Target

# Step 4: Split Train & Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Identify Column Types
num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object"]).columns

# Step 6: Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),    # scale numeric features
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)  # encode categoricals
    ]
)

# Step 7: Create Pipeline (Logistic Regression as example)
log_reg_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Step 8: GridSearchCV for Logistic Regression
param_grid_logreg = {
    "classifier__C": [0.1, 1, 10],
    "classifier__solver": ["liblinear", "lbfgs"]
}

grid_logreg = GridSearchCV(log_reg_pipeline, param_grid_logreg, cv=5, scoring="accuracy")
grid_logreg.fit(X_train, y_train)

print("Best Logistic Regression Params:", grid_logreg.best_params_)

# Step 9: Test Best Logistic Regression Model
y_pred_logreg = grid_logreg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

# Step 10: Random Forest Pipeline
rf_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# Step 11: GridSearchCV for Random Forest
param_grid_rf = {
    "classifier__n_estimators": [50, 100],
    "classifier__max_depth": [5, 10, None]
}

grid_rf = GridSearchCV(rf_pipeline, param_grid_rf, cv=5, scoring="accuracy")
grid_rf.fit(X_train, y_train)

print("Best Random Forest Params:", grid_rf.best_params_)

# Step 12: Test Best Random Forest Model
y_pred_rf = grid_rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# Step 13: Save Best Model (example: Random Forest)
joblib.dump(grid_rf.best_estimator_, "best_churn_model.pkl")

# Step 14: Load Saved Model and Predict
loaded_model = joblib.load("best_churn_model.pkl")
sample_pred = loaded_model.predict(X_test[:5])
print("Predictions on sample:", sample_pred)


Best Logistic Regression Params: {'classifier__C': 1, 'classifier__solver': 'lbfgs'}
Logistic Regression Accuracy: 0.8261178140525195
              precision    recall  f1-score   support

          No       0.86      0.91      0.88      1036
         Yes       0.70      0.60      0.64       373

    accuracy                           0.83      1409
   macro avg       0.78      0.75      0.76      1409
weighted avg       0.82      0.83      0.82      1409

Best Random Forest Params: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
Random Forest Accuracy: 0.7963094393186657
              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1036
         Yes       0.67      0.46      0.54       373

    accuracy                           0.80      1409
   macro avg       0.75      0.69      0.71      1409
weighted avg       0.78      0.80      0.78      1409

Predictions on sample: ['Yes' 'No' 'No' 'Yes' 'No']


In [None]:
# Install Hugging Face Transformers (includes PyTorch)
!pip install -q transformers datasets

**Project Overview**

Built an end-to-end ML pipeline for customer churn prediction using the Telco Churn dataset.

Focused on making the solution reusable and production-ready.

Steps include preprocessing, model training, hyperparameter tuning, evaluation, and model export.

 **Why Pipeline?**

Automates data preprocessing + model training into a single workflow.

Ensures preprocessing (scaling, encoding) is always applied correctly to new data.

Makes the workflow clean, reusable, and ready for deployment.

 ***Why Joblib?***

Saves the entire trained pipeline (preprocessing + model).

Avoids retraining the model every time.

Enables quick reloading and direct predictions on new data.

***Workflow Steps***

Load dataset and separate features/target.

Train-test split for evaluation.

Preprocess numeric (scaling) and categorical (encoding) features.

Build pipelines with Logistic Regression and Random Forest.

Use GridSearchCV for hyperparameter tuning.

Evaluate both models with accuracy, precision, recall, and f1-score.

Export the best pipeline with Joblib.

Reload and use the saved model for new predictions.

 **Results**
Logistic Regression

Best Params: C = 1, solver = lbfgs

Accuracy: ~83%

Performs better overall, especially in recall for "No churn".

Random Forest

Best Params: n_estimators = 100, max_depth = None

Accuracy: ~80%

Slightly weaker performance compared to Logistic Regression on this dataset.

Sample Predictions

Example output: ['Yes', 'No', 'No', 'Yes', 'No']

 **Final Notes**

Logistic Regression was the stronger model here.

The saved .pkl file can be reused in a web app or API without retraining.

This project demonstrates ML pipeline construction, tuning, and deployment readiness.