In [None]:
# 📌 Task 2: End-to-End ML Pipeline with Scikit-learn
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# --------------------------
# Step 1: Load Dataset
# --------------------------
# Reads the raw churn CSV, removes the identifier column, makes sure
# TotalCharges is numeric, and encodes the target as 0/1.
path = '/content/WA_Fn-UseC_-Telco-Customer-Churn(4).csv'
data = pd.read_csv(path)

# Drop customerID (a row identifier, not a predictive feature)
data = data.drop("customerID", axis=1)

# NOTE(review): in the public Telco churn CSV, TotalCharges contains a few
# blank strings, so pandas reads the whole column as `object` -- confirm with
# data.info(). Left as text, the dtype-based feature split further down would
# one-hot encode it as a very high-cardinality categorical instead of scaling
# it as a number. Coerce it to float here; values that cannot be parsed become
# NaN and are filled with a constant (0.0), which uses no statistics from the
# data and therefore cannot leak information from the test split.
if "TotalCharges" in data.columns:
    data["TotalCharges"] = pd.to_numeric(
        data["TotalCharges"], errors="coerce"
    ).fillna(0.0)

# Convert target to binary (Yes=1, No=0)
data["Churn"] = data["Churn"].map({"Yes": 1, "No": 0})

# .map() silently turns any label other than "Yes"/"No" into NaN; fail fast
# here instead of with an obscure error during the stratified split.
if data["Churn"].isna().any():
    raise ValueError("Churn column contains labels other than 'Yes'/'No'")

In [None]:
# Print each column's dtype and non-null count to sanity-check the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [None]:
# --------------------------
# Step 2: Train-test split
# --------------------------
# Target vector is the Churn column; every remaining column is a feature.
y = data["Churn"]
X = data.drop(columns="Churn")

# Hold out 20% of the rows for testing. Stratifying on y keeps the churn ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# --------------------------
# Step 3: Preprocessing
# --------------------------
# Partition the feature columns by dtype: object columns are handled as
# categorical, every other column as numerical.
numerical_features = list(X.select_dtypes(exclude=["object"]).columns)
categorical_features = list(X.select_dtypes(include=["object"]).columns)

# One transformer per column group
numerical_transformer = StandardScaler()
# handle_unknown="ignore": categories not seen during fit do not raise at
# transform time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each column group through its own transformer
preprocessor = ColumnTransformer(
    [
        ("categorical", categorical_transformer, categorical_features),
        ("numerical", numerical_transformer, numerical_features),
    ]
)


In [None]:
# --------------------------
# Step 4: Pipeline
# --------------------------
# Two-stage pipeline: feature preprocessing followed by a classifier.
# The "model" step is the slot the hyperparameter grid below overrides.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# --------------------------
# Step 5: Hyperparameter tuning
# --------------------------
# Each grid replaces the pipeline's "model" step with one candidate estimator
# and lists the hyperparameter values to try for it.
logistic_regression_grid = {
    "model": [LogisticRegression(max_iter=1000)],
    "model__C": [0.1, 1.0, 10.0],
}
random_forest_grid = {
    "model": [RandomForestClassifier(random_state=42)],
    "model__n_estimators": [100, 200],
    "model__max_depth": [5, 10, None],
}
param_grid = [logistic_regression_grid, random_forest_grid]

# 3-fold cross-validated search scored by F1, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)


In [None]:

# --------------------------
# Step 6: Evaluation
# --------------------------
# Take the best pipeline found by the grid search and report which
# configuration produced it.
best_model = grid_search.best_estimator_
print("✅ Best Parameters:", grid_search.best_params_)

# Score the held-out test split.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("📊 Accuracy:", test_accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


✅ Best Parameters: {'model': LogisticRegression(max_iter=1000), 'model__C': 1.0}
📊 Accuracy: 0.794180269694819

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.86      1035
           1       0.63      0.54      0.58       374

    accuracy                           0.79      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.79      0.79      1409



In [None]:

# --------------------------
# Step 7: Save pipeline
# --------------------------
# Persist the whole fitted pipeline (preprocessing + model) to a single file.
joblib.dump(value=best_model, filename="churn_pipeline.pkl")
print("💾 Pipeline saved as churn_pipeline.pkl")

💾 Pipeline saved as churn_pipeline.pkl


In [None]:

# --------------------------
# Step 8: Load and predict (example)
# --------------------------
# Reload the persisted pipeline from disk.
# joblib.load unpickles the file, so only load files you trust.
loaded_model = joblib.load("churn_pipeline.pkl")

# Predict on the first five held-out rows as a smoke test.
sample = X_test.iloc[:5]
sample_predictions = loaded_model.predict(sample).tolist()
print("\n🔮 Predictions on sample data:", sample_predictions)


🔮 Predictions on sample data: [0, 1, 0, 0, 0]


In [None]:
# Leave the frame as the cell's last expression so the notebook renders it as
# a rich table; print() falls back to plain text that wraps wide frames across
# several column groups.
sample

      gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
437     Male              0     Yes        Yes      72          Yes   
2280  Female              1      No         No       8          Yes   
2235  Female              0     Yes        Yes      41          Yes   
4460    Male              0     Yes         No      18          Yes   
3761  Female              0     Yes         No      72          Yes   

     MultipleLines InternetService OnlineSecurity OnlineBackup  \
437            Yes     Fiber optic            Yes          Yes   
2280           Yes     Fiber optic             No           No   
2235           Yes             DSL            Yes          Yes   
4460            No     Fiber optic             No           No   
3761           Yes             DSL            Yes          Yes   

     DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
437               Yes         Yes         Yes             Yes        Two year   
2280          