# In [None]:
# ==============================================================
# Task 2: End-to-End ML Pipeline with Scikit-learn Pipeline API
# Objective:
#   Build a reusable ML pipeline to predict customer churn
#   using the Telco Churn dataset.
# Skills:
#   - Preprocessing (scaling, encoding)
#   - Building Scikit-learn Pipelines
#   - Hyperparameter tuning (GridSearchCV)
#   - Export pipeline with joblib
# ==============================================================

# 1. Install Required Libraries (if not already installed)
# Uncomment and run these lines once if needed
# !pip install scikit-learn pandas joblib

# 2. Import Necessary Libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib

# 3. Load Dataset (Telco Churn Dataset)
# Make sure 'Telco-Customer-Churn.csv' is in your working directory.
data = pd.read_csv("Telco-Customer-Churn.csv")

# 4. Basic Data Inspection
print("üîπ Dataset Shape:", data.shape)
print("üîπ Columns:", data.columns.tolist())
print(data.head())

# 5. Handle Missing Values (if any)
# NOTE(review): in the commonly distributed copy of this dataset,
# "TotalCharges" is read as text because a few rows hold blank strings
# instead of numbers -- confirm against your CSV. dropna() does not treat
# blank strings as missing, so coerce the column to numeric first: blanks
# become NaN (and are then dropped) and the column is scaled as a number
# instead of being one-hot encoded. No-op if the column is already numeric.
if "TotalCharges" in data.columns:
    data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors="coerce")
data = data.dropna()

# 6. Separate Features (X) and Target (y)
y = data["Churn"]              # Target
X = data.drop(columns="Churn")  # Features
# A per-row identifier carries no signal; one-hot encoding it would add one
# column per customer. Drop it when the CSV provides it.
if "customerID" in X.columns:
    X = X.drop(columns="customerID")

# 7. Identify Categorical & Numerical Columns
# Partition on "number" vs. everything else so every feature lands in
# exactly one group. Listing specific dtypes (object / int64 / float64)
# would leave columns of other dtypes in neither group, and the
# ColumnTransformer would then silently drop them.
numerical_cols = X.select_dtypes(include="number").columns
categorical_cols = X.select_dtypes(exclude="number").columns

print("üîπ Categorical Columns:", categorical_cols.tolist())
print("üîπ Numerical Columns:", numerical_cols.tolist())

# 8. Preprocessing Pipelines
# Numerical columns are standardized; categorical columns are one-hot
# encoded, with handle_unknown="ignore" so categories not seen during
# fitting do not raise at transform time.
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_cols,
)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

# 9. Create a Machine Learning Pipeline
# The "classifier" step starts out as Logistic Regression; the parameter
# grid defined later in this script also substitutes a Random Forest into
# this same step.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)

# 10. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,      # hold out 20% of the rows for the final evaluation
    stratify=y,         # keep the class proportions of y in both splits
    random_state=42,
)

# 11. Define Models & Hyperparameters for GridSearchCV
# Each grid swaps a different estimator into the pipeline's "classifier"
# step and lists the hyperparameter values to try for that estimator.
logistic_grid = {
    "classifier": [LogisticRegression(max_iter=1000)],
    "classifier__C": [0.1, 1.0, 10.0],
}
forest_grid = {
    "classifier": [RandomForestClassifier(random_state=42)],
    "classifier__n_estimators": [50, 100],
    "classifier__max_depth": [5, 10, None],
}
param_grid = [logistic_grid, forest_grid]

# 12. Apply GridSearchCV for Best Model Selection
# 3-fold cross-validation on the training split, ranked by accuracy,
# using all available CPU cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)

print("‚è≥ Training and tuning the models...")
grid_search.fit(X_train, y_train)

# 13. Best Model from GridSearch
best_model = grid_search.best_estimator_
print("‚úÖ Best Parameters:", grid_search.best_params_)

# 14. Model Evaluation
# Score the selected pipeline on the held-out test split.
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("\nüîπ Accuracy:", accuracy)

# F1 is computed with "Yes" as the positive class.
churn_f1 = f1_score(y_test, y_pred, pos_label="Yes")
print("üîπ F1 Score:", churn_f1)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# 15. Export the Trained Pipeline
# The whole fitted pipeline (preprocessing + classifier) is saved as one
# object.
model_path = "churn_pipeline.pkl"
joblib.dump(best_model, model_path)
print("üì¶ Model pipeline saved as churn_pipeline.pkl")
