In [1]:
# Task 4: Baseline Model Building with Pipelines and Classification Report

import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load train.csv
train = pd.read_csv("data/train.csv")

# Separate features and target
X = train.drop(columns="churn")
y = train["churn"].map({"No": 0, "Yes": 1})  # Encode churn: No -> 0, Yes -> 1

# Identify numerical and categorical columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

print(f"Numerical columns: {num_cols}")
print(f"Categorical columns: {cat_cols}")

# Preprocessing for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred)
    }

# Split train again into sub train-validation set for quick model testing
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Store results
results = {}

# Train and evaluate each model
for name, clf in models.items():
    print("="*60)
    print(f"Training and Evaluating: {name}")
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', clf)])
    scores = evaluate_model(pipe, X_train, X_val, y_train, y_val)
    results[name] = scores

# Convert results into a DataFrame
results_df = pd.DataFrame(results).T
print("\nModel Evaluation Summary Table:")
print(results_df.sort_values(by="F1-Score", ascending=False))


Numerical columns: ['tenure', 'monthlycharges', 'totalcharges']
Categorical columns: ['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']
Training and Evaluating: Logistic Regression

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       826
           1       0.63      0.57      0.60       299

    accuracy                           0.80      1125
   macro avg       0.74      0.72      0.73      1125
weighted avg       0.79      0.80      0.79      1125

Training and Evaluating: KNN

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       826
           1       0.55      0.52      0.53       299

    accuracy                           0.76  

Parameters: { "use_label_encoder" } are not used.




Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.87      0.85       826
           1       0.60      0.53      0.56       299

    accuracy                           0.78      1125
   macro avg       0.72      0.70      0.71      1125
weighted avg       0.77      0.78      0.78      1125

Training and Evaluating: LightGBM
[LightGBM] [Info] Number of positive: 1196, number of negative: 3304
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 669
[LightGBM] [Info] Number of data points in the train set: 4500, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265778 -> initscore=-1.016151
[LightGBM] [Info] Start training from score -1.016151

Classification Report:
              precision    recall  f1



 Baseline Model Building (Step-by-Step Explanation)

1. Loaded the dataset "train.csv" and separated the features (X) and target (y).
2. Encoded the target column "churn" as 0 (No) and 1 (Yes).
3. Identified numerical and categorical columns from the features.
4. Built separate preprocessing pipelines:
   - For numerical columns: handled missing values and scaled features.
   - For categorical columns: handled missing values and applied one-hot encoding.
5. Combined both pipelines into a single ColumnTransformer for full preprocessing.
6. Defined multiple machine learning models:
   - Logistic Regression, KNN, SVM, Decision Tree, Random Forest, XGBoost, and LightGBM.
7. Created a pipeline combining preprocessing and each model.
8. Split the dataset into training and testing sets (80% train, 20% test) with stratified sampling.
9. Trained each model using the pipeline and evaluated performance:
   - Calculated Accuracy, Precision
