# **Classification**

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Path of the classification dataset on the mounted Drive.
csv_path = '/content/drive/MyDrive/Projects/Project excellence series/ML_Classification&Regression/classification.csv'

# Load the data and preview the first rows.
filtered = pd.read_csv(csv_path)
filtered.head()

Unnamed: 0,person_age,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,71948.0,0.0,RENT,23093.125,16.02,0.37,3.0,561.0,No,1
1,21.0,12282.0,0.0,OWN,1000.0,11.14,0.08,2.0,504.0,Yes,0
2,25.0,12438.0,3.0,MORTGAGE,5500.0,12.87,0.37,3.0,635.0,No,1
3,23.0,79753.0,0.0,RENT,23093.125,15.23,0.37,2.0,675.0,No,1
4,24.0,66135.0,1.0,RENT,23093.125,14.27,0.37,4.0,586.0,No,1


### Checking if the target classes are balanced

In [None]:
# Count the rows per target class to see how unevenly loan_status is distributed.
loan_status = filtered['loan_status']
loan_status.value_counts()

Unnamed: 0_level_0,count
loan_status,Unnamed: 1_level_1
0,34988
1,10000


## **Handling class imbalance**

**SMOTENC** (Synthetic Minority Over-sampling Technique for Nominal and Continuous variables) is used to balance the dataset, because the target variable (`loan_status`) has an uneven class distribution.

In [None]:
# List the object-dtype (string) columns; these are the categorical
# features that SMOTENC has to be told about.
object_columns = filtered.select_dtypes(include='object')
object_columns.columns

Index(['person_home_ownership', 'previous_loan_defaults_on_file'], dtype='object')

In [3]:
from imblearn.over_sampling import SMOTE, SMOTENC

# Separate the predictors (X) from the loan_status target (y).
y = filtered['loan_status']
X = filtered.drop(columns='loan_status')

# SMOTENC is given the categorical features as positional column indices,
# so translate the column names into their positions within X.
categorical_cols = ['person_home_ownership', 'previous_loan_defaults_on_file']
categorical_indices = list(map(X.columns.get_loc, categorical_cols))

# Oversample with SMOTENC, seeded so the resampling is repeatable.
# NOTE(review): the resampling is applied to the full dataset here, before
# any train/test split, so synthetic rows can end up in the evaluation data.
# Consider resampling the training portion only.
smote = SMOTENC(categorical_features=categorical_indices, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Rebuild a single frame: resampled features plus the target as the last column.
df_resampled = pd.DataFrame(X_resampled, columns=X.columns).assign(loan_status=y_resampled)

# Both classes should now have the same number of rows.
df_resampled['loan_status'].value_counts()

Unnamed: 0_level_0,count
loan_status,Unnamed: 1_level_1
1,34988
0,34988


# Model building

In [6]:
import warnings
import numpy as np
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

warnings.filterwarnings("ignore")

# 🧱 **What is ColumnTransformer?**
ColumnTransformer lets you apply different transformations to different columns. For example, scale numerical features, and encode categorical ones — all in one step.

✅ **Why use ColumnTransformer?**

* Handle mixed data types.

* Avoid writing separate code for each transformation.

* Works great with pipelines!

In [4]:
df_resampled.columns

Index(['person_age', 'person_income', 'person_emp_exp',
       'person_home_ownership', 'loan_amnt', 'loan_int_rate',
       'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score',
       'previous_loan_defaults_on_file', 'loan_status'],
      dtype='object')

In [7]:
# Column groups, one per preprocessing strategy.
nominal_data = ['person_home_ownership', 'previous_loan_defaults_on_file']  # one-hot encoded
pos_skewed_cols = [
    'person_age', 'person_emp_exp', 'cb_person_cred_hist_length',
    'person_income', 'loan_int_rate', 'loan_percent_income', 'loan_amnt',
]  # log1p-transformed
scaling_cols = ['credit_score']  # standardised

# One (name, transformer, columns) entry per column group.
preprocessor = ColumnTransformer([
    ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore'), nominal_data),
    ('log', FunctionTransformer(np.log1p, validate=True), pos_skewed_cols),
    ('scaler', StandardScaler(), scaling_cols),
])

# Features and target taken from the resampled frame ('loan_status' is the target).
y = df_resampled['loan_status']
X = df_resampled.drop('loan_status', axis=1)

In [14]:
# Hold out 20% of the rows for evaluation.
# random_state makes the partition identical on every run (an unseeded split
# gives different accuracies each time the notebook is executed), and
# stratify=y keeps the class proportions the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [15]:
# Baseline run: only Logistic Regression is evaluated in this cell.
models = {"Logistic Regression": LogisticRegression()}

for name, model in models.items():
    # Chain the shared preprocessing with the classifier and fit on the training split.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    pipeline.fit(X_train, y_train)

    # Predict the hold-out rows and report the share classified correctly.
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}: Accuracy = {accuracy:.4f}")

Logistic Regression: Accuracy = 0.8849


# 🔄**What is a Pipeline in Machine Learning?**

A Pipeline is a way to chain multiple steps (like preprocessing + model training) into one object so your code is clean, and you don’t repeat transformations manually.

✅ **Why use Pipeline?**

* Ensures consistent preprocessing during training and testing.

* Helps in cross-validation without data leakage.

* Keeps your code clean and modular.

In [None]:
# Candidate classifiers to compare.
# The estimators whose training involves randomness are seeded so the reported
# accuracies are the same on every run; this also applies wherever `models`
# is reused later in the notebook.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost Classifier": XGBClassifier(
        n_estimators=200,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    ),
}

# Fit and score every candidate with the same preprocessing and the same split.
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Train on the training split only.
    pipeline.fit(X_train, y_train)

    # Predict the held-out rows.
    y_pred = pipeline.predict(X_test)

    # Fraction of held-out rows classified correctly.
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{name}: Accuracy = {accuracy:.4f}")


Logistic Regression: Accuracy = 0.8801
Decision Tree: Accuracy = 0.9033
K-Nearest Neighbors: Accuracy = 0.8930
Random Forest: Accuracy = 0.9339
XGBoost Classifier: Accuracy = 0.9486


# Cross Validation

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 5-fold stratified splitter, shuffled with a fixed seed.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Maps each model name to its mean cross-validated accuracy.
model_results = {}

for name, model in models.items():
    # Preprocessing sits inside the pipeline, so it is re-fitted within every fold.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

    # One accuracy value per fold.
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=stratified_kfold)

    model_results[name] = scores.mean()
    print(f"{name}: Mean Accuracy = {scores.mean():.4f}")

Logistic Regression: Mean Accuracy = 0.8864
Decision Tree: Mean Accuracy = 0.9051
K-Nearest Neighbors: Mean Accuracy = 0.8963
Random Forest: Mean Accuracy = 0.9398
XGBoost Classifier: Mean Accuracy = 0.9515


# 📌 Hyperparameter Tuning for XGBoost using RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Re-derive the feature matrix and the target from the resampled frame before tuning.
y = df_resampled['loan_status']
X = df_resampled.drop('loan_status', axis=1)

In [None]:
# Search space for the XGBoost step. The 'classifier__' prefix routes each
# entry to the pipeline step named 'classifier'.
xgb_param_grid = {
    'classifier__n_estimators': [100, 200, 300],  # More trees to learn better
    'classifier__max_depth': [5, 7, 8],  # Avoid excessive depth
    'classifier__learning_rate': [0.03, 0.05, 0.1],  # Balance between learning & generalization
    'classifier__min_child_weight': [1, 2],  # Allow smaller splits
    'classifier__gamma': [0, 0.05]  # Less aggressive regularization
}

# XGBoost pipeline: shared preprocessing followed by a seeded classifier.
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(enable_categorical=True, eval_metric='logloss', random_state=42))
])

# Randomized search (not an exhaustive grid search): a random subset of the
# combinations above is sampled and each one is scored with 25-fold CV.
# random_state fixes which combinations are sampled so reruns compare the same
# candidates; scoring is spelled out to match the cross-validation cell.
xgb_search = RandomizedSearchCV(
    xgb_pipeline,
    xgb_param_grid,
    cv=25,
    scoring='accuracy',
    random_state=42,
    verbose=1,
    n_jobs=-1,
)
xgb_search.fit(X, y)


Fitting 25 folds for each of 10 candidates, totalling 250 fits


In [None]:
# Best parameters and accuracy for XGBoost
print("Best parameters for XGBoost:\n", xgb_search.best_params_)
print()
print("Best XGBoost Accuracy:", xgb_search.best_score_)

Best parameters for XGBoost:
 {'classifier__n_estimators': 300, 'classifier__min_child_weight': 1, 'classifier__max_depth': 8, 'classifier__learning_rate': 0.05, 'classifier__gamma': 0.05}

Best XGBoost Accuracy: 0.9370933139386516


In [None]:
# Accuracy of the refitted best pipeline on the training split.
print("Train Accuracy:", xgb_search.best_estimator_.score(X_train, y_train))
# best_score_ is the mean cross-validated accuracy of the best candidate found
# by the search, not a score on a held-out test set, so it is labelled as such.
print("Mean CV Accuracy:", xgb_search.best_score_)

Train Accuracy: 0.9694355126831011
Test Accuracy: 0.9370933139386516
