
## Code from model_.py




In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

class ModelPipeline:
    def __init__(self, model):
        self.model = model
        self.label_encoders = {}
        self.scaler = None

    def load(self, file_path):
        return pd.read_excel(file_path)

    def preprocess(self, data, is_training=True):
        # Label Encoding for categorical variables
        categorical_features = ['sub_grade', 'term', 'home_ownership', 'purpose',
                                 'application_type', 'verification_status']
        if is_training:
            for col in categorical_features:
                le = LabelEncoder()
                data[col] = le.fit_transform(data[col])
                self.label_encoders[col] = le
        else:
            for col in categorical_features:
                data[col] = self.label_encoders[col].transform(data[col])

        # Scale numerical variables
        numerical_features = ['cibil_score', 'total_no_of_acc', 'annual_inc',
                              'int_rate', 'loan_amnt', 'installment', 'account_bal', 'emp_length']
        if is_training:
            self.scaler = StandardScaler()
            data[numerical_features] = self.scaler.fit_transform(data[numerical_features])
        else:
            data[numerical_features] = self.scaler.transform(data[numerical_features])

        return data

    def train(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def test(self, X_val, y_val):
        predictions = self.model.predict(X_val)
        report = classification_report(y_val, predictions)
        accuracy = accuracy_score(y_val, predictions)
        return report, accuracy

    def predict(self, X):
        return self.model.predict(X)

# Example Usage
if __name__ == "__main__":
    # Load the data
    train_file = 'train_data.xlsx'
    test_file = 'test_data.xlsx'

    train_data = pd.read_excel(train_file)
    test_data = pd.read_excel(test_file)

    # Initialize pipelines for each model
    lr_pipeline = ModelPipeline(LogisticRegression(max_iter=1000, random_state=42))
    rf_pipeline = ModelPipeline(RandomForestClassifier(random_state=42))
    knn_pipeline = ModelPipeline(KNeighborsClassifier())
    xgb_pipeline = ModelPipeline(XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))


    # Preprocess data
    train_data = lr_pipeline.preprocess(train_data, is_training=True)
    X = train_data.drop(columns=['loan_status', 'customer_id', 'transaction_date'])
    y = train_data['loan_status']
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Logistic Regression
lr_pipeline.train(X_train, y_train)
report, accuracy = lr_pipeline.test(X_val, y_val)
print("Logistic Regression Results:\\n", report, "\\nAccuracy:", accuracy)
 # Random Forest
rf_pipeline.train(X_train, y_train)
report, accuracy = rf_pipeline.test(X_val, y_val)
print("Random Forest Results:\\n", report, "\\nAccuracy:", accuracy)
 # K-Nearest Neighbors
knn_pipeline.train(X_train, y_train)
report, accuracy = knn_pipeline.test(X_val, y_val)
print("KNN Results:\\n", report, "\\nAccuracy:", accuracy)
# XGBoost Classifier
xgb_pipeline.train(X_train, y_train)
report, accuracy = xgb_pipeline.test(X_val, y_val)
print("XGBoost Classifier Results:\\n", report, "\\nAccuracy:", accuracy)

Logistic Regression Results:\n               precision    recall  f1-score   support

           0       0.62      0.26      0.36      5917
           1       0.78      0.94      0.86     16824

    accuracy                           0.77     22741
   macro avg       0.70      0.60      0.61     22741
weighted avg       0.74      0.77      0.73     22741
 \nAccuracy: 0.7650499098544479
Random Forest Results:\n               precision    recall  f1-score   support

           0       0.59      0.30      0.40      5917
           1       0.79      0.93      0.85     16824

    accuracy                           0.76     22741
   macro avg       0.69      0.61      0.63     22741
weighted avg       0.74      0.76      0.73     22741
 \nAccuracy: 0.763642759773097
KNN Results:\n               precision    recall  f1-score   support

           0       0.47      0.31      0.37      5917
           1       0.78      0.88      0.83     16824

    accuracy                           0.73     22

Parameters: { "use_label_encoder" } are not used.



XGBoost Classifier Results:\n               precision    recall  f1-score   support

           0       0.60      0.30      0.40      5917
           1       0.79      0.93      0.85     16824

    accuracy                           0.77     22741
   macro avg       0.69      0.61      0.63     22741
weighted avg       0.74      0.77      0.74     22741
 \nAccuracy: 0.7652258036146168


In [10]:
from sklearn.metrics import classification_report, roc_auc_score

# Initialize models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Train and evaluate models
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    pipeline = ModelPipeline(model)
    pipeline.train(X_train, y_train)
    report, accuracy = pipeline.test(X_val, y_val)
    auc_score = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else [0]*len(y_val))
    results[name] = {
        'Accuracy': accuracy,
        'ROC AUC Score': auc_score,
        'Classification Report': report
    }
    print(f"{name} Results:\\n", report, "\\nROC AUC Score:", auc_score)



Training Logistic Regression...
Logistic Regression Results:\n               precision    recall  f1-score   support

           0       0.62      0.26      0.36      5917
           1       0.78      0.94      0.86     16824

    accuracy                           0.77     22741
   macro avg       0.70      0.60      0.61     22741
weighted avg       0.74      0.77      0.73     22741
 \nROC AUC Score: 0.7338315853857584
Training Random Forest...
Random Forest Results:\n               precision    recall  f1-score   support

           0       0.59      0.30      0.40      5917
           1       0.79      0.93      0.85     16824

    accuracy                           0.76     22741
   macro avg       0.69      0.61      0.63     22741
weighted avg       0.74      0.76      0.73     22741
 \nROC AUC Score: 0.7276281465246256
Training KNN...
KNN Results:\n               precision    recall  f1-score   support

           0       0.47      0.31      0.37      5917
           1       0

Parameters: { "use_label_encoder" } are not used.



XGBoost Results:\n               precision    recall  f1-score   support

           0       0.60      0.30      0.40      5917
           1       0.79      0.93      0.85     16824

    accuracy                           0.77     22741
   macro avg       0.69      0.61      0.63     22741
weighted avg       0.74      0.77      0.74     22741
 \nROC AUC Score: 0.7395661430659388


# Compare Model Performance

In [11]:
import pandas as pd

# Create a DataFrame for comparison
comparison_df = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy': [results[model]['Accuracy'] for model in results],
    'ROC AUC': [results[model]['ROC AUC Score'] for model in results]
})
print(comparison_df)


                 Model  Accuracy   ROC AUC
0  Logistic Regression  0.765050  0.733832
1        Random Forest  0.763643  0.727628
2                  KNN  0.729343  0.659905
3              XGBoost  0.765226  0.739566


# Performing Hyperparameter Tuning
Using GridSearchCV

-focusing on the paramenter that significantly impact the model's performance.

In [13]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

reduced the grid size

In [19]:
param_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.1, 0.2],
    'max_depth': [3, 4],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

In [20]:
grid_search = GridSearchCV(XGBClassifier(eval_metric='logloss', random_state=42),
                           param_grid, cv=3, scoring='accuracy', verbose=1)
grid_search = GridSearchCV(
    XGBClassifier(eval_metric='logloss', random_state=42,
                  n_jobs=-1),
    param_grid,
    cv=2,
    scoring='accuracy',
    verbose=1,
)
# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 32 candidates, totalling 64 fits


In [21]:
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best Cross-Validation Score: 0.766402093135746


Final model selected Xgboost Classifier

