<a href="https://colab.research.google.com/github/Spencer166/Second/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [5]:
# Path to the dataset
file_path = "/content/drive/MyDrive/loan_dataset_20000.csv"

# Load into DataFrame
df = pd.read_csv(file_path)

# Display first few rows
df.head()

Unnamed: 0,age,gender,marital_status,education_level,annual_income,monthly_income,employment_status,debt_to_income_ratio,credit_score,loan_amount,...,loan_term,installment,grade_subgrade,num_of_open_accounts,total_credit_limit,current_balance,delinquency_history,public_records,num_of_delinquencies,loan_paid_back
0,59,Male,Married,Master's,24240.19,2020.02,Employed,0.074,743,17173.72,...,36,581.88,B5,7,40833.47,24302.07,1,0,1,1
1,72,Female,Married,Bachelor's,20172.98,1681.08,Employed,0.219,531,22663.89,...,60,573.17,F1,5,27968.01,10803.01,1,0,3,1
2,49,Female,Single,High School,26181.8,2181.82,Employed,0.234,779,3631.36,...,60,76.32,B4,2,15502.25,4505.44,0,0,0,1
3,35,Female,Single,High School,11873.84,989.49,Employed,0.264,809,14939.23,...,36,468.07,A5,7,18157.79,5525.63,4,0,5,1
4,63,Other,Single,Other,25326.44,2110.54,Employed,0.26,663,16551.71,...,60,395.5,D5,1,17467.56,3593.91,2,0,2,1


In [6]:
# Define Target
target = "loan_paid_back"

# Split into features and Target
X = df.drop(columns=[target])
y = df[target]

# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y)

# Separate numeric and categorical columns
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include='object').columns


# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop="first",handle_unknown='ignore'), cat_cols)])

In [13]:
# Define Models
models = {
    "Logistic Regression": LogisticRegression(random_state=1, max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=1),
    "Gradient Boosting": GradientBoostingClassifier(random_state=1),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(n_neighbors=10)}

In [15]:
results = []

for name, model in models.items():
    pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", model)
    ])

    scores = cross_validate(
        pipe,
        X_train,
        y_train,
        cv=5,
        scoring=["accuracy", "roc_auc", "f1"],
        n_jobs=-1
    )

    results.append({
        "Model": name,
        "Accuracy": scores["test_accuracy"].mean(),
        "ROC_AUC": scores["test_roc_auc"].mean(),
        "F1": scores["test_f1"].mean()
    })

results_df = pd.DataFrame(results).sort_values(by="ROC_AUC", ascending=False)
results_df

Unnamed: 0,Model,Accuracy,ROC_AUC,F1
2,Gradient Boosting,0.901437,0.89009,0.941596
0,Logistic Regression,0.886625,0.879891,0.931676
1,Random Forest,0.899438,0.87266,0.940512
3,SVM,0.892813,0.864318,0.936489
4,KNN,0.809875,0.71985,0.89148
