In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

In [None]:
df = pd.read_csv('/content/archive (6).zip')

In [None]:
df.head()


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


***Features Description:***

**person_age**: Age of the individual applying for the loan.

**person_income:** Annual income of the individual.

**person_home_ownership:** Type of home ownership of the individual.

**rent**:  The individual is currently renting a property.

**mortgage:** The individual has a mortgage on the property they own.

**own**: The individual owns their home outright.
**other**: Other categories of home ownership that may be specific to the dataset.
**person_emp_length**: Employment length of the individual in years.

**loan_intent**: The intent behind the loan application.

**loan_grade**: The grade assigned to the loan based on the creditworthiness of the borrower.
A: The borrower has a high creditworthiness, indicating low risk.
B: The borrower is relatively low-risk, but not as creditworthy as Grade A.
C: The borrower's creditworthiness is moderate.
D: The borrower is considered to have higher risk compared to previous grades.
E: The borrower's creditworthiness is lower, indicating a higher risk.
F: The borrower poses a significant credit risk.
G: The borrower's creditworthiness is the lowest, signifying the highest risk.

**loan_amnt**: The loan amount requested by the individual.

**loan_int_rate**: The interest rate associated with the loan.

**loan_status**: Loan status, where 0 indicates non-default and 1 indicates default.

0: Non-default - The borrower successfully repaid the loan
0: Non-default - The borrower successfully repaid the loan as agreed, and there was no default.
1: Default - The borrower failed to repay the loan according to the agreed-upon terms and defaulted on the loan

In [None]:
df.isnull().sum()

Unnamed: 0,0
person_age,0
person_income,0
person_home_ownership,0
person_emp_length,895
loan_intent,0
loan_grade,0
loan_amnt,0
loan_int_rate,3116
loan_status,0
loan_percent_income,0


In [None]:
df.dropna(inplace=True)

In [None]:
df.isnull().sum()

Unnamed: 0,0
person_age,0
person_income,0
person_home_ownership,0
person_emp_length,0
loan_intent,0
loan_grade,0
loan_amnt,0
loan_int_rate,0
loan_status,0
loan_percent_income,0


In [None]:
df.dropna(inplace=True)

In [None]:
df.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,28638.0,28638.0,28638.0,28638.0,28638.0,28638.0,28638.0,28638.0
mean,27.727216,66649.37,4.788672,9656.493121,11.039867,0.2166,0.169488,5.793736
std,6.310441,62356.45,4.154627,6329.683361,3.229372,0.411935,0.106393,4.038483
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,39480.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55956.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,80000.0,7.0,12500.0,13.48,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [None]:
df.duplicated().sum()

137

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
numerical_columns = df.select_dtypes(include=['number']).columns

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = LabelEncoder().fit_transform(df[col])

In [None]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.928433608138923
Confusion Matrix:
 [[4397   44]
 [ 364  896]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.96      4441
           1       0.95      0.71      0.81      1260

    accuracy                           0.93      5701
   macro avg       0.94      0.85      0.89      5701
weighted avg       0.93      0.93      0.92      5701



In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
gnb = GaussianNB()

# train the model
gnb.fit(X_train, y_train)

In [None]:
y_pred = gnb.predict(X_test)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8152955621820733
Confusion Matrix:
 [[4292  149]
 [ 904  356]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.97      0.89      4441
           1       0.70      0.28      0.40      1260

    accuracy                           0.82      5701
   macro avg       0.77      0.62      0.65      5701
weighted avg       0.80      0.82      0.78      5701



In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# initialize Logistic Regression classifier
logreg = LogisticRegression(max_iter=1000)

# train the model
logreg.fit(X_train, y_train)

# make predictions
y_pred = logreg.predict(X_test)

# evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8030170145588493
Confusion Matrix:
 [[4369   72]
 [1051  209]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.98      0.89      4441
           1       0.74      0.17      0.27      1260

    accuracy                           0.80      5701
   macro avg       0.77      0.57      0.58      5701
weighted avg       0.79      0.80      0.75      5701



In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# initialize MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000)

# train the model
mlp.fit(X_train, y_train)

In [None]:
# make predictions
y_pred = mlp.predict(X_test)

# evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8216102438168742
Confusion Matrix:
 [[3927  514]
 [ 503  757]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.88      0.89      4441
           1       0.60      0.60      0.60      1260

    accuracy                           0.82      5701
   macro avg       0.74      0.74      0.74      5701
weighted avg       0.82      0.82      0.82      5701



In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# initialize Gradient Boosting ensemble
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# train the model
gb.fit(X_train, y_train)

# make predictions
y_pred = gb.predict(X_test)

# evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9194878091562884
Confusion Matrix:
 [[4379   62]
 [ 397  863]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.95      4441
           1       0.93      0.68      0.79      1260

    accuracy                           0.92      5701
   macro avg       0.92      0.84      0.87      5701
weighted avg       0.92      0.92      0.91      5701



In [None]:
#AdaBoost Ensemble

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# initialize AdaBoost ensemble
ada = AdaBoostClassifier(n_estimators=100, random_state=42)

# train the model
ada.fit(X_train, y_train)

# make predictions
y_pred = ada.predict(X_test)

# evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8896684792141729
Confusion Matrix:
 [[4265  176]
 [ 453  807]]
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.96      0.93      4441
           1       0.82      0.64      0.72      1260

    accuracy                           0.89      5701
   macro avg       0.86      0.80      0.83      5701
weighted avg       0.89      0.89      0.88      5701



In [None]:
from sklearn.model_selection import GridSearchCV
# define hyperparameter tuning space
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

# initialize Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')

# perform grid search
grid_search.fit(X_train, y_train)


In [55]:
# evaluate best model on test data
y_pred = grid_search.best_estimator_.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Test Accuracy: 0.9291352394316786
Confusion Matrix:
 [[4403   38]
 [ 366  894]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.96      4441
           1       0.96      0.71      0.82      1260

    accuracy                           0.93      5701
   macro avg       0.94      0.85      0.89      5701
weighted avg       0.93      0.93      0.93      5701



In [56]:
from sklearn.neighbors import KNeighborsClassifier

In [57]:
# initialize KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# train the model
knn.fit(X_train, y_train)

# make predictions
y_pred = knn.predict(X_test)

In [58]:
# evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8340641992632871
Confusion Matrix:
 [[4136  305]
 [ 641  619]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.93      0.90      4441
           1       0.67      0.49      0.57      1260

    accuracy                           0.83      5701
   macro avg       0.77      0.71      0.73      5701
weighted avg       0.82      0.83      0.82      5701



In [59]:
import xgboost as xgb

In [60]:
# initialize XGBoost classifier
xgb_model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1)

# train the model
xgb_model.fit(X_train, y_train)

In [61]:
# make predictions
y_pred = xgb_model.predict(X_test)

# evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9319417646027013
Confusion Matrix:
 [[4416   25]
 [ 363  897]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.96      4441
           1       0.97      0.71      0.82      1260

    accuracy                           0.93      5701
   macro avg       0.95      0.85      0.89      5701
weighted avg       0.93      0.93      0.93      5701



In [62]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.5


In [63]:
from catboost import CatBoostClassifier

In [66]:
# initialize CatBoost classifier
catboost_model = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=10)

# train the model
catboost_model.fit(X_train, y_train)

0:	learn: 0.5891317	total: 145ms	remaining: 14.4s
1:	learn: 0.5034079	total: 204ms	remaining: 9.99s
2:	learn: 0.4455483	total: 293ms	remaining: 9.48s
3:	learn: 0.3977874	total: 361ms	remaining: 8.66s
4:	learn: 0.3637065	total: 431ms	remaining: 8.18s
5:	learn: 0.3334913	total: 465ms	remaining: 7.29s
6:	learn: 0.3127408	total: 502ms	remaining: 6.66s
7:	learn: 0.2956215	total: 536ms	remaining: 6.16s
8:	learn: 0.2803370	total: 574ms	remaining: 5.8s
9:	learn: 0.2674302	total: 628ms	remaining: 5.66s
10:	learn: 0.2601309	total: 663ms	remaining: 5.37s
11:	learn: 0.2500620	total: 710ms	remaining: 5.21s
12:	learn: 0.2432257	total: 790ms	remaining: 5.29s
13:	learn: 0.2403642	total: 814ms	remaining: 5s
14:	learn: 0.2371762	total: 893ms	remaining: 5.06s
15:	learn: 0.2300550	total: 920ms	remaining: 4.83s
16:	learn: 0.2264224	total: 950ms	remaining: 4.64s
17:	learn: 0.2229060	total: 980ms	remaining: 4.46s
18:	learn: 0.2203140	total: 1.01s	remaining: 4.33s
19:	learn: 0.2184575	total: 1.04s	remaining: 

<catboost.core.CatBoostClassifier at 0x7fc4182f3460>

In [67]:
# make predictions
y_pred = catboost_model.predict(X_test)

# evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.932467988072268
Confusion Matrix:
 [[4417   24]
 [ 361  899]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.96      4441
           1       0.97      0.71      0.82      1260

    accuracy                           0.93      5701
   macro avg       0.95      0.85      0.89      5701
weighted avg       0.94      0.93      0.93      5701



In [68]:
import lightgbm as lgb

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [69]:
# initialize LightGBM classifier
lgb_model = lgb.LGBMClassifier(objective='binary', num_leaves=31, learning_rate=0.05, n_estimators=100)

# train the model
lgb_model.fit(X_train, y_train)

# make predictions
y_pred = lgb_model.predict(X_test)

[LightGBM] [Info] Number of positive: 4928, number of negative: 17872
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005845 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 935
[LightGBM] [Info] Number of data points in the train set: 22800, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216140 -> initscore=-1.288302
[LightGBM] [Info] Start training from score -1.288302


In [70]:
# evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9321171724258902
Confusion Matrix:
 [[4421   20]
 [ 367  893]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96      4441
           1       0.98      0.71      0.82      1260

    accuracy                           0.93      5701
   macro avg       0.95      0.85      0.89      5701
weighted avg       0.94      0.93      0.93      5701



In [73]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import lightgbm as lgb
import catboost as cb
import xgboost as xgb

In [74]:
# initialize models
lgb_model = lgb.LGBMClassifier(objective='binary', num_leaves=31, learning_rate=0.05, n_estimators=100)
catboost_model = cb.CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6)
xgb_model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)


In [75]:
# combine models using Voting Classifier
voting_model = VotingClassifier(estimators=[('lgb', lgb_model), ('catboost', catboost_model), ('xgb', xgb_model), ('rf', rf_model)], voting='soft')

# train the model
voting_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 4928, number of negative: 17872
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005485 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 935
[LightGBM] [Info] Number of data points in the train set: 22800, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216140 -> initscore=-1.288302
[LightGBM] [Info] Start training from score -1.288302
0:	learn: 0.5948180	total: 21.1ms	remaining: 2.09s
1:	learn: 0.5187394	total: 40.6ms	remaining: 1.99s
2:	learn: 0.4652170	total: 60.8ms	remaining: 1.97s
3:	learn: 0.4224368	total: 89.8ms	remaining: 2.15s
4:	learn: 0.3926859	total: 110ms	remaining: 2.08s
5:	learn: 0.3638853	total: 131ms	remaining: 2.05s
6:	learn: 0.3441731	total: 152ms	remaining: 2.01s
7:	learn: 0.3270820	total: 176ms	remaining: 2.02s
8:	learn: 0.3143009	total: 207ms	remaining: 2.09s
9:	learn: 0.3046025	total: 230ms	remaining: 2.07s
10:	

In [76]:
# make predictions
y_pred = voting_model.predict(X_test)

# evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9328188037186459
Confusion Matrix:
 [[4421   20]
 [ 363  897]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96      4441
           1       0.98      0.71      0.82      1260

    accuracy                           0.93      5701
   macro avg       0.95      0.85      0.89      5701
weighted avg       0.94      0.93      0.93      5701



In [77]:
# calculate correlation with target variable
corr_with_target = df.corrwith(df['loan_status'])

In [78]:
# select features with correlation above threshold (e.g., 0.3)
selected_features = corr_with_target[abs(corr_with_target) > 0.3].index.tolist()


In [80]:
df_selected = df[selected_features + ['loan_status']]

In [81]:
df_selected.head()

Unnamed: 0,loan_int_rate,loan_status,loan_percent_income,loan_grade_D,loan_status.1
0,16.02,1,0.59,True,1
1,11.14,0,0.1,False,0
2,12.87,1,0.57,False,1
3,15.23,1,0.53,False,1
4,14.27,1,0.55,False,1


In [82]:
XX = df_selected.drop('loan_status', axis=1)
yy = df_selected['loan_status']

In [83]:
XX_train, XX_test, yy_train, yy_test = train_test_split(XX, yy, test_size=0.2, random_state=42)

In [84]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(XX_train, yy_train)

In [85]:
yy_pred = rf.predict(XX_test)

In [87]:
print("Accuracy:", accuracy_score(yy_test, yy_pred))
print("Classification Report:\n", classification_report(yy_test, yy_pred))

Accuracy: 0.8309068584458866
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.52      0.57      1260
           1       0.65      0.52      0.57      1260

   micro avg       0.65      0.52      0.57      2520
   macro avg       0.65      0.52      0.57      2520
weighted avg       0.65      0.52      0.57      2520
 samples avg       0.11      0.11      0.11      2520



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
