In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [32]:
## Reading the dataset
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [33]:
## Data Cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593994 entries, 0 to 593993
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    593994 non-null  int64  
 1   annual_income         593994 non-null  float64
 2   debt_to_income_ratio  593994 non-null  float64
 3   credit_score          593994 non-null  int64  
 4   loan_amount           593994 non-null  float64
 5   interest_rate         593994 non-null  float64
 6   gender                593994 non-null  object 
 7   marital_status        593994 non-null  object 
 8   education_level       593994 non-null  object 
 9   employment_status     593994 non-null  object 
 10  loan_purpose          593994 non-null  object 
 11  grade_subgrade        593994 non-null  object 
 12  loan_paid_back        593994 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 58.9+ MB


In [34]:
df.drop(['id'],axis=1,inplace=True)
df.drop(['gender'],axis=1,inplace=True)

In [35]:
df.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,29367.99,0.084,736,2528.42,13.67,Single,High School,Self-employed,Other,C3,1.0
1,22108.02,0.166,636,4593.1,12.92,Married,Master's,Employed,Debt consolidation,D3,0.0
2,49566.2,0.097,694,17005.15,9.76,Single,High School,Employed,Debt consolidation,C5,1.0
3,46858.25,0.065,533,4682.48,16.1,Single,High School,Employed,Debt consolidation,F1,1.0
4,25496.7,0.053,665,12184.43,10.21,Married,High School,Employed,Other,D1,1.0


In [36]:
df.isnull().sum()

annual_income           0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
interest_rate           0
marital_status          0
education_level         0
employment_status       0
loan_purpose            0
grade_subgrade          0
loan_paid_back          0
dtype: int64

In [37]:
df.duplicated().sum()

np.int64(0)

In [38]:
df.columns

Index(['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount',
       'interest_rate', 'marital_status', 'education_level',
       'employment_status', 'loan_purpose', 'grade_subgrade',
       'loan_paid_back'],
      dtype='object')

In [39]:
df['loan_purpose'].value_counts()

loan_purpose
Debt consolidation    324695
Other                  63874
Car                    58108
Home                   44118
Education              36641
Business               35303
Medical                22806
Vacation                8449
Name: count, dtype: int64

In [40]:
df['grade_subgrade'].value_counts()

grade_subgrade
C3    58695
C4    55957
C2    54443
C1    53363
C5    53317
D1    37029
D3    36694
D4    35097
D2    34432
D5    32101
B2    15167
B1    14344
B5    13937
B3    13926
B4    13877
E4     8036
E3     7075
E1     6891
E2     6372
E5     6084
F5     5947
F4     5535
F1     5534
F2     5203
F3     5082
A5     2471
A3     2066
A2     2018
A4     1701
A1     1600
Name: count, dtype: int64

In [41]:
df['education_level'].value_counts()

education_level
Bachelor's     279606
High School    183592
Master's        93097
Other           26677
PhD             11022
Name: count, dtype: int64

In [42]:
df['employment_status'].value_counts()

employment_status
Employed         450645
Unemployed        62485
Self-employed     52480
Retired           16453
Student           11931
Name: count, dtype: int64

In [201]:
df['loan_purpose'].value_counts()

loan_purpose
Debt consolidation    324695
Other                  63874
Car                    58108
Home                   44118
Education              36641
Business               35303
Medical                22806
Vacation                8449
Name: count, dtype: int64

In [160]:
## Getting All Different Types OF Features
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of Numerical Features :', len(num_features))
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of Categorical Features :', len(cat_features))
discrete_features=[feature for feature in num_features if len(df[feature].unique())<=25]
print('Num of Discrete Features :',len(discrete_features))
continuous_features=[feature for feature in num_features if feature not in discrete_features]
print('Num of Continuous Features :',len(continuous_features))

Num of Numerical Features : 6
Num of Categorical Features : 5
Num of Discrete Features : 1
Num of Continuous Features : 5


In [161]:
## Independent and Dependent Features
X = df.drop(['loan_paid_back'], axis=1)
y = df['loan_paid_back']

In [162]:
X.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,29367.99,0.084,736,2528.42,13.67,Single,High School,Self-employed,Other,C3
1,22108.02,0.166,636,4593.1,12.92,Married,Master's,Employed,Debt consolidation,D3
2,49566.2,0.097,694,17005.15,9.76,Single,High School,Employed,Debt consolidation,C5
3,46858.25,0.065,533,4682.48,16.1,Single,High School,Employed,Debt consolidation,F1
4,25496.7,0.053,665,12184.43,10.21,Married,High School,Employed,Other,D1


In [46]:
## Feature encoding and scaling

In [163]:
from sklearn.preprocessing import LabelEncoder
le_grade = LabelEncoder()
le_purpose = LabelEncoder()
X['grade_subgrade'] = le_grade.fit_transform(X['grade_subgrade'])
X['loan_purpose'] = le_purpose.fit_transform(X['loan_purpose'])

In [164]:
X.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,29367.99,0.084,736,2528.42,13.67,Single,High School,Self-employed,6,12
1,22108.02,0.166,636,4593.1,12.92,Married,Master's,Employed,2,17
2,49566.2,0.097,694,17005.15,9.76,Single,High School,Employed,2,14
3,46858.25,0.065,533,4682.48,16.1,Single,High School,Employed,2,25
4,25496.7,0.053,665,12184.43,10.21,Married,High School,Employed,6,15


In [165]:
num_features = X.select_dtypes(exclude="object").columns
onehot_columns = ['marital_status','employment_status','education_level']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, num_features)
        
    ],remainder='passthrough'
    
)

In [166]:
X = preprocessor.fit_transform(X)

In [167]:
X.shape

(593994, 18)

In [168]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.705461,-0.535135,0.993849,-1.803484,0.653899,2.046515,-0.384653
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.977248,0.660668,-0.810394,-1.505401,0.280571,-0.368235,0.592762
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.050689,-0.345556,0.236067,0.286558,-1.292385,-0.368235,0.006313
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.050687,-0.812211,-2.668764,-1.492497,1.863482,-0.368235,2.156626
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.850388,-0.987206,-0.287163,-0.409421,-1.068388,2.046515,0.201796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593989,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.943696,0.456506,0.398449,0.857295,-0.714971,-1.575610,-0.384653
593990,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.483783,-0.228892,-2.199661,-1.698263,1.126781,-0.368235,2.938559
593991,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.041164,-0.710130,-0.106739,-2.034358,0.882873,-0.368235,-0.775619
593992,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.068296,-0.783045,1.066019,0.183368,-1.237630,-0.368235,-1.557551


In [169]:
## Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((475195, 18), (118799, 18))

In [55]:
## Model training
## 1. Logistic Regression
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(X_train,y_train)

In [84]:
y_pred = logistic.predict(X_test)
logistic_model_prob = logistic.predict_proba(X_test)[:,1]
print(y_pred)

[1. 1. 1. ... 1. 1. 1.]


In [83]:
## Performance metrices
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score

In [87]:
logistic_auc = roc_auc_score(y_test,logistic_model_prob)
print(logistic_auc)

0.9109848243136325


In [62]:
## 2) GradientBoost
from sklearn.ensemble import GradientBoostingClassifier
grad_classifier = GradientBoostingClassifier()
grad_classifier.fit(X_train,y_train)

In [91]:
y_pred_grad = grad_classifier.predict(X_test)
gradient_model_prob = grad_classifier.predict_proba(X_test)[:,1]

In [92]:
gradient_auc = roc_auc_score(y_test,gradient_model_prob)
print(gradient_auc)

0.916193937343509


In [65]:
## 3) XGBoost
from xgboost import XGBClassifier
xg = XGBClassifier(eval_metric = 'logloss')
xg.fit(X_train,y_train)

In [94]:
y_pred_xg = xg.predict(X_test)
xg_model_prob = xg.predict_proba(X_test)[:,1]

In [95]:
xg_auc = roc_auc_score(y_test,xg_model_prob)
print(xg_auc)

0.9214874517240299


In [71]:
## 4) CatBoost

In [170]:
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier()
cat_model.fit(X_train,y_train)

Learning rate set to 0.143205
0:	learn: 0.5394027	total: 114ms	remaining: 1m 54s
1:	learn: 0.4425158	total: 214ms	remaining: 1m 46s
2:	learn: 0.3811232	total: 301ms	remaining: 1m 40s
3:	learn: 0.3415549	total: 379ms	remaining: 1m 34s
4:	learn: 0.3165096	total: 476ms	remaining: 1m 34s
5:	learn: 0.2987528	total: 572ms	remaining: 1m 34s
6:	learn: 0.2874887	total: 672ms	remaining: 1m 35s
7:	learn: 0.2789663	total: 779ms	remaining: 1m 36s
8:	learn: 0.2727612	total: 888ms	remaining: 1m 37s
9:	learn: 0.2684130	total: 969ms	remaining: 1m 35s
10:	learn: 0.2655358	total: 1.05s	remaining: 1m 34s
11:	learn: 0.2629666	total: 1.14s	remaining: 1m 33s
12:	learn: 0.2614418	total: 1.21s	remaining: 1m 31s
13:	learn: 0.2599235	total: 1.28s	remaining: 1m 30s
14:	learn: 0.2589785	total: 1.37s	remaining: 1m 30s
15:	learn: 0.2580043	total: 1.45s	remaining: 1m 29s
16:	learn: 0.2574799	total: 1.53s	remaining: 1m 28s
17:	learn: 0.2568273	total: 1.61s	remaining: 1m 27s
18:	learn: 0.2564829	total: 1.69s	remaining:

<catboost.core.CatBoostClassifier at 0x1d1494c2710>

In [171]:
y_pred_cat = cat_model.predict(X_test)
cat_model_prob = cat_model.predict_proba(X_test)[:,1]

In [172]:
cat_auc = roc_auc_score(y_test,cat_model_prob)
print(cat_auc)

0.9238322079276315


In [72]:
## 5) Random Forest

In [73]:
from sklearn.ensemble import RandomForestClassifier
rand = RandomForestClassifier()
rand.fit(X_train,y_train)

In [96]:
y_pred_rand = rand.predict(X_test)
random_model_prob = rand.predict_proba(X_test)[:,1]

In [97]:
random_auc = roc_auc_score(y_test,random_model_prob)
print(random_auc)

0.9089456914623488


In [79]:
## 6) LightGBM
from lightgbm import LGBMClassifier
light = LGBMClassifier()
light.fit(X_train,y_train)

[LightGBM] [Info] Number of positive: 379692, number of negative: 95503
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019901 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1336
[LightGBM] [Info] Number of data points in the train set: 475195, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.799024 -> initscore=1.380203
[LightGBM] [Info] Start training from score 1.380203


In [100]:
y_pred_light = light.predict(X_test)
light_model_prob = light.predict_proba(X_test)[:,1]

In [101]:
light_auc = roc_auc_score(y_test,logistic_model_prob)
print(light_auc)

0.9109848243136325


In [102]:
print("--------- Models Accuracy without hyperparameter tuning --------------")
print("Logistic Regression: ",logistic_auc)
print("Random Forest: ",random_auc)
print("Gradient Boosting Classifier: ",gradient_auc)
print("XG Boost Classifier: ",xg_auc)
print("LightGBM Classifier: ",light_auc)
print("CatBoost Classifier: ",cat_auc)
print("----------------------------------------------------------------------")
print("Best Result: CatBoost Classifier: ",cat_auc)
print("----------------------------------------------------------------------")

--------- Models Accuracy without hyperparameter tuning --------------
Logistic Regression:  0.9109848243136325
Random Forest:  0.9089456914623488
Gradient Boosting Classifier:  0.916193937343509
XG Boost Classifier:  0.9214874517240299
LightGBM Classifier:  0.9109848243136325
CatBoost Classifier:  0.9238322079276315
----------------------------------------------------------------------
Best Result: CatBoost Classifier:  0.9238322079276315
----------------------------------------------------------------------


## Applying the model on testing data

In [186]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,593994,28781.05,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D5
1,593995,46626.39,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C1
2,593996,54954.89,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D1
3,593997,25644.63,0.11,671,6574.3,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C3
4,593998,25169.64,0.081,688,17696.89,12.8,Female,Married,PhD,Employed,Business,C1


In [187]:
test.drop(['gender'],axis=1,inplace=True)

In [188]:
test.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,593994,28781.05,0.049,626,11461.42,14.73,Single,High School,Employed,Other,D5
1,593995,46626.39,0.093,732,15492.25,12.85,Married,Master's,Employed,Other,C1
2,593996,54954.89,0.367,611,3796.41,13.29,Single,Bachelor's,Employed,Debt consolidation,D1
3,593997,25644.63,0.11,671,6574.3,9.57,Single,Bachelor's,Employed,Debt consolidation,C3
4,593998,25169.64,0.081,688,17696.89,12.8,Married,PhD,Employed,Business,C1


In [189]:
x_test = test.drop(['id'],axis=1)

In [190]:
le_test_grade = LabelEncoder()
le_test_purpose = LabelEncoder()
x_test['grade_subgrade'] = le_test_grade.fit_transform(x_test['grade_subgrade'])
x_test['loan_purpose'] = le_test_purpose.fit_transform(x_test['loan_purpose'])

In [191]:
num_features = x_test.select_dtypes(exclude="object").columns
onehot_columns = ['marital_status','employment_status','education_level']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor_test = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, num_features)
        
    ],remainder='passthrough'
    
)

In [192]:
x_test = preprocessor_test.fit_transform(x_test)

In [193]:
pd.DataFrame(x_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.728006,-1.043756,-0.989459,-0.513617,1.178469,2.040421,0.982508
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.060132,-0.402190,0.916193,0.068692,0.246668,2.040421,-0.772640
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.251568,3.593019,-1.259127,-1.620933,0.464749,-0.370222,0.202442
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.845389,-0.154312,-0.180456,-1.219629,-1.379028,-0.370222,-0.382607
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.863165,-0.577162,0.125167,0.387183,0.221886,-1.575544,-0.772640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254564,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.669294,-0.766716,1.131927,2.121775,0.558920,-0.370222,-1.552706
254565,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.022957,-0.431352,-0.845636,0.760974,-1.374071,-0.370222,0.787491
254566,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.031623,-0.358447,0.664503,1.642668,-1.661542,-0.370222,-0.187591
254567,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.528750,-0.387609,1.042038,-0.564766,-1.260074,-1.575544,-0.577624


In [194]:
y_pred_test = cat_model.predict_proba(x_test)[:,1]

In [195]:
submission = pd.DataFrame({
    "id": test["id"],
    "loan_paid_back": y_pred_test
})


In [196]:
submission.to_csv("submission_cat.csv", index=False)
print("✅ Submission file created successfully! Upload 'submission.csv' to Kaggle.")

✅ Submission file created successfully! Upload 'submission.csv' to Kaggle.


In [197]:
## Saving my catboost mxg_model
cat_model

<catboost.core.CatBoostClassifier at 0x1d1494c2710>

In [198]:
cat_model.save_model("catboost_model.cbm")

In [199]:
import joblib
joblib.dump(preprocessor_test, "preprocessor.joblib")
print("✅ Saved preprocessor as 'preprocessor.joblib'")


✅ Saved preprocessor as 'preprocessor.joblib'


In [200]:
encoders = {
    "loan_purpose": le_test_purpose,
    "grade_subgrade": le_test_grade
}
joblib.dump(encoders, "label_encoders.joblib")
print("✅ Saved all label encoders as a dictionary.")


✅ Saved all label encoders as a dictionary.
