In [168]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, SGDRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Load File

In [169]:
# Read the raw loan records; the relative path means the CSV must sit in the working directory.
df = pd.read_csv('Loan_Default.csv')

# Explore Data

Convert every column header to lowercase so that the columns are easier to reference in later cells.

In [170]:
# Normalise every header to lowercase so later cells can reference columns uniformly
df.columns = df.columns.map(str.lower)

Explore the data to understand its structure and contents.

In [171]:
# Preview the first rows. A bare last expression is rendered as a rich table,
# whereas print() produces a plain-text dump that wraps wide frames across lines.
df.head()

      id  year loan_limit             gender approv_in_adv loan_type  \
0  24890  2019         cf  Sex Not Available         nopre     type1   
1  24891  2019         cf               Male         nopre     type2   
2  24892  2019         cf               Male           pre     type1   
3  24893  2019         cf               Male         nopre     type1   
4  24894  2019         cf              Joint           pre     type1   

  loan_purpose credit_worthiness open_credit business_or_commercial  ...  \
0           p1                l1        nopc                  nob/c  ...   
1           p1                l1        nopc                    b/c  ...   
2           p1                l1        nopc                  nob/c  ...   
3           p4                l1        nopc                  nob/c  ...   
4           p1                l1        nopc                  nob/c  ...   

   credit_type  credit_score  co-applicant_credit_type    age  \
0          EXP           758                 

In [172]:
# Frame dimensions as (rows, columns). Printed explicitly: a bare expression
# that is not the last line of a cell is evaluated and silently discarded.
print(df.shape)

# info() writes its report to stdout and returns None, so wrapping it in
# print() would only append a stray "None" to the output.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148670 entries, 0 to 148669
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         148670 non-null  int64  
 1   year                       148670 non-null  int64  
 2   loan_limit                 145326 non-null  object 
 3   gender                     148670 non-null  object 
 4   approv_in_adv              147762 non-null  object 
 5   loan_type                  148670 non-null  object 
 6   loan_purpose               148536 non-null  object 
 7   credit_worthiness          148670 non-null  object 
 8   open_credit                148670 non-null  object 
 9   business_or_commercial     148670 non-null  object 
 10  loan_amount                148670 non-null  int64  
 11  rate_of_interest           112231 non-null  float64
 12  interest_rate_spread       112031 non-null  float64
 13  upfront_charges            10

In [173]:
# Summary statistics of the numeric columns
print(df.describe().T)
print('')

# Category frequencies, one non-numeric column at a time.
# DataFrame.value_counts() on the whole frame counts distinct complete rows
# rather than the categories of each column, so it is applied per column here.
for column in df.select_dtypes(exclude='number').columns:
    print(df[column].value_counts(dropna=False))
    print('')

# Combined overview of every column (count / unique / top / freq plus numeric statistics)
df.describe(include='all').T

                         count           mean            std           min  \
id                    148670.0   99224.500000   42917.476598  24890.000000   
year                  148670.0    2019.000000       0.000000   2019.000000   
loan_amount           148670.0  331117.743997  183909.310127  16500.000000   
rate_of_interest      112231.0       4.045476       0.561391      0.000000   
interest_rate_spread  112031.0       0.441656       0.513043     -3.638000   
upfront_charges       109028.0    3224.996127    3251.121510      0.000000   
term                  148629.0     335.136582      58.409084     96.000000   
property_value        133572.0  497893.465696  359935.315562   8000.000000   
income                139520.0    6957.338876    6496.586382      0.000000   
credit_score          148670.0     699.789103     115.875857    500.000000   
ltv                   133572.0      72.746457      39.967603      0.967478   
status                148670.0       0.246445       0.430942    

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,148670.0,,,,99224.5,42917.476598,24890.0,62057.25,99224.5,136391.75,173559.0
year,148670.0,,,,2019.0,0.0,2019.0,2019.0,2019.0,2019.0,2019.0
loan_limit,145326.0,2.0,cf,135348.0,,,,,,,
gender,148670.0,4.0,Male,42346.0,,,,,,,
approv_in_adv,147762.0,2.0,nopre,124621.0,,,,,,,
loan_type,148670.0,3.0,type1,113173.0,,,,,,,
loan_purpose,148536.0,4.0,p3,55934.0,,,,,,,
credit_worthiness,148670.0,2.0,l1,142344.0,,,,,,,
open_credit,148670.0,2.0,nopc,148114.0,,,,,,,
business_or_commercial,148670.0,2.0,nob/c,127908.0,,,,,,,


## Filter the relevant features and extract the numerical columns

In [174]:
# Columns that are not used by the models further down
not_related_feature = [
    'id',
    'year',
    'loan_limit',
    'gender',
    'approv_in_adv',
    'loan_purpose',
    'credit_worthiness',
    'open_credit',
    'business_or_commercial',
    'upfront_charges',
    'neg_ammortization',
    'interest_only',
    'lump_sum_payment',
    'construction_type',
    'occupancy_type',
    'secured_by',
    'total_units',
    'credit_type',
    'co-applicant_credit_type',
    'submission_of_application',
    'ltv',
    'region',
    'security_type',
]

# Rebind df to a copy without those columns
df = df.drop(columns=not_related_feature)

In [175]:
# Names of the numeric columns, in the frame's column order.
# select_dtypes is the public API; the underscore-prefixed
# DataFrame._get_numeric_data() is a pandas internal.
numerical_features = df.select_dtypes(include='number').columns.tolist()
numerical_features

['loan_amount',
 'rate_of_interest',
 'interest_rate_spread',
 'term',
 'property_value',
 'income',
 'credit_score',
 'status',
 'dtir1']

In [176]:
# Every column that is not numeric, kept in the frame's column order.
# A list comprehension is used instead of a set difference because a set has
# no defined ordering, which would make the order of this list (and of the
# encoded columns built from it) differ between kernel sessions.
categorical_features = [column for column in df.columns if column not in numerical_features]
categorical_features

['loan_type', 'age']

# Explore Numerical Value Data

### Check for missing values and rank the columns by their percentage of missing data

In [177]:
# Number of missing entries per column, computed once and reused below
missing_counts = df.isna().sum()

# Share of missing entries per column in percent, worst columns first
print('Missing Percentage')
print((missing_counts * 100 / len(df)).sort_values(ascending=False))
print('')

# Absolute number of missing entries per column
print('Missing value')
missing_counts

Missing Percentage
interest_rate_spread    24.644515
rate_of_interest        24.509989
dtir1                   16.224524
property_value          10.155378
income                   6.154571
age                      0.134526
term                     0.027578
loan_amount              0.000000
loan_type                0.000000
credit_score             0.000000
status                   0.000000
dtype: float64

Missing value


loan_type                   0
loan_amount                 0
rate_of_interest        36439
interest_rate_spread    36639
term                       41
property_value          15098
income                   9150
credit_score                0
age                       200
status                      0
dtir1                   24121
dtype: int64

# Preprocessing

Separate the DataFrame into features (x) and the target variable (y), excluding the target variable from the features.

In [178]:
# Features: every column except the target
x = df.drop(columns=['status'])

# Target, kept as a single-column DataFrame
y = df[['status']]

Use train_test_split to split the data into training and testing sets for both features and the target variable.

In [179]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [180]:
# Filling in missing values (numerical columns)

# Numeric predictors in the frame's own column order, with the target excluded.
# A list is used instead of a set so the order is the same on every run; the
# correlation filter applied later inspects an upper triangle, so which column
# of a correlated pair gets flagged depends on this order.
numerical_features = [
    column
    for column in df.select_dtypes(include='number').columns
    if column != 'status'
]

numerical_imputer = SimpleImputer(missing_values=np.nan, strategy='median')

# Learn the medians from the training split only and reuse them for the test
# split. Re-fitting on the test split would fill it with its own medians, so
# the two splits would be imputed inconsistently and test statistics would
# leak into preprocessing.
x_train[numerical_features] = numerical_imputer.fit_transform(x_train[numerical_features])
x_test[numerical_features] = numerical_imputer.transform(x_test[numerical_features])

In [181]:
# Filling in missing values (categorical columns)
categorical_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# The most frequent category is learned from the training split only and then
# applied unchanged to the test split; re-fitting on the test split would fill
# it with its own modes instead of the ones the model was trained with.
x_train[categorical_features] = categorical_imputer.fit_transform(x_train[categorical_features])
x_test[categorical_features] = categorical_imputer.transform(x_test[categorical_features])

### Check Correlation

In [182]:
# Absolute pairwise correlations between the numeric training features
corr_matrix = x_train[numerical_features].corr().abs()

# Keep only the strictly-upper triangle so that each pair is inspected once
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_)
upper = corr_matrix.where(upper_mask)

# Flag a column when it correlates above 0.5 with any column listed before it
exceeds_limit = (upper > 0.5).any().to_numpy()
feature_to_drop = upper.columns[exceeds_limit].tolist()
print(feature_to_drop)

['property_value', 'interest_rate_spread']


In [183]:
# Remove the flagged columns from both splits so they keep identical layouts
x_train = x_train.drop(columns=feature_to_drop)
x_test = x_test.drop(columns=feature_to_drop)

# Show the first five remaining training rows
x_train.head()

Unnamed: 0,loan_type,loan_amount,rate_of_interest,term,income,credit_score,age,dtir1
141245,type3,76500.0,3.5,360.0,2460.0,605.0,>74,12.0
3507,type1,556500.0,4.0,360.0,7200.0,729.0,45-54,43.0
53688,type2,126500.0,3.625,180.0,2100.0,609.0,65-74,42.0
46491,type1,246500.0,3.99,360.0,8220.0,600.0,45-54,41.0
54671,type1,486500.0,4.125,360.0,8940.0,701.0,55-64,35.0


Scale Feature

In [184]:
# Scale feature
filtered_numerical_features = x_train.select_dtypes(include=['int64', 'float']).columns

# Fit the scaling statistics on the training split only ...
scaler = StandardScaler().fit(x_train[filtered_numerical_features])

# ... then apply that same fitted transform to both splits
x_train[filtered_numerical_features] = scaler.transform(x_train[filtered_numerical_features])
x_test[filtered_numerical_features] = scaler.transform(x_test[filtered_numerical_features])

Encoding

In [185]:
# One-hot encode the categorical columns and pass every other column through untouched.
# handle_unknown='ignore' lets transform() accept categories that were not seen during fit.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',
)

# The encoder is fitted on the training split and only applied to the test split
x_train = ct.fit_transform(x_train)
x_test = ct.transform(x_test)

In [186]:
# Inspect the training target as it came out of the split (single 'status' column)
y_train

Unnamed: 0,status
141245,0
3507,0
53688,0
46491,1
54671,0
...,...
119879,1
103694,0
131932,0
146867,0


In [187]:
# Cut-off separating the two risk labels derived from the raw status value
threshold = 0.5

# Derive the labels as new arrays instead of assigning columns on the frames
# returned by the split, which avoids pandas' chained-assignment warning and
# replaces the Python-level loop with a vectorised comparison.
y_train_labels = np.where(y_train['status'] < threshold, 'Low', 'High')
y_test_labels = np.where(y_test['status'] < threshold, 'Low', 'High')

# NOTE(review): LabelEncoder numbers classes in sorted label order, so
# 'High' -> 0 and 'Low' -> 1. The encoded target is therefore the inverse of
# the raw column: encoded 1 corresponds to status 0 and encoded 0 to status 1.
# Keep this in mind when reading per-class metrics below.
le = LabelEncoder()

# Fit the label mapping on the training labels and reuse it for the test labels;
# both results are 1-D integer arrays, as the estimators expect.
y_train = le.fit_transform(y_train_labels)
y_test = le.transform(y_test_labels)

In [188]:
print(y_train)
print(y_test)
print("Unique values in y_train:", np.unique(y_train))

[1 1 1 ... 1 1 0]
[1 1 1 ... 0 1 0]
Unique values in y_train: [0 1]


# Model

## Logistic Regression

In [189]:
# Logistic regression with scikit-learn's default settings; the seed is fixed for repeatability.
classifier = LogisticRegression(random_state = 42)
# fit() is the last expression of the cell, so the fitted estimator is also displayed.
classifier.fit(x_train,y_train)

In [190]:
# Predicted class for every row of the held-out split
y_pred = classifier.predict(x_test)

# Two-column view: predictions on the left, true labels on the right
print(np.column_stack((y_pred, y_test)))

[[1 1]
 [1 1]
 [1 1]
 ...
 [1 0]
 [1 1]
 [1 0]]


In [191]:
# Per-class precision / recall / F1 of the logistic regression on the test split
report = classification_report(y_test, y_pred)
print(report)

# Overall accuracy; shown as the cell's output because it is the last expression
accuracy_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      7240
           1       0.76      1.00      0.86     22494

    accuracy                           0.76     29734
   macro avg       0.38      0.50      0.43     29734
weighted avg       0.57      0.76      0.65     29734



0.7563395439564136

## Classification via KNN

In [192]:
# k-nearest-neighbours classifier with scikit-learn's default hyper-parameters.
# KNeighborsClassifier is imported in the notebook's import cell with the other
# scikit-learn names, so all dependencies are declared in one place.
KnnPred = KNeighborsClassifier()
KnnPred.fit(x_train, y_train)

# Predicted class for every row of the held-out split
y_preds = KnnPred.predict(x_test)
y_preds

array([1, 1, 1, ..., 1, 1, 0])

In [193]:
# The classifier was built without arguments, so these are scikit-learn's defaults;
# listing them records the exact configuration that produced the results.
KnnPred.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [195]:
# Per-class precision / recall / F1 of the KNN model on the test split
report = classification_report(y_test, y_preds)
print(report)

# Overall test accuracy
print(accuracy_score(y_test, y_preds))

# 5-fold cross-validation on the training split.
# NOTE(review): x_train was imputed and scaled using the whole training split
# before this call, so the folds share preprocessing statistics.
scores = cross_val_score(
    KnnPred,
    x_train,
    y_train,
    cv=5,
)
print("Mean cross-validation score: %.2f" % scores.mean())

              precision    recall  f1-score   support

           0       0.68      0.76      0.72      7240
           1       0.92      0.89      0.90     22494

    accuracy                           0.86     29734
   macro avg       0.80      0.82      0.81     29734
weighted avg       0.86      0.86      0.86     29734

0.8557879868164391
Mean cross-validation score: 0.85


## Summary

## Comparison between the Logistic regression and KNN

The logistic regression model achieved an accuracy of approximately 75.63%, while the KNN model achieved a higher accuracy of approximately 85.58%. This indicates that the KNN model outperformed the logistic regression model in terms of overall accuracy. However, a comprehensive comparison should also consider other performance metrics such as precision, recall, and F1-score. By evaluating these metrics for both models, we can gain insights into their respective strengths and weaknesses. For instance, if the dataset is imbalanced, models with high recall for the minority class may be preferred. 

Logistic Regression:

Precision for class 0: 0.00

Precision for class 1: 0.76

Recall for class 0: 0.00

Recall for class 1: 1.00

F1-score for class 0: 0.00

F1-score for class 1: 0.86

Accuracy: 0.76

KNN:

Precision for class 0: 0.68

Precision for class 1: 0.92

Recall for class 0: 0.76

Recall for class 1: 0.89

F1-score for class 0: 0.72

F1-score for class 1: 0.90

Accuracy: 0.86

Comparison:

Precision: The KNN model achieves higher precision for both classes (0.68 vs 0.00 for class 0 and 0.92 vs 0.76 for class 1), meaning that a larger share of its predictions for each class is correct.
Recall: The KNN model has a much higher recall for class 0 (0.76 vs 0.00). Logistic regression has the higher recall for class 1 (1.00 vs 0.89), but only because it assigns every test sample to class 1 and never predicts class 0.
F1-score: The KNN model yields higher F1-scores for both classes (0.72 vs 0.00 and 0.90 vs 0.86), reflecting a better balance between precision and recall compared to logistic regression.
Accuracy: The KNN model also exhibits higher overall accuracy compared to logistic regression (0.86 vs 0.76).
In summary, the KNN model outperforms logistic regression on every metric except class-1 recall, where the perfect score of logistic regression is an artifact of predicting only the majority class. KNN is therefore the better model for this particular classification task.