## Data Information

Loan Default Prediction Dataset This dataset contains information about customer loans, including customer demographics, loan details, and default status. The dataset can be used for various data analysis and machine learning tasks, such as predicting loan default risk. The dataset consists of the following columns:



* customer_id: Unique identifier for each customer 
* customer_age: Age of the customer
* customer_income: Annual income of the customer
* home_ownership: Home ownership status (e.g., RENT, OWN, MORTGAGE)
* employment_duration: Duration of employment in months
* loan_intent: Purpose of the loan (e.g., PERSONAL, EDUCATION, MEDICAL, VENTURE)
* loan_grade: Grade assigned to the loan
* loan_amnt: Loan amount requested
* loan_int_rate: Interest rate of the loan
* term_years: Loan term in years
* historical_default: Indicates if the customer has a history of default (Y/N)
* cred_hist_length: Length of the customer's credit history in years
* Current_loan_status: Current status of the loan (DEFAULT, NO DEFAULT)

In [89]:
import pandas as pd
import numpy as np


In [90]:
df = pd.read_csv("LoanDataset.csv")

In [91]:
df.head()

Unnamed: 0,customer_id,customer_age,customer_income,home_ownership,employment_duration,loan_intent,loan_grade,loan_amnt,loan_int_rate,term_years,historical_default,cred_hist_length,Current_loan_status
0,1.0,22,59000,RENT,123.0,PERSONAL,C,"£35,000.00",16.02,10,Y,3,DEFAULT
1,2.0,21,9600,OWN,5.0,EDUCATION,A,"£1,000.00",11.14,1,,2,NO DEFAULT
2,3.0,25,9600,MORTGAGE,1.0,MEDICAL,B,"£5,500.00",12.87,5,N,3,DEFAULT
3,4.0,23,65500,RENT,4.0,MEDICAL,B,"£35,000.00",15.23,10,N,2,DEFAULT
4,5.0,24,54400,RENT,8.0,MEDICAL,B,"£35,000.00",14.27,10,Y,4,DEFAULT


In [92]:
df.drop(columns = "customer_id", inplace = True)
df.head()

Unnamed: 0,customer_age,customer_income,home_ownership,employment_duration,loan_intent,loan_grade,loan_amnt,loan_int_rate,term_years,historical_default,cred_hist_length,Current_loan_status
0,22,59000,RENT,123.0,PERSONAL,C,"£35,000.00",16.02,10,Y,3,DEFAULT
1,21,9600,OWN,5.0,EDUCATION,A,"£1,000.00",11.14,1,,2,NO DEFAULT
2,25,9600,MORTGAGE,1.0,MEDICAL,B,"£5,500.00",12.87,5,N,3,DEFAULT
3,23,65500,RENT,4.0,MEDICAL,B,"£35,000.00",15.23,10,N,2,DEFAULT
4,24,54400,RENT,8.0,MEDICAL,B,"£35,000.00",14.27,10,Y,4,DEFAULT


In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32586 entries, 0 to 32585
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customer_age         32586 non-null  int64  
 1   customer_income      32586 non-null  object 
 2   home_ownership       32586 non-null  object 
 3   employment_duration  31691 non-null  float64
 4   loan_intent          32586 non-null  object 
 5   loan_grade           32586 non-null  object 
 6   loan_amnt            32585 non-null  object 
 7   loan_int_rate        29470 non-null  float64
 8   term_years           32586 non-null  int64  
 9   historical_default   11849 non-null  object 
 10  cred_hist_length     32586 non-null  int64  
 11  Current_loan_status  32582 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 3.0+ MB


In [94]:
def binary_convert(df, col_name):
    
    """
    
    Converts the values of 'DEFAULT' and '"DEFAULT NO' to 1 and 0 in the specified 
    column in the given DataFrame.
    
    Args:
    df (pd. DataFrame): The DataFrame to act on.
    column_name (str): The name of the column to be transformed.

    Returns:
    Pd. DataFrame: The DataFrame to which the values are converted.
    
    """
    
    df[col_name] = df[col_name].map({'DEFAULT': 1 ,'NO DEFAULT': 0})
    
    return df

binary_convert(df, "Current_loan_status")


Unnamed: 0,customer_age,customer_income,home_ownership,employment_duration,loan_intent,loan_grade,loan_amnt,loan_int_rate,term_years,historical_default,cred_hist_length,Current_loan_status
0,22,59000,RENT,123.0,PERSONAL,C,"£35,000.00",16.02,10,Y,3,1.0
1,21,9600,OWN,5.0,EDUCATION,A,"£1,000.00",11.14,1,,2,0.0
2,25,9600,MORTGAGE,1.0,MEDICAL,B,"£5,500.00",12.87,5,N,3,1.0
3,23,65500,RENT,4.0,MEDICAL,B,"£35,000.00",15.23,10,N,2,1.0
4,24,54400,RENT,8.0,MEDICAL,B,"£35,000.00",14.27,10,Y,4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
32581,57,53000,MORTGAGE,1.0,PERSONAL,C,"£5,800.00",13.16,7,,30,0.0
32582,54,120000,MORTGAGE,4.0,PERSONAL,A,"£17,625.00",7.49,4,,19,0.0
32583,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,"£35,000.00",10.99,5,N,28,1.0
32584,56,150000,MORTGAGE,5.0,PERSONAL,B,"£15,000.00",11.48,6,,26,0.0


In [95]:
df.loan_grade.value_counts()

loan_grade
A    15661
B     9065
C     4926
D     2629
E      305
Name: count, dtype: int64

In [96]:
def ordinary_convert(df, col_name):
    
    """
    
    Converts the values of 'A,B,C,D,E' to 5,4,3,2,1 in the specified 
    column in the given DataFrame.
    
    Args:
    df (pd. DataFrame): The DataFrame to act on.
    column_name (str): The name of the column to be transformed.

    Returns:
    Pd. DataFrame: The DataFrame to which the values are converted.
    
    
    """
    
    df[col_name] = df[col_name].map({"A": 5, "B": 4, "C": 3, "D": 2, "E": 1})
    return df

ordinary_convert(df, "loan_grade")

Unnamed: 0,customer_age,customer_income,home_ownership,employment_duration,loan_intent,loan_grade,loan_amnt,loan_int_rate,term_years,historical_default,cred_hist_length,Current_loan_status
0,22,59000,RENT,123.0,PERSONAL,3,"£35,000.00",16.02,10,Y,3,1.0
1,21,9600,OWN,5.0,EDUCATION,5,"£1,000.00",11.14,1,,2,0.0
2,25,9600,MORTGAGE,1.0,MEDICAL,4,"£5,500.00",12.87,5,N,3,1.0
3,23,65500,RENT,4.0,MEDICAL,4,"£35,000.00",15.23,10,N,2,1.0
4,24,54400,RENT,8.0,MEDICAL,4,"£35,000.00",14.27,10,Y,4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
32581,57,53000,MORTGAGE,1.0,PERSONAL,3,"£5,800.00",13.16,7,,30,0.0
32582,54,120000,MORTGAGE,4.0,PERSONAL,5,"£17,625.00",7.49,4,,19,0.0
32583,65,76000,RENT,3.0,HOMEIMPROVEMENT,4,"£35,000.00",10.99,5,N,28,1.0
32584,56,150000,MORTGAGE,5.0,PERSONAL,4,"£15,000.00",11.48,6,,26,0.0


In [97]:
df["customer_income"] = df["customer_income"].str.replace(',', '').str.strip()
df["customer_income"] = df["customer_income"].astype(int)

In [98]:
df["customer_income"].info() 

<class 'pandas.core.series.Series'>
RangeIndex: 32586 entries, 0 to 32585
Series name: customer_income
Non-Null Count  Dtype
--------------  -----
32586 non-null  int32
dtypes: int32(1)
memory usage: 127.4 KB


In [99]:
df.isnull().sum()

customer_age               0
customer_income            0
home_ownership             0
employment_duration      895
loan_intent                0
loan_grade                 0
loan_amnt                  1
loan_int_rate           3116
term_years                 0
historical_default     20737
cred_hist_length           0
Current_loan_status        4
dtype: int64

In [100]:
df["loan_amnt"].dropna()

0        £35,000.00
1         £1,000.00
2         £5,500.00
3        £35,000.00
4        £35,000.00
            ...    
32581     £5,800.00
32582    £17,625.00
32583    £35,000.00
32584    £15,000.00
32585     £6,475.00
Name: loan_amnt, Length: 32585, dtype: object

In [101]:
df["loan_amnt"] = df["loan_amnt"].str.replace(',', '').str.strip("£")
df["loan_amnt"] = df["loan_amnt"].astype("float64")

In [102]:
df["loan_amnt"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 32586 entries, 0 to 32585
Series name: loan_amnt
Non-Null Count  Dtype  
--------------  -----  
32585 non-null  float64
dtypes: float64(1)
memory usage: 254.7 KB


In [103]:
df["historical_default"].fillna("N", inplace = True)

In [104]:
df["employment_duration"].value_counts()

employment_duration
0.0      4105
2.0      3849
3.0      3457
5.0      2946
1.0      2915
4.0      2873
6.0      2669
7.0      2197
8.0      1687
9.0      1367
11.0      740
10.0      696
12.0      575
13.0      426
14.0      335
15.0      239
16.0      165
17.0      129
18.0      104
19.0       64
20.0       42
21.0       38
22.0       19
24.0       10
23.0       10
25.0        8
26.0        6
27.0        5
31.0        4
28.0        3
123.0       2
30.0        2
41.0        1
34.0        1
29.0        1
38.0        1
Name: count, dtype: int64

In [141]:
most_frequent_value = df["employment_duration"].value_counts().idxmax()

df["employment_duration"].fillna(most_frequent_value, inplace = True)

In [142]:
df["Current_loan_status"].dropna(inplace = True)

df["Current_loan_status"].value_counts()

Current_loan_status
0.0    25742
1.0     6840
Name: count, dtype: int64

In [143]:
df["loan_int_rate"].fillna(df["loan_int_rate"].mean, inplace = True)

In [146]:
df.dropna(inplace = True)
df.isnull().sum()

customer_age           0
customer_income        0
home_ownership         0
employment_duration    0
loan_intent            0
loan_grade             0
loan_amnt              0
loan_int_rate          0
term_years             0
historical_default     0
cred_hist_length       0
Current_loan_status    0
dtype: int64

In [147]:
df.head(5)

Unnamed: 0,customer_age,customer_income,home_ownership,employment_duration,loan_intent,loan_grade,loan_amnt,loan_int_rate,term_years,historical_default,cred_hist_length,Current_loan_status
0,22,59000,RENT,123.0,PERSONAL,3,35000.0,16.02,10,Y,3,1.0
1,21,9600,OWN,5.0,EDUCATION,5,1000.0,11.14,1,N,2,0.0
2,25,9600,MORTGAGE,1.0,MEDICAL,4,5500.0,12.87,5,N,3,1.0
3,23,65500,RENT,4.0,MEDICAL,4,35000.0,15.23,10,N,2,1.0
4,24,54400,RENT,8.0,MEDICAL,4,35000.0,14.27,10,Y,4,1.0


In [190]:
X = df.drop("Current_loan_status", axis = 1)

y = df["Current_loan_status"].astype(int)

In [191]:
cat_cols = [col for col in X.columns if X[col].dtypes == "O"]

In [192]:
cat_cols

['home_ownership', 'loan_intent', 'loan_int_rate', 'historical_default']

In [193]:
dummies = pd.get_dummies(X[cat_cols], drop_first = True).astype(int)
dummies

Unnamed: 0,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_int_rate_11.14,loan_int_rate_12.87,...,loan_int_rate_22.48,loan_int_rate_19.66,loan_int_rate_23.22,loan_int_rate_20.4,loan_int_rate_16.71,loan_int_rate_18.54,loan_int_rate_20.69,loan_int_rate_16.15,loan_int_rate_20.53,historical_default_Y
0,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32581,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32582,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32583,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32584,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [194]:
X.drop(cat_cols, axis = 1, inplace = True)

In [195]:
X = pd.concat([X, dummies], axis = 1)

In [196]:
X.head(5)

Unnamed: 0,customer_age,customer_income,employment_duration,loan_grade,loan_amnt,term_years,cred_hist_length,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,...,loan_int_rate_22.48,loan_int_rate_19.66,loan_int_rate_23.22,loan_int_rate_20.4,loan_int_rate_16.71,loan_int_rate_18.54,loan_int_rate_20.69,loan_int_rate_16.15,loan_int_rate_20.53,historical_default_Y
0,22,59000,123.0,3,35000.0,10,3,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,21,9600,5.0,5,1000.0,1,2,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,25,9600,1.0,4,5500.0,5,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,23,65500,4.0,4,35000.0,10,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,24,54400,8.0,4,35000.0,10,4,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [197]:

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier



In [198]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

knn = KNeighborsClassifier(n_neighbors = 11)

knn.fit(X_train, y_train)

knn.predict(X_test)

knn_score = knn.score(X_test, y_test)

In [199]:
results = pd.DataFrame([], columns = ["SCORE"])

In [200]:
results.loc["KNNeighbour"] = [knn_score]
results

Unnamed: 0,SCORE
KNNeighbour,0.822771


In [201]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()

nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)
nb_score = nb.score(X_test, y_test)

results.loc["GausNB"] = [nb_score]

results

Unnamed: 0,SCORE
KNNeighbour,0.822771
GausNB,0.215283


In [202]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression(max_iter = 100, penalty = None)

log_model.fit(X_train, y_train)

log_predict = log_model.predict(X_test)

log_score = log_model.score(X_test, y_test)

results.loc["LogisticReg"] = [log_score]
results

Unnamed: 0,SCORE
KNNeighbour,0.822771
GausNB,0.215283
LogisticReg,0.836274
