In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


In [2]:
# Load the loan dataset from a CSV file in the current working directory.
df = pd.read_csv("./loan_dataset.csv")

In [3]:
# Preview a few random rows; the fixed seed keeps the preview identical across
# re-runs of the notebook.
df.sample(7, random_state=42)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
455,LP002455,Male,Yes,2,Graduate,No,3859,0.0,96.0,360.0,1.0,Semiurban,Y
475,LP002527,Male,Yes,2,Graduate,Yes,16525,1014.0,150.0,360.0,1.0,Rural,Y
575,LP002868,Male,Yes,2,Graduate,No,3159,461.0,108.0,84.0,1.0,Urban,Y
339,LP002114,Female,No,0,Graduate,No,4160,0.0,71.0,360.0,1.0,Semiurban,Y
53,LP001179,Male,Yes,2,Graduate,No,4616,0.0,134.0,360.0,1.0,Urban,N
193,LP001658,Male,No,0,Graduate,No,3858,0.0,76.0,360.0,1.0,Semiurban,Y
238,LP001790,Female,No,1,Graduate,No,3812,0.0,112.0,360.0,1.0,Rural,Y


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(614, 13)

In [5]:
# Column labels of the loaded frame.
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [6]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


## Handling Duplicate and NULL Values

In [7]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [8]:
# Number of missing values in each column.
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

##### Function to Handle NULL values

In [9]:
def fillNullValues(df, feature):
    """Impute the missing values of ``df[feature]``, modifying ``df``.

    Text-like columns (object or string dtype) are filled with their most
    frequent value; every other column is filled with its median. The chosen
    fill value is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the imputed column is assigned back onto it.
    feature : str
        Name of the column to impute.

    Returns
    -------
    None
    """
    column = df[feature]

    # A column without any observed value has no mode (``mode()[0]`` would
    # raise) and a NaN median, so there is nothing to impute with.
    if not column.notna().any():
        print(f"{feature} has no non-null values; nothing to impute.")
        return

    dt = column.dtype
    if pd.api.types.is_object_dtype(dt) or pd.api.types.is_string_dtype(dt):
        fill_value = column.mode()[0]
        print(f"Mode of {feature} is {fill_value}.")
    else:
        fill_value = column.median()
        print(f"Median of {feature} is {fill_value}.")

    # Assign the result back rather than calling
    # ``df[feature].fillna(..., inplace=True)``: that form mutates an
    # intermediate Series and does not update ``df`` under pandas
    # Copy-on-Write.
    df[feature] = column.fillna(fill_value)

###### Loop over all columns and impute those that contain null values

In [10]:
# Keep the column labels so each column can be checked for missing values.
cols = df.columns
cols

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [11]:
# Impute each column that has at least one missing entry.
for feature in cols:
    if df[feature].isnull().any():
        fillNullValues(df, feature)

Mode of Gender is Male.
Mode of Married is Yes.
Mode of Dependents is 0.
Mode of Self_Employed is No.
Median of LoanAmount is 128.0.
Median of Loan_Amount_Term is 360.0.
Median of Credit_History is 1.0.


In [12]:
# Re-count missing values per column to confirm the imputation left none.
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### Handling Features Data Types

In [13]:
# Show every column's dtype, then the distinct values of selected
# categorical columns.
print("Data Types:\n", df.dtypes)
for column in ('Gender', 'Married', 'Dependents', 'Self_Employed'):
    print(f"\nUnique Values in '{column}':\n", df[column].unique())

Data Types:
 Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

Unique Values in 'Gender':
 ['Male' 'Female']

Unique Values in 'Married':
 ['No' 'Yes']

Unique Values in 'Dependents':
 ['0' '1' '2' '3+']

Unique Values in 'Self_Employed':
 ['No' 'Yes']


In [14]:
# Replace the '3+' category with 3 in the Dependents feature so the column can be cast to int

In [15]:
# Map the open-ended '3+' category to the string '3'. Using a string (not the
# int 3) keeps the column homogeneous until it is cast to an integer dtype.
df['Dependents'] = df['Dependents'].replace('3+', '3')

In [16]:
# Cast Dependents to int64 so it is stored as a numeric column.
df['Dependents'] = df['Dependents'].astype('int64')

In [17]:
# Confirm the dtype of Dependents after the cast.
df['Dependents'].dtype

dtype('int64')

### Encoding categorical variables

In [18]:
# LabelEncoder converts category labels into integer codes.
le = LabelEncoder()

In [19]:
# Integer-encode each categorical column with its own LabelEncoder. Keeping
# one fitted encoder per column preserves every label mapping, so codes can be
# decoded later with encoders[col].inverse_transform(...); re-fitting a single
# shared encoder would retain only the mapping of the last column.
categorical_cols = ['Gender', 'Married', 'Education', 'Self_Employed',
                    'Property_Area', 'Loan_Status']
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Point `le` at the Loan_Status encoder so any code that uses `le` sees an
# encoder fitted on the target column.
le = encoders['Loan_Status']

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


### Feature Engineering

In [20]:
# Combine applicant and co-applicant income into one feature, then preview
# the first rows of the frame.
df['Total_Income'] = df['ApplicantIncome'].add(df['CoapplicantIncome'])
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,LP001002,1,0,0,0,0,5849,0.0,128.0,360.0,1.0,2,1,5849.0
1,LP001003,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0,6091.0
2,LP001005,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1,3000.0
3,LP001006,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1,4941.0
4,LP001008,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1,6000.0


### Selecting features for clustering

In [21]:
# Columns fed to the clustering step.
features = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area',
    'Total_Income',
]

In [22]:
# Feature matrix restricted to the selected columns.
X = df.loc[:, features]

# Standardise every feature: fit the scaler on X, then transform X with it.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [23]:
# Partition the scaled feature matrix into two clusters. n_init is passed
# explicitly because scikit-learn changed its default (10 -> 'auto' in 1.4);
# relying on the default makes the cluster assignments version-dependent.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(X_scaled)



In [24]:
# Distinct cluster labels assigned to the rows.
df['Cluster'].unique()

array([1, 0])

### Evaluating clusters

In [26]:
# Mean silhouette coefficient of the cluster labels on the scaled features
# (ranges from -1 to 1; higher means better-separated clusters).
silhouette_avg = silhouette_score(X_scaled, df['Cluster'])
print(f'Silhouette Score: {silhouette_avg}')

Silhouette Score: 0.15851294484059494


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


## Predict Loan Approval

In [27]:
# Mean of the integer-encoded Loan_Status within each cluster.
cluster_loan_status = df.groupby('Cluster')['Loan_Status'].mean()

# Attach to every row the mean Loan_Status of the cluster it belongs to.
df['Cluster_Prediction'] = df['Cluster'].map(cluster_loan_status)

# Predict 1 when the cluster mean exceeds 0.5, otherwise 0. The vectorised
# comparison replaces a row-wise apply(lambda ...) and yields the same int64
# column.
# NOTE: the cluster means are computed from the same rows they are applied to,
# so comparing these predictions with Loan_Status is an in-sample check only.
df['Predicted_Loan_Status'] = (df['Cluster_Prediction'] > 0.5).astype('int64')

In [28]:
# Compare actual and cluster-derived labels for the first rows. A bare
# expression lets Jupyter render the frame with its rich table display
# instead of plain printed text.
df[['Loan_ID', 'Loan_Status', 'Predicted_Loan_Status']].head()

    Loan_ID  Loan_Status  Predicted_Loan_Status
0  LP001002            1                      1
1  LP001003            0                      1
2  LP001005            1                      1
3  LP001006            1                      1
4  LP001008            1                      1
