In [7]:
import pandas as pd
import numpy as np
# Load the loan-application records from the CSV file sitting next to the
# notebook, then print the per-column summary (non-null counts and dtypes).
df = pd.read_csv(
    filepath_or_buffer="loan_approved.csv",
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Loan_ID                 614 non-null    object 
 1   Gender                  601 non-null    object 
 2   Married                 611 non-null    object 
 3   Dependents              599 non-null    object 
 4   Education               614 non-null    object 
 5   Self_Employed           582 non-null    object 
 6   ApplicantIncome         614 non-null    int64  
 7   CoapplicantIncome       614 non-null    float64
 8   LoanAmount              592 non-null    float64
 9   Loan_Amount_Term        600 non-null    float64
 10  Credit_History          564 non-null    float64
 11  Property_Area           614 non-null    object 
 12  Loan_Status (Approved)  614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


### Loan_ID – Nominal 
because it is just an identifier with no order
### Gender – Nominal 
because Male/Female are labels with no ranking
### Married – Nominal 
because Yes/No categories have no order
### Dependents – Ordinal 
because 0 < 1 < 2 < 3+ gives a natural order
### Education – Nominal 
because Graduate/Not Graduate are labels, not ordered
### Self_Employed – Nominal 
because Yes/No categories have no ranking
### ApplicantIncome – Numerical 
continuous numeric data, not categorical
### CoapplicantIncome – Numerical 
continuous numeric data, not categorical
### LoanAmount – Numerical 
measurable amount, not categorical
### Loan_Amount_Term – Numerical 
loan duration, a measurable quantity (the unit – days or months – is not stated in the data)
### Credit_History – Ordinal 
stored as the numbers 0.0 / 1.0, but it is a coded category rather than a measurement: 0 = bad < 1 = good, so order exists
### Property_Area – Nominal 
Urban/Rural/Semiurban have no natural ranking
### Loan_Status (Approved) – Nominal 
Y/N (approved / not approved) categories, no order

In [59]:
# Cardinality overview of every column.
# Each entry is (label printed in front of the count, column name, whether the
# distinct values themselves are printed as well). The trailing comment records
# the measurement level assigned to the column.
column_report = [
    ("Loan_ID :", "Loan_ID", False),                # nominal - categorical
    ("Gender:", "Gender", True),                    # nominal - categorical
    ("MArried :", "Married", True),                 # nominal - categorical
    # Ordinal - categorical: not numerical, because the column holds the
    # label '3+' rather than the exact counts 3, 4, 5, ...
    ("Dependents :", "Dependents", True),
    ("Education :", "Education", True),             # nominal - categorical
    ("Self_Employed :", "Self_Employed", True),     # nominal - categorical
    ("ApplicantIncome :", "ApplicantIncome", False),        # continuous - numerical
    ("CoapplicantIncome :", "CoapplicantIncome", False),    # continuous - numerical
    ("LoanAmount :", "LoanAmount", False),                  # continuous - numerical
    ("Loan_Amount_Term :", "Loan_Amount_Term", False),      # continuous - numerical
    ("Credit_History :", "Credit_History", False),          # discrete - numerical
    ("Property_Area :", "Property_Area", False),            # nominal - categorical
    ("Loan_Status (Approved):", "Loan_Status (Approved)", False),  # nominal - categorical
]

print("Shape", df.shape)
for label, column, show_values in column_report:
    # unique() keeps NaN as its own entry, so columns with missing values
    # report one more than their number of real categories.
    distinct_values = df[column].unique()
    print(label, len(distinct_values))
    if show_values:
        print(distinct_values)


Shape (614, 13)
Loan_ID : 614
Gender: 3
['Male' 'Female' nan]
MArried : 3
['No' 'Yes' nan]
Dependents : 5
['0' '1' '2' '3+' nan]
Education : 2
['Graduate' 'Not Graduate']
Self_Employed : 3
['No' 'Yes' nan]
ApplicantIncome : 505
CoapplicantIncome : 287
LoanAmount : 204
Loan_Amount_Term : 11
Credit_History : 3
Property_Area : 3
Loan_Status (Approved): 2


In [60]:
## Missing values
# Categorical-style columns are filled with their most frequent value (mode);
# numeric columns are filled with their median. df is updated column by column.

print("Missing values before cleaning :")
print(df.isna().sum())

cat_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
num_cols = ['LoanAmount', 'Loan_Amount_Term']

# Work out every replacement value first; each one depends only on its own
# column, so computing them up front gives the same result as filling one by one.
fill_values = {name: df[name].mode()[0] for name in cat_cols}
fill_values.update((name, df[name].median()) for name in num_cols)

for col, replacement in fill_values.items():
    df[col] = df[col].fillna(replacement)

print("Missing values after cleaning :")
print(df.isna().sum())

# Last expression of the cell: show the first rows of the cleaned frame.
df.head()

Missing values before cleaning :
Loan_ID                    0
Gender                    13
Married                    3
Dependents                15
Education                  0
Self_Employed             32
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                22
Loan_Amount_Term          14
Credit_History            50
Property_Area              0
Loan_Status (Approved)     0
dtype: int64
Missing values after cleaning :
Loan_ID                   0
Gender                    0
Married                   0
Dependents                0
Education                 0
Self_Employed             0
ApplicantIncome           0
CoapplicantIncome         0
LoanAmount                0
Loan_Amount_Term          0
Credit_History            0
Property_Area             0
Loan_Status (Approved)    0
dtype: int64


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status (Approved)
0,LP001002,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
