Dataset for loan prediction is given
*******************************************
1. Load the dataset in Python environment  

2. Perform different preprocessing steps

  a. Missing value handling
  
  b. Outlier detection
  
  c. Encoding
  
  d. Scaling
  
3. Give detailed description for each step


# ***Import Libraries***

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder#to import scaling features for pre-processing

# ***1. Load the dataset***

In [55]:
loan_data=pd.read_csv('/content/train_loan (1).csv')#load the dataset using pandas

In [56]:
loan_data.head()#gives the first n-rows of the dataset

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [57]:
print(loan_data.shape)

(614, 13)


In [35]:
loan_data.info()#gives the detailed summary of the dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [58]:
loan_data.describe()#gives the statistics related to the given dataframe for analysis

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


# ***3. Perform different Pre-processing techniques***
## ***a. Missing value handling***

In [59]:
print("\nMissing values before handling")
loan_data.isnull().sum()#displays the missing values in the given dataframe


Missing values before handling


Unnamed: 0,0
Loan_ID,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14


In [60]:
# Missing values are handled using Imputation technique and specifically==>Mean/median imputation
#================================================================================================
#1. Separate numeric and categorical columns
num_cols = loan_data.select_dtypes(include=['int64', 'float64']).columns
cat_cols = loan_data.select_dtypes(include='object').columns
print(num_cols)
print(cat_cols)


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')


In [61]:
print(loan_data.index)  # See what's in the index
print(loan_data.head()) # Preview rows

RangeIndex(start=0, stop=614, step=1)
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Ru

In [62]:
#2. Fill numeric columns with median and mode
loan_data['LoanAmount'].fillna(loan_data['LoanAmount'].median(), inplace=True)#choose median since its insensitive to skewness
loan_data['Loan_Amount_Term'].fillna(loan_data['Loan_Amount_Term'].mode()[0], inplace=True)#discrete type data==>use mode
loan_data['Credit_History'].fillna(loan_data['Credit_History'].mode()[0], inplace=True)#binary===>use mode

#3. Fill categorical columns with mode
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed']:
    loan_data[col].fillna(loan_data[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  loan_data['LoanAmount'].fillna(loan_data['LoanAmount'].median(), inplace=True)#choose median since its insensitive to skewness
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  loan_data['Loan_Amount_Term'].fillna(loan_data['Loan_Amount_Term'].mode()[0], inplace=True)#discrete type

In [63]:
print("\nMissing Values After Handling:\n", loan_data.isnull().sum())


Missing Values After Handling:
 Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [64]:
print(loan_data.head())

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0       128.0             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

## ***b. Outlier detection***

In [65]:
#Outlier Detection and Capping using IQR

for col in loan_data.select_dtypes(include=['float64', 'int64']).columns:#categorical columns are accessed
  Q1 = loan_data[col].quantile(0.25)#gets the quartile Q1
  Q3 = loan_data[col].quantile(0.75)#gets the Q3 quartile
  IQR = Q3 - Q1#Finds the Inter-Quartile Range
  lower = Q1 - 1.5 * IQR#calculate lower fence
  upper = Q3 + 1.5 * IQR#calculate upper fence
  # print(f"Lower fence is {lower}")
  # print(f"Upper fence is {upper}")
  outliers = loan_data[(loan_data[col] < lower) | (loan_data[col] > upper)][col]#identifies the ouliers in each column based on the given condition
  print(f"{col}: {len(outliers)} outliers found")#displays the outliers in numerical columns

  #Capping outliers
  # outliers = ((loan_data[col] < lower) | (loan_data[col] > upper)).sum()
  # rate = round(outliers / len(loan_data) * 100, 2)
  # print(f"{col}: {rate}% outliers capped.")
  # loan_data[col] = np.where(loan_data[col] < lower, lower,np.where(loan_data[col] > upper, upper, loan_data[col]))
  # print("Outliers capped.")


ApplicantIncome: 50 outliers found
CoapplicantIncome: 18 outliers found
LoanAmount: 41 outliers found
Loan_Amount_Term: 88 outliers found
Credit_History: 89 outliers found


## ***c. Encoding***

In [66]:
# Initialize the LabelEncoder
# Using Label Encoding for categorical variables
# ================================================
label_encoder = LabelEncoder()

In [78]:
#Special Encoding for dependents
loan_data['Dependents'] = loan_data['Dependents'].replace('3+', 3).astype(int)
for col in ['Loan_ID', 'Gender', 'Married', 'Education','Self_Employed', 'Property_Area', 'Loan_Status']:
  loan_data[col] = label_encoder.fit_transform(loan_data[col])

In [79]:
# Display the data after encoding
print("\nData After Encoding:")
print(loan_data.head())


Data After Encoding:
   Loan_ID  Gender  Married  Dependents  Education  Self_Employed  \
0        0       1        0           0          0              0   
1        1       1        1           1          0              0   
2        2       1        1           0          0              1   
3        3       1        1           0          1              0   
4        4       1        0           0          0              0   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0         0.072991          -0.554487   -0.211241          0.273231   
1        -0.134412          -0.038732   -0.211241          0.273231   
2        -0.393747          -0.554487   -0.948996          0.273231   
3        -0.462062           0.251980   -0.306435          0.273231   
4         0.097728          -0.554487   -0.056551          0.273231   

   Credit_History  Property_Area  Loan_Status  
0        0.411733              2            1  
1        0.411733              0        

## ***d. Scaling***

In [80]:
# Scaling numerical features using standardization technique
scaler = StandardScaler()
loan_data[num_cols] = scaler.fit_transform(loan_data[num_cols])

# Display the final preprocessed data
print("\nFinal Preprocessed Data:")
print(loan_data.head())



Final Preprocessed Data:
   Loan_ID  Gender  Married  Dependents  Education  Self_Employed  \
0        0       1        0           0          0              0   
1        1       1        1           1          0              0   
2        2       1        1           0          0              1   
3        3       1        1           0          1              0   
4        4       1        0           0          0              0   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0         0.072991          -0.554487   -0.211241          0.273231   
1        -0.134412          -0.038732   -0.211241          0.273231   
2        -0.393747          -0.554487   -0.948996          0.273231   
3        -0.462062           0.251980   -0.306435          0.273231   
4         0.097728          -0.554487   -0.056551          0.273231   

   Credit_History  Property_Area  Loan_Status  
0        0.411733              2            1  
1        0.411733              0    

In [81]:
# Exporting the Cleaned Data
# Save the cleaned data to a CSV file
loan_data.to_csv("cleaned_data.csv", index=False)
print("\nCleaned data has been saved to 'cleaned_data.csv'.")



Cleaned data has been saved to 'cleaned_data.csv'.


In [84]:
loan_data[num_cols].mean()
loan_data[num_cols].std()

Unnamed: 0,0
ApplicantIncome,1.000815
CoapplicantIncome,1.000815
LoanAmount,1.000815
Loan_Amount_Term,1.000815
Credit_History,1.000815
