## Data Preprocessing for Loan Approval Prediction
This notebook performs feature engineering, encoding, scaling, and splitting of data to prepare it for machine learning models.

### Step 1: Import Libraries & Load Dataset

In [23]:
# Import Required Libraries
import pandas as pd
import numpy as np

# Load the cleaned dataset
df = pd.read_csv('../data/processed/loan_dataset.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,130.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Step 2: Drop Irrelevant Columns

Remove unique identifiers like `Loan_ID`.

In [24]:
df.drop(columns=['Loan_ID'], inplace=True)

### Step 3: Feature Engineering

Create new features to help model capture better patterns.

In [25]:
# Total income of applicant and co-applicant
df['Total_Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']

# Income to loan ratio
df['Income_Loan_Ratio'] = df['Total_Income'] / df['LoanAmount']

# Log transform for skewed loanAmount
df['LoanAmount_log'] = np.log(df['LoanAmount'])

# Drop original columns
df.drop(['ApplicantIncome','CoapplicantIncome','LoanAmount'], axis=1, inplace=True)

### Step 4: Encode Categorical Variables

Convert text-based categories into numerical values.

In [26]:
# Binary encoding 
df['Gender'] = df['Gender'].map({'Male':1, 'Female':0})
df['Married'] = df['Married'].map({'Yes':1, 'No':0})
df['Education'] = df['Education'].map({'Graduate':1, 'Not Graduate':0})
df['Self_Employed'] = df['Self_Employed'].map({'Yes':1, 'No':0})
df['Loan_Status'] = df['Loan_Status'].map({'Y':1, 'N':0})

# One-hot encoding
df = pd.get_dummies(df, columns=['Property_Area', 'Dependents'], drop_first=True)

### Step 5: Outlier Removal
Use the IQR method to remove extreme values.

In [28]:
Q1 = df['Total_Income'].quantile(0.25)
Q3 = df['Total_Income'].quantile(0.75)

IQR = Q3 - Q1
df = df[(df['Total_Income'] >= Q1 - 1.5 * IQR) & (df['Total_Income'] <= Q3 + 1.5 * IQR)]

In [9]:
df.shape

(564, 16)

### Step 6: Feature Scaling
Scale numerical features using StandardScaler.

In [30]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_cols = ['Total_Income', 'Income_Loan_Ratio','LoanAmount_log']
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])

### Step 7: Split Dataset
Split the data into training and testing sets.

In [39]:
from sklearn.model_selection import train_test_split

X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state= 42)

### Step 8: Save Processed Data
Save the preprocessed datasets to data/processed

In [40]:
X_train.to_csv('../data/processed/X_train.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)

In [38]:
X_train.head()

Unnamed: 0,Gender,Married,Education,Self_Employed,Loan_Amount_Term,Credit_History,Total_Income,Income_Loan_Ratio,LoanAmount_log,Property_Area_Semiurban,Property_Area_Urban,Dependents_1,Dependents_2,Dependents_3+
355,0,0,1,0,180.0,1.0,-0.806215,-0.593364,-0.11225,False,True,False,False,False
121,0,0,1,0,360.0,1.0,-0.649658,2.230229,-2.343644,True,False,False,False,False
226,1,1,0,1,360.0,1.0,-0.397305,-0.527542,0.287494,False,True,False,False,False
0,1,0,1,0,360.0,1.0,0.096757,-0.039637,0.150031,False,True,False,False,False
482,1,1,1,0,360.0,1.0,-0.176441,-0.227362,0.114343,True,False,False,False,False


In [34]:
X.head()

Unnamed: 0,Gender,Married,Education,Self_Employed,Loan_Amount_Term,Credit_History,Total_Income,Income_Loan_Ratio,LoanAmount_log,Property_Area_Semiurban,Property_Area_Urban,Dependents_1,Dependents_2,Dependents_3+
0,1,0,1,0,360.0,1.0,0.096757,-0.039637,0.150031,False,True,False,False,False
1,1,1,1,0,360.0,1.0,0.204084,0.078843,0.114343,False,False,True,False,False
2,1,1,1,1,360.0,1.0,-1.166782,-0.018521,-1.410333,False,True,False,False,False
3,1,1,0,0,360.0,1.0,-0.305944,-0.214015,-0.034214,False,True,False,False,False
4,1,0,1,0,360.0,1.0,0.163726,-0.151058,0.336998,False,True,False,False,False


In [33]:
df.head()

Unnamed: 0,Gender,Married,Education,Self_Employed,Loan_Amount_Term,Credit_History,Loan_Status,Total_Income,Income_Loan_Ratio,LoanAmount_log,Property_Area_Semiurban,Property_Area_Urban,Dependents_1,Dependents_2,Dependents_3+
0,1,0,1,0,360.0,1.0,1,0.096757,-0.039637,0.150031,False,True,False,False,False
1,1,1,1,0,360.0,1.0,0,0.204084,0.078843,0.114343,False,False,True,False,False
2,1,1,1,1,360.0,1.0,1,-1.166782,-0.018521,-1.410333,False,True,False,False,False
3,1,1,0,0,360.0,1.0,1,-0.305944,-0.214015,-0.034214,False,True,False,False,False
4,1,0,1,0,360.0,1.0,1,0.163726,-0.151058,0.336998,False,True,False,False,False
