In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import boxcox

In [2]:
# Load the loan dataset from the working directory and preview the first rows.
df = pd.read_csv('Loan_null.csv')
df.head(n=5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849.0,0.0,128.0,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# Replace the numeric loan term with a three-level category.
# pd.cut uses right-closed intervals by default, so the buckets are
# (0, 84] -> Short, (84, 240] -> Medium, (240, inf) -> Long.
# NOTE: this overwrites the numeric column, so the cell cannot be re-run
# without reloading the data first.
term_bin_edges = [0, 84, 240, float('inf')]
term_bin_labels = ['Short', 'Medium', 'Long']
df['Loan_Amount_Term'] = pd.cut(
    df['Loan_Amount_Term'], bins=term_bin_edges, labels=term_bin_labels
)

In [4]:
# Show how many rows fall into each loan-term bucket, most frequent first.
df['Loan_Amount_Term'].value_counts(sort=True, ascending=False)


Long      554
Medium     51
Short       9
Name: Loan_Amount_Term, dtype: int64

In [5]:
## Build a combined income column and the loan-amount-to-income ratio,
## then remove the two source income columns.
total_income = df['ApplicantIncome'] + df['CoapplicantIncome']
df['Total_Income'] = total_income
df['Debt_to_Income_Ratio'] = df['LoanAmount'] / total_income
df.drop(columns=['ApplicantIncome', 'CoapplicantIncome'], inplace=True)

In [6]:
# Separate the categorical and numerical feature names.
# NOTE(review): 'Loan_Amount_Term' is listed as numerical here, yet an earlier
# cell bins it into categories and the encoding cell label-encodes it --
# confirm which group it should belong to.
categorical_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Credit_History',
]
numerical_cols = [
    'LoanAmount',
    'Loan_Amount_Term',
]

In [7]:
## Integer-encode every categorical column, keeping one fitted encoder per
## column in `label_encoders` so the mapping can be looked up later.
# NOTE(review): LabelEncoder assigns codes in sorted class order, so the
# binned 'Loan_Amount_Term' presumably becomes Long=0, Medium=1, Short=2
# rather than following Short < Medium < Long -- confirm this is acceptable.
columns_to_encode = categorical_cols + [
    'Education',
    'Property_Area',
    'Loan_Status',
    'Loan_Amount_Term',
]
label_encoders = {column: LabelEncoder() for column in columns_to_encode}
for column, encoder in label_encoders.items():
    df[column] = encoder.fit_transform(df[column])

In [8]:
# Spot-check ten random rows of the encoded frame. The fixed seed makes the
# preview reproducible: without it, every Restart & Run All shows different
# rows, so the saved output cannot be regenerated or compared.
df.sample(10, random_state=42)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income,Debt_to_Income_Ratio
456,1,1,0,0,0,118.0,0,1,2,1,4301.0,0.027435
598,1,1,0,0,1,180.0,0,1,0,1,9963.0,0.018067
408,1,1,1,0,0,152.0,0,0,1,0,8300.0,0.018313
453,1,1,0,0,0,90.0,1,0,0,1,4949.0,0.018185
67,1,1,1,0,0,261.5,0,1,2,1,10171.25,0.02571
498,1,1,1,0,1,95.0,0,1,1,1,2895.0,0.032815
384,1,1,2,0,0,100.0,0,1,2,0,4566.0,0.021901
314,1,1,0,0,0,159.0,0,1,0,0,4316.0,0.03684
106,1,1,2,0,0,225.0,0,1,2,1,11297.25,0.019916
427,1,1,2,0,0,133.0,0,1,1,1,5935.0,0.022409


In [9]:
# Print the skewness of the three continuous features on a single line,
# in the order LoanAmount, Total_Income, Debt_to_Income_Ratio.
skew_columns = ['LoanAmount', 'Total_Income', 'Debt_to_Income_Ratio']
print(*(df[name].skew() for name in skew_columns))

0.6836926591947383 0.9821895816980065 1.533601409651107


In [10]:
# Log-transform the skewed continuous features.
# log1p computes log(1 + x), so zero values stay finite.
numerical_features = ['LoanAmount', 'Total_Income', 'Debt_to_Income_Ratio']
df[numerical_features] = np.log1p(df[numerical_features])

In [11]:
# Re-check the skewness of the same three features after the log transform,
# printed on one line in the order LoanAmount, Total_Income,
# Debt_to_Income_Ratio.
skew_columns = ['LoanAmount', 'Total_Income', 'Debt_to_Income_Ratio']
print(*(df[name].skew() for name in skew_columns))

-0.8963858941078962 0.0002885053318540614 1.4395179194506333


In [12]:
# Apply a Box-Cox transformation to further reduce the skewness.
numerical_features = ['LoanAmount', 'Total_Income', 'Debt_to_Income_Ratio']

# Keep the lambda that boxcox fits for each feature. Without it the exact
# same transformation cannot be re-applied to unseen data (e.g. via
# scipy.stats.boxcox(x, lmbda=...)) or inverted later. Storing it in a dict
# also avoids binding the throwaway name `_`, which IPython uses for the
# last cell output.
boxcox_lambdas = {}

for feature in numerical_features:
    # boxcox requires strictly positive input. Shifting by 1 makes zeros
    # valid; it does NOT rescue values <= -1, for which boxcox still raises
    # ValueError.
    df[feature], boxcox_lambdas[feature] = boxcox(df[feature] + 1)

# Checking skewness after transformation
print(df['LoanAmount'].skew(),
      df['Total_Income'].skew(),
      df['Debt_to_Income_Ratio'].skew())

0.028750279762134914 -0.004765653628284856 -0.13243942100362394


In [16]:
# Write the preprocessed frame to disk for the training step.
# index=False keeps the row index out of the CSV.
output_path = 'loan_train.csv'
df.to_csv(output_path, index=False)