#### **1. Import Libraries and Load Data**

In [58]:
# Core libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Preprocessing libraries
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Statistcal libraries
from scipy import stats
from scipy.stats import zscore, skew

# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [2]:
df_eda = pd.read_csv("eda_cleaned.csv")
df_eda

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849.0,0.0,128.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900.0,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106.0,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072.0,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583.0,0.0,187.0,360.0,1.0,Urban,Y


#### **2. EDA-Based Data Quality Assessment**

**Based on EDA findings, let's assess the specific issues identified**

In [44]:
# Create a copy for preprocessing
df_p = df_eda.copy()
df_p.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849.0,0.0,128.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban,Y


In [45]:
df_p = df_p.drop(columns='Loan_ID', inplace=False)

In [47]:
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education',
                    'Self_Employed', 'Loan_Amount_Term', 'Credit_History', 'Property_Area']

numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']


In [48]:
df_p['Loan_Status'] = df_p['Loan_Status'].map({'Y': 1, 'N': 0})

In [None]:
# 1. Check for missing values (EDA showed no missing values)
print("\n1. Missing Values:")
missing_values = df_p.isnull().sum()
if missing_values.sum() > 0:
    print(missing_values[missing_values > 0])
else:
    print("No missing values found (as expected from EDA)")
    

# 2. Check for duplicates
print("\n2. Duplicate Rows:")
duplicates = df_p.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
if duplicates > 0:
    print(f"Percentage of duplicates: {(duplicates/len(df_p))*100:.2f}%")
    
# 3. Check skewness for variables indentified in EDA as right-skewed
print("\n3. Skewness Analysis (EDA identified right-skewed variables):")
skewed_vars = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount"]
for var in skewed_vars:
    if var in df_p.columns:
        skewness = skew(df_p[var])
        print(f"{var}: skewness = {skewness:.3f} ({'right-skewed' if skewness > 0.5 else 'approximately normal'})")


1. Missing Values:
No missing values found (as expected from EDA)

2. Duplicate Rows:
Number of duplicate rows: 1
Percentage of duplicates: 0.16%

3. Skewness Analysis (EDA identified right-skewed variables):
ApplicantIncome: skewness = 1.037 (right-skewed)
CoapplicantIncome: skewness = 1.010 (right-skewed)
LoanAmount: skewness = 0.682 (right-skewed)


In [57]:
correlations = df_p.corr(numeric_only=True)['Loan_Status'].sort_values(key=abs, ascending=False)

corr_df = correlations.reset_index()
corr_df.columns = ['Feature', 'Correlation_with_LoanStatus']
corr_df['AbsCorrelation'] = corr_df['Correlation_with_LoanStatus'].abs()
corr_df = corr_df.sort_values('AbsCorrelation', ascending=False)

print("\nRanked Correlations with Loan_Status:")
display(corr_df)



Ranked Correlations with Loan_Status:


Unnamed: 0,Feature,Correlation_with_LoanStatus,AbsCorrelation
0,Loan_Status,1.0,1.0
1,Credit_History,0.540556,0.540556
2,LoanAmount,-0.047262,0.047262
3,Loan_Amount_Term,-0.022549,0.022549
4,CoapplicantIncome,0.011983,0.011983
5,ApplicantIncome,-0.000442,0.000442


In [64]:
#  Copy main DataFrame

df_encoded = df_p.copy()

# Identify categorical columns
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education',
                    'Self_Employed', 'Property_Area', 'Loan_Amount_Term']

# Credit_History stays numeric since it's already 0/1 categorical

# Encode categorical columns

le = LabelEncoder()
for col in categorical_cols:
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))

# Confirm all columns are now numeric
print("\nEncoded Data Types:\n", df_encoded.dtypes)

# Correlation analysis
print("\nCorrelation with Loan_Status (EDA Evidence):")

# Drop Loan_Status AFTER correlation extraction, not before
correlations = df_encoded.corr(numeric_only=True)['Loan_Status'].sort_values(key=abs, ascending=False)

# Separate high- and low-signal features
print("\nHigh-signal features (|correlation| > 0.2):")
high_signal = correlations[abs(correlations) > 0.2].drop('Loan_Status', errors='ignore')
for feature, corr in high_signal.items():
    print(f"{feature}: {corr:.3f}")

print("\nLow-signal features (|correlation| < 0.1):")
low_signal = correlations[abs(correlations) < 0.1]
for feature, corr in low_signal.items():
    print(f"{feature}: {corr:.3f}")


corr_df = (
    correlations.drop('Loan_Status', errors='ignore')
    .reset_index()
    .rename(columns={'index': 'Feature', 'Loan_Status': 'Correlation_with_LoanStatus'})
)
corr_df['AbsCorrelation'] = corr_df['Correlation_with_LoanStatus'].abs()
corr_df = corr_df.sort_values('AbsCorrelation', ascending=False)

print("\nRanked Correlations:")
display(corr_df)


Encoded Data Types:
 Gender                 int32
Married                int32
Dependents             int32
Education              int32
Self_Employed          int32
ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term       int32
Credit_History       float64
Property_Area          int32
Loan_Status            int64
dtype: object

Correlation with Loan_Status (EDA Evidence):

High-signal features (|correlation| > 0.2):
Credit_History: 0.541

Low-signal features (|correlation| < 0.1):
Married: 0.091
Education: -0.086
LoanAmount: -0.047
Property_Area: 0.032
Gender: 0.018
CoapplicantIncome: 0.012
Dependents: 0.010
Loan_Amount_Term: -0.004
Self_Employed: -0.004
ApplicantIncome: -0.000

Ranked Correlations:


Unnamed: 0,Feature,Correlation_with_LoanStatus,AbsCorrelation
0,Credit_History,0.540556,0.540556
1,Married,0.091478,0.091478
2,Education,-0.085884,0.085884
3,LoanAmount,-0.047262,0.047262
4,Property_Area,0.032112,0.032112
5,Gender,0.017987,0.017987
6,CoapplicantIncome,0.011983,0.011983
7,Dependents,0.010118,0.010118
8,Loan_Amount_Term,-0.004123,0.004123
9,Self_Employed,-0.0037,0.0037


#### **Feature Engineering**

Implement the specific feature engineering recommendations from the EDA.

In [72]:
# Combine incomes
df_encoded['TotalIncome'] = df_encoded['ApplicantIncome'] + df_encoded['CoapplicantIncome']

# Ratio of loan amount to total income
df_encoded['Loan_to_Income_Ratio'] = df_encoded['LoanAmount'] / df_encoded['TotalIncome']

# Income per term
df_encoded['Income_per_Term'] = df_encoded['TotalIncome'] / df_encoded['Loan_Amount_Term']


In [74]:
print("\nNew Feature Summary:")
print(df_encoded[['TotalIncome', 'Loan_to_Income_Ratio', 'Income_per_Term']].describe())


New Feature Summary:
        TotalIncome  Loan_to_Income_Ratio  Income_per_Term
count    614.000000            614.000000       614.000000
mean    6036.813795              0.024192              inf
std     2635.181350              0.008216              NaN
min     1442.000000              0.003785       240.333333
25%     4166.000000              0.019676       715.666667
50%     5416.500000              0.024417       942.916667
75%     7452.500000              0.027875      1411.171875
max    15914.375000              0.082712              inf


In [76]:
# Replace inf and NaN with median of valid values
df_encoded['Income_per_Term'].replace([np.inf, -np.inf], np.nan, inplace=True)
median_income_term = df_encoded['Income_per_Term'].median()
df_encoded['Income_per_Term'].fillna(median_income_term, inplace=True)


In [77]:
print(df_encoded['Income_per_Term'].describe())


count     614.000000
mean     1189.035213
std       841.597143
min       240.333333
25%       715.666667
50%       941.500000
75%      1393.265625
max      7010.625000
Name: Income_per_Term, dtype: float64


In [78]:
print("\nNew Feature Summary:")
print(df_encoded[['TotalIncome', 'Loan_to_Income_Ratio', 'Income_per_Term']].describe())


New Feature Summary:
        TotalIncome  Loan_to_Income_Ratio  Income_per_Term
count    614.000000            614.000000       614.000000
mean    6036.813795              0.024192      1189.035213
std     2635.181350              0.008216       841.597143
min     1442.000000              0.003785       240.333333
25%     4166.000000              0.019676       715.666667
50%     5416.500000              0.024417       941.500000
75%     7452.500000              0.027875      1393.265625
max    15914.375000              0.082712      7010.625000


In [79]:
corrs = df_encoded.corr(numeric_only=True)['Loan_Status'].sort_values(key=abs, ascending=False)
print(corrs.head(10))


Loan_Status             1.000000
Credit_History          0.540556
Loan_to_Income_Ratio   -0.112336
Married                 0.091478
Education              -0.085884
LoanAmount             -0.047262
Property_Area           0.032112
Gender                  0.017987
Income_per_Term         0.013236
CoapplicantIncome       0.011983
Name: Loan_Status, dtype: float64


In [85]:
print("LOG-TRANSFORMING SKEWED VARIABLES")

# Variables identified in EDA
skewed_vars = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

for var in skewed_vars:
    if var in df_encoded.columns:
        min_val = df_encoded[var].min()
        if min_val <= 0:
            df_encoded[f'{var}_log'] = np.log1p(df_encoded[var])  # log(1 + x)
            method = "log1p"
        else:
            df_encoded[f'{var}_log'] = np.log(df_encoded[var])
            method = "log"
        
        original_skew = skew(df_encoded[var])
        transformed_skew = skew(df_encoded[f'{var}_log'])
        print(f"✓ {var}: Applied {method} transformation")
        print(f"  Original skew: {original_skew:.3f} → Transformed skew: {transformed_skew:.3f}")

print(f"\nDataset shape after log transformation: {df_encoded.shape}")
print("New log-transformed columns:", [col for col in df_encoded.columns if '_log' in col])


LOG-TRANSFORMING SKEWED VARIABLES
✓ ApplicantIncome: Applied log transformation
  Original skew: 1.037 → Transformed skew: -0.594
✓ CoapplicantIncome: Applied log1p transformation
  Original skew: 1.010 → Transformed skew: -0.184
✓ LoanAmount: Applied log transformation
  Original skew: 0.682 → Transformed skew: -0.937

Dataset shape after log transformation: (614, 18)
New log-transformed columns: ['ApplicantIncome_log', 'CoapplicantIncome_log', 'LoanAmount_log']


In [84]:
df_encoded.shape

(614, 18)