### Import Required Libraries

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler 
from sklearn.feature_selection import SelectKBest, f_classif 
from sklearn.decomposition import PCA 
from imblearn.over_sampling import SMOTE

### Load Dataset

In [2]:
# Toy loan-application records, stored column-wise: each key is a column
# name and each list holds that column's values for the ten applicants.
data = {
    "Applicant_ID": list(range(201, 211)),
    "Income": [
        50000, 60000, 35000, 80000, 45000,
        70000, 30000, 90000, 40000, 65000,
    ],
    "Credit_Score": [
        720, 680, 600, 750, 650,
        700, 580, 780, 620, 710,
    ],
    "Employment_Status": [
        "Employed", "Self-Employed", "Employed", "Employed", "Unemployed",
        "Self-Employed", "Employed", "Employed", "Unemployed", "Self-Employed",
    ],
    "Loan_Amount": [
        20000, 25000, 15000, 30000, 18000,
        28000, 12000, 35000, 16000, 27000,
    ],
    "Loan_Approved": [
        "Yes", "Yes", "No", "Yes", "No",
        "Yes", "No", "Yes", "No", "Yes",
    ],
}

# Assemble the columns into a DataFrame (dict keys become column labels)
df = pd.DataFrame.from_dict(data)

# Print the whole table -- it is only ten rows long
print(df)

   Applicant_ID  Income  Credit_Score Employment_Status  Loan_Amount  \
0           201   50000           720          Employed        20000   
1           202   60000           680     Self-Employed        25000   
2           203   35000           600          Employed        15000   
3           204   80000           750          Employed        30000   
4           205   45000           650        Unemployed        18000   
5           206   70000           700     Self-Employed        28000   
6           207   30000           580          Employed        12000   
7           208   90000           780          Employed        35000   
8           209   40000           620        Unemployed        16000   
9           210   65000           710     Self-Employed        27000   

  Loan_Approved  
0           Yes  
1           Yes  
2            No  
3           Yes  
4            No  
5           Yes  
6            No  
7           Yes  
8            No  
9           Yes  


### Encoding Categorical Data

In [3]:
# Target variable: turn the "Yes"/"No" strings into integer codes.
# LabelEncoder numbers the classes in sorted order, so "No" -> 0 and "Yes" -> 1.
label_encoder = LabelEncoder()
approved_codes = label_encoder.fit_transform(df["Loan_Approved"])
df["Loan_Approved"] = approved_codes

# Employment_Status: expand into dummy (indicator) columns. drop_first=True
# omits the first category's column, since it is implied when the others are 0.
categorical_columns = ["Employment_Status"]
df = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)

### Feature Selection

In [4]:
# Split the frame into features and target.
# Applicant_ID is excluded along with the target: it is only a row identifier,
# so any association it shows with Loan_Approved is an artefact of row order
# and it should not compete for one of the selected feature slots.
X = df.drop(columns=["Loan_Approved", "Applicant_ID"])  # Features
y = df["Loan_Approved"]  # Target

# Keep the 3 features with the highest ANOVA F-score against the target
selector = SelectKBest(score_func=f_classif, k=3)
X_new = selector.fit_transform(X, y)
print(X_new.shape)

(10, 3)


### Feature Extraction (PCA) 

In [5]:
# PCA finds the directions of largest variance, so it is sensitive to the
# units of each column: the candidate features here range from 0/1 dummy
# columns to Income in the tens of thousands, and the largest-magnitude column
# would dominate the components. Standardize to zero mean / unit variance first.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_new)

pca = PCA(n_components=2)  # Reduce to 2 components
X_pca = pca.fit_transform(X_scaled)
print(X_pca.shape)

(10, 2)


### Handling Imbalanced Data 

In [7]:
print("Before SMOTE:", y.value_counts())

# SMOTE synthesizes minority-class rows by random interpolation between a
# sample and one of its nearest minority neighbours; random_state pins that
# randomness so a fresh Restart & Run All yields the same resampled data.
# k_neighbors=3 is the largest usable value here: the minority class has only
# 4 rows, so each row has at most 3 same-class neighbours.
smote = SMOTE(sampling_strategy='auto', k_neighbors=3, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_pca, y)

print("After SMOTE:", y_resampled.value_counts())

Before SMOTE: Loan_Approved
1    6
0    4
Name: count, dtype: int64
After SMOTE: Loan_Approved
1    6
0    6
Name: count, dtype: int64
