### Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

### Sample loan dataset

In [2]:
data = {
    'Customer_ID': range(1, 16),
    'Income': [50000, 60000, 35000, 80000, 120000, 40000, 70000, 90000, 30000, 45000, 100000, 75000, 65000, 55000, 85000],
    'Employment_Type': ['Salaried', 'Salaried', 'Self-Employed', 'Salaried', 'Salaried',
                        'Self-Employed', 'Salaried', 'Salaried', 'Self-Employed', 'Salaried',
                        'Salaried', 'Self-Employed', 'Salaried', 'Salaried', 'Salaried'],
    'Loan_Amount': [200000, 250000, 100000, 300000, 400000, 120000, 280000, 350000, 90000, 150000, 370000, 220000, 260000, 210000, 330000],
    'Credit_History': [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1],
    'Marital_Status': ['Married', 'Single', 'Single', 'Married', 'Married',
                       'Single', 'Married', 'Single', 'Single', 'Married',
                       'Married', 'Single', 'Married', 'Single', 'Married'],
    'Loan_Default': ['No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'No']
}

df = pd.DataFrame(data)

# Introduce some missing values
df.loc[2, 'Income'] = np.nan
df.loc[5, 'Marital_Status'] = np.nan

df

Unnamed: 0,Customer_ID,Income,Employment_Type,Loan_Amount,Credit_History,Marital_Status,Loan_Default
0,1,50000.0,Salaried,200000,1,Married,No
1,2,60000.0,Salaried,250000,1,Single,No
2,3,,Self-Employed,100000,0,Single,Yes
3,4,80000.0,Salaried,300000,1,Married,No
4,5,120000.0,Salaried,400000,1,Married,No
5,6,40000.0,Self-Employed,120000,0,,Yes
6,7,70000.0,Salaried,280000,1,Married,No
7,8,90000.0,Salaried,350000,1,Single,No
8,9,30000.0,Self-Employed,90000,0,Single,Yes
9,10,45000.0,Salaried,150000,0,Married,Yes


### Exploratory Data Analysis (EDA)

In [3]:
# Basic info
print(df.info())

# Summary statistics
print(df.describe())

# Check missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Customer_ID      15 non-null     int64  
 1   Income           14 non-null     float64
 2   Employment_Type  15 non-null     object 
 3   Loan_Amount      15 non-null     int64  
 4   Credit_History   15 non-null     int64  
 5   Marital_Status   14 non-null     object 
 6   Loan_Default     15 non-null     object 
dtypes: float64(1), int64(3), object(3)
memory usage: 972.0+ bytes
None
       Customer_ID         Income    Loan_Amount  Credit_History
count    15.000000      14.000000      15.000000       15.000000
mean      8.000000   68928.571429  242000.000000        0.733333
std       4.472136   24820.786220   98503.081895        0.457738
min       1.000000   30000.000000   90000.000000        0.000000
25%       4.500000   51250.000000  175000.000000        0.500000
50%       8.000000   6

### Handle Missing Values

In [5]:
# Fill numerical missing values with median
df['Income'].fillna(df['Income'].median(), inplace=True)

# Fill categorical missing values with mode
df['Marital_Status'].fillna(df['Marital_Status'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Income'].fillna(df['Income'].median(), inplace=True)


### Encode Categorical Features

In [6]:
le = LabelEncoder()
df['Employment_Type'] = le.fit_transform(df['Employment_Type'])
df['Marital_Status'] = le.fit_transform(df['Marital_Status'])
df['Loan_Default'] = le.fit_transform(df['Loan_Default'])  # Target: 1=Yes, 0=No

df.head()

Unnamed: 0,Customer_ID,Income,Employment_Type,Loan_Amount,Credit_History,Marital_Status,Loan_Default
0,1,50000.0,0,200000,1,0,0
1,2,60000.0,0,250000,1,1,0
2,3,67500.0,1,100000,0,1,1
3,4,80000.0,0,300000,1,0,0
4,5,120000.0,0,400000,1,0,0


### Feature Selection

In [None]:
# Features and target
X = df.drop(columns=['Customer_ID', 'Loan_Default'])
y = df['Loan_Default']

# SelectKBest
best_features = SelectKBest(score_func=f_classif, k=3)
fit = best_features.fit_transform(X, y)
print(fit.shape)

# Recursive Feature Elimination (RFE)
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, y)
print(pd.DataFrame({'Feature': X.columns, 'Selected': fit.support_}))

(15, 3)
           Feature  Selected
0           Income      True
1  Employment_Type      True
2      Loan_Amount      True
3   Credit_History     False
4   Marital_Status     False


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Dimensionality Reduction (PCA & LDA)

In [10]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
print("PCA Components Shape:", X_pca.shape)

# LDA (use target)
lda = LDA(n_components=1)
X_lda = lda.fit_transform(X_scaled, y)
print("LDA Components Shape:", X_lda.shape)


PCA Components Shape: (15, 2)
LDA Components Shape: (15, 1)


### Handle Class Imbalance with SMOTE

In [11]:
smote = SMOTE(sampling_strategy='auto', k_neighbors=3)
X_res, y_res = smote.fit_resample(X_scaled, y)

print("Before SMOTE:\n", y.value_counts())
print("After SMOTE:\n", y_res.value_counts())


Before SMOTE:
 Loan_Default
0    10
1     5
Name: count, dtype: int64
After SMOTE:
 Loan_Default
0    10
1    10
Name: count, dtype: int64
