In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Generating a synthetic dataset
# Seed NumPy's global RNG so that every run draws exactly the same rows.
np.random.seed(42)

n_rows = 100
education_options = ['High School', 'Bachelor', 'Master', 'PhD']

# The draws below share the global RNG state, so their order
# (age -> salary -> education -> target) determines the generated values.
age_values = np.random.randint(18, 60, n_rows)            # upper bound is exclusive
salary_values = np.random.randint(30000, 120000, n_rows)  # upper bound is exclusive
education_values = np.random.choice(education_options, n_rows)
approval_values = np.random.choice([0, 1], n_rows)  # Target variable

data = pd.DataFrame({
    'age': age_values,
    'salary': salary_values,
    'education_level': education_values,
    'loan_approval': approval_values,
})

print("Original Dataset:")
print(data.head())

Original Dataset:
   age  salary education_level  loan_approval
0   56   38392     High School              1
1   46   60535             PhD              1
2   32  108603        Bachelor              0
3   25   82256        Bachelor              1
4   38  119135        Bachelor              0


In [None]:
#1. Scaling Features

# Map each raw numeric column to the name of its min-max scaled counterpart.
scaled_column_names = {'age': 'age_scaled', 'salary': 'salary_scaled'}

# One scaler object is re-fitted for every column in turn, so once the loop
# finishes `scaler` only holds the parameters of the last column ('salary').
# The fit uses every row of `data`, i.e. it happens before the train/test split
# performed in the modelling cell.
scaler = MinMaxScaler()
for source_column, scaled_column in scaled_column_names.items():
    data[scaled_column] = scaler.fit_transform(data[[source_column]])

print("\nAfter Scaling:")
print(data[['age', 'age_scaled', 'salary', 'salary_scaled']].head())


After Scaling:
   age  age_scaled  salary  salary_scaled
0   56    0.926829   38392       0.091701
1   46    0.682927   60535       0.339752
2   32    0.341463  108603       0.878221
3   25    0.170732   82256       0.583076
4   38    0.487805  119135       0.996202


Scaling: Used MinMaxScaler to normalize numerical features to a range between 0 and 1.

In [None]:
# 2. Transforming Features
# log(1 + salary) compresses the upper end of the salary range.
# np.log1p computes log(1 + x) directly: it is the dedicated NumPy routine for
# this transform and stays accurate for values close to zero, where forming
# `x + 1` first would lose precision.
data['log_salary'] = np.log1p(data['salary'])
print("\nAfter Log Transformation:")
print(data[['salary', 'log_salary']].head())


After Log Transformation:
   salary  log_salary
0   38392   10.555630
1   60535   11.010994
2  108603   11.595464
3   82256   11.317604
4  119135   11.688021


Transformation: Applied logarithmic transformation to reduce the effect of high salary values.

In [None]:
# 3. Discretizing Features

# strategy='uniform' -> four equal-width intervals over the ages seen during fit;
# encode='ordinal'   -> each row receives the index of its interval (0.0 to 3.0, as floats).
discretizer = KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=4)
age_bin_codes = discretizer.fit_transform(data[['age']])
data['age_bins'] = age_bin_codes

print("\nAfter Discretization:")
age_with_bins = data[['age', 'age_bins']]
print(age_with_bins.head())


After Discretization:
   age  age_bins
0   56       3.0
1   46       2.0
2   32       1.0
3   25       0.0
4   38       1.0




Discretization: Used KBinsDiscretizer to create age bins, grouping ages into 4 categories.

Embedding: Encoded categorical education_level into dummy variables for the ML model.

Model Training: Applied RandomForestClassifier on the engineered features to predict loan_approval.

In [None]:
# ------------------------------
# Preparing Data for ML Task
# ------------------------------
# Selecting features and target
TARGET = 'loan_approval'

# Columns that must not be handed to the model as-is:
#   - the raw inputs, which are represented by their engineered versions;
#   - the target itself: if it stays among the features the classifier can read
#     the answer directly and reports a meaningless perfect score (target leakage).
excluded_columns = {'education_level', 'age', 'salary', TARGET}
engineered_columns = [col for col in data.columns if col not in excluded_columns]

# One-hot encode the categorical column so the model receives it in numeric form.
# The dummies are built on the side rather than written back into `data`, which
# keeps this cell safe to re-run.
education_dummies = pd.get_dummies(data['education_level'], prefix='education', dtype=int)

X = pd.concat([data[engineered_columns], education_dummies], axis=1)
features = list(X.columns)  # names of every column actually fed to the model
y = data[TARGET]

# Guard against the target sneaking back into the feature matrix.
assert TARGET not in X.columns, "target column leaked into the feature matrix"

# Splitting the data (70% train / 30% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training a Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluating the Model
# The target in this synthetic dataset is drawn at random, independently of the
# features, so scores close to chance level (about 0.5) are the expected outcome.
y_pred = clf.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        16

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



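This classification report shows perfect scores on the test dataset, but that result should not be taken at face value. The loan_approval labels in this synthetic dataset are assigned at random, independently of every feature, so no genuine model can be expected to predict them better than chance (about 50% accuracy). Scores of 1.00 across the board are a symptom of target leakage: they occur when the loan_approval column itself is included among the model's input features. The breakdown below explains how to read each metric in the report: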

Class-Specific Metrics:

Class 0 - Rejected Loans

Precision: 1.00
Out of all predictions for class 0, 100% were correct. No false positives were made.
Recall: 1.00
The model correctly identified all 14 instances of class 0. No false negatives occurred.
F1-Score: 1.00
The harmonic mean of precision and recall is also perfect since both metrics are perfect.
Support: 14
There were 14 instances of class 0 in the test set.

Class 1 - Approved Loans

Precision: 1.00
Out of all predictions for class 1, 100% were correct. No false positives occurred.
Recall: 1.00
The model correctly identified all 16 instances of class 1. No false negatives occurred.
F1-Score: 1.00
The harmonic mean of precision and recall is also perfect.
Support: 16
There were 16 instances of class 1 in the test set.