## Normalization

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Read the feature-engineered dataset from disk and print its column/dtype summary.
feature_csv = 'saved/feature_engineered_data.csv'
X_processed = pd.read_csv(feature_csv)
X_processed.info()

In [None]:
# Bucket RiskScore into ordinal bands and store the band index as 'Cluster'.
#
# Bands are right-closed, (a, b], except the first one, which also contains its
# lower edge because include_lowest=True.  Scores outside [20, 90] match no band
# and come out as NaN.
#
# A finer 10-band edge list, [20, 36, 40, 46, 49, 51, 53, 55, 58, 63, 90], used
# to be assigned here and was overwritten on the very next line, so it never
# took effect; only the edges below are live.
custom_bins = [20, 40, 49, 54, 58, 90]

X_processed['Cluster'] = pd.cut(
    X_processed['RiskScore'],
    bins=custom_bins,
    # One label per band, derived from the edge list so that editing the bins
    # cannot leave the label count out of step (pd.cut raises if they differ).
    labels=range(len(custom_bins) - 1),
    include_lowest=True,
)

# RiskScore is consumed here: once dropped, this cell cannot be re-run without
# reloading X_processed first (pd.cut would raise KeyError on the missing column).
X_processed = X_processed.drop(columns=["RiskScore"])

# Persist the banded dataset, then show how many rows landed in each band.
X_processed.to_csv('saved/clustered_data.csv', index=False)
X_processed['Cluster'].value_counts().sort_index()


In [None]:
# Column list; in the visible cells it is only referenced by the disabled
# inference-preprocessor cell.
numerical_cols = X_processed.columns

# Standardise every column (zero mean, unit variance) and wrap the resulting
# array back into a DataFrame.
# - fit_transform returns a new array, so no defensive copy of X_processed is
#   taken beforehand.
# - The original index is carried over so X_scaled stays row-aligned with
#   X_processed even if the index is not the default 0..n-1 range.
# NOTE(review): all columns are scaled, including the 'Cluster' band label and
# any target column still present (e.g. 'LoanApproved', used as the target in
# the F-test cell) -- confirm that is intended before clustering on X_scaled.
X_scaled = pd.DataFrame(
    StandardScaler().fit_transform(X_processed),
    columns=X_processed.columns,
    index=X_processed.index,
)

In [None]:
# Disabled cell: builds a ColumnTransformer over every column except
# "target_default", fits it, and pickles it to saved/inference_preprocessor.pkl.
# NOTE(review): `numerical_pipeline` and `X_processed_copy` are not defined in
# any visible cell -- define them before re-enabling this code.
# inference_preprocessor = ColumnTransformer(
#     transformers=[
#         # ('num', numerical_pipeline, numerical_cols),
#         ('cat', numerical_pipeline, [i for i in numerical_cols if i != "target_default"])
#     ],
#     # remainder='drop'  # <--- IMPORTANT: Drop the remaining columns
# )

# inference_preprocessor.fit(X_processed_copy.drop(columns=["target_default"]))

# import pickle

# # Save the fitted inference preprocessor to disk
# with open('saved/inference_preprocessor.pkl', 'wb') as f:
#     pickle.dump(inference_preprocessor, f)

In [None]:
# Disabled diagnostic: per-column count of missing values in X_processed.
# nan_counts = X_processed.isnull().sum()

# # Print the columns with NaNs and their counts
# print(nan_counts[nan_counts > 0])

In [None]:
import numpy as np
import pandas as pd

# Absolute pairwise correlations: the sign does not matter when looking for
# strongly related features.
correlation_matrix = X_processed.corr().abs()

# Keep only the cells strictly above the diagonal so every pair shows up once
# and self-correlations are excluded; everything else becomes NaN.
strict_upper_mask = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
upper_tri = correlation_matrix.where(strict_upper_mask)

# Walk the masked matrix column by column and collect the pairs whose absolute
# correlation exceeds 0.6.  Masked (NaN) cells never pass the comparison.
highly_correlated_pairs = [
    (col, row, value)
    for col, column_values in upper_tri.items()
    for row, value in column_values.items()
    if value > 0.6
]

# Tabulate the pairs, strongest correlation first.
correlated_df = pd.DataFrame(
    highly_correlated_pairs,
    columns=["Feature 1", "Feature 2", "Correlation"],
).sort_values(by="Correlation", ascending=False)

# Last expression of the cell: rendered as the cell output.
correlated_df


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Signed pairwise correlation matrix, computed once.  A stray X_processed.corr()
# call used to precede this line; its result was neither stored nor displayed,
# so it only duplicated the computation and has been removed.
correlation_matrix = X_processed.corr()

# Large canvas so the per-cell annotations stay readable.
plt.figure(figsize=(18, 12))

# Annotated heatmap: each cell shows its coefficient to two decimals in a small
# font, on a diverging colour map with thin separators between cells.
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    annot_kws={"size": 8},
)

# Title and tick styling; x labels are slanted so long feature names do not overlap.
plt.title("Feature Correlation Heatmap", fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(fontsize=10)
plt.show()

In [None]:
from sklearn.feature_selection import f_classif

# Split the frame into the target ('LoanApproved') and the remaining columns.
# NOTE(review): the "_train_rus" suffix suggests a resampled training split, but
# these are taken straight from X_processed -- confirm the naming.
y_train_rus = X_processed['LoanApproved']
X_train_rus = X_processed.drop(columns=['LoanApproved'])

# ANOVA F-test of each remaining column against the target.
f_scores, p_values = f_classif(X_train_rus, y_train_rus)

# One row per feature with its F statistic and p-value.
yo = pd.DataFrame(
    {
        'Feature': X_train_rus.columns,
        'F-Score': f_scores,
        'P-Value': p_values,
    }
)

# Rank features from highest to lowest F-Score and print the table.
feature_importance = yo.sort_values(by='F-Score', ascending=False)
print(feature_importance)

## Clustering

In [None]:
from sklearn.cluster import KMeans

# Fit a 5-cluster KMeans on the standardised frame and take each row's cluster
# id; random_state is pinned to 42.
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Write the assignment onto the unscaled frame.  This replaces whatever the
# 'Cluster' column held before.
# NOTE(review): X_scaled is built from all columns of X_processed, so a
# pre-existing 'Cluster' column feeds into this clustering -- confirm intent.
X_processed['Cluster'] = clusters

# Rows per cluster, ordered by cluster id.
cluster_sizes = pd.Series(clusters).value_counts().sort_index()
print(cluster_sizes)

# Export intentionally disabled: saved/clustered_data.csv is not rewritten here.
# X_processed.to_csv('saved/clustered_data.csv', index = False)

X_processed.head()