## Normalization

In [61]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.pipeline import Pipeline

In [62]:
# Load the feature-engineered dataset from disk.
X_processed = pd.read_csv('saved/feature_engineered_data.csv')

# Keep an independent, unscaled snapshot: X_processed is rebound to the scaled
# frame further down, while the inference preprocessor is fitted on this copy.
X_processed_copy = X_processed.copy(deep=True)

In [63]:
# Build the scaler from the untouched snapshot (X_processed_copy) rather than
# from X_processed itself. This cell rebinds X_processed to the scaled frame and
# the clustering cell later adds a 'Cluster' column to it, so reading X_processed
# here would make a re-run pick up already-processed data plus that extra column.
# On a fresh top-to-bottom run both frames are identical at this point.
numerical_cols = X_processed_copy.columns

# Every column (target_default included) is treated as numeric and min-max
# scaled with MinMaxScaler's default settings.
numerical_pipeline = Pipeline([
    ('scaler', MinMaxScaler())
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
    ]
)

X_scaled = preprocessor.fit_transform(X_processed_copy)

# The transformer output carries no column labels; reattach the original ones.
# The single 'num' branch covers all columns in their original order, so the
# labels line up one-to-one with the scaled columns.
X_processed = pd.DataFrame(
    X_scaled,
    columns=numerical_cols
)

In [64]:
import pickle

# Features available at inference time: everything except the label.
# The column list is taken from the exact frame the preprocessor is fitted on,
# so it cannot drift out of sync with it (e.g. if another cell has added
# columns to X_processed in the meantime).
inference_features = X_processed_copy.drop(columns=["target_default"])

# NOTE: the branch is named 'cat' although it applies the numerical (min-max)
# pipeline. The name is kept unchanged because it is stored inside the pickled
# artifact and becomes the prefix of get_feature_names_out(); renaming it could
# break whatever loads this file.
inference_preprocessor = ColumnTransformer(
    transformers=[
        ('cat', numerical_pipeline, list(inference_features.columns))
    ],
)

inference_preprocessor.fit(inference_features)

# Persist the fitted inference preprocessor to disk.
with open('saved/inference_preprocessor.pkl', 'wb') as f:
    pickle.dump(inference_preprocessor, f)

## Clustering

In [65]:
from sklearn.cluster import KMeans

# Cluster on the normalized features only. This cell adds a 'Cluster' column to
# X_processed below, so drop any existing one first; otherwise re-running the
# cell would feed the previous run's labels back in as a feature.
# errors='ignore' makes the first run (no 'Cluster' column yet) a no-op.
cluster_features = X_processed.drop(columns=['Cluster'], errors='ignore')

# NOTE(review): cluster_features still contains the scaled target_default
# column, whereas inference_preprocessor is fitted without it. If 'Cluster' is
# later used to predict target_default, or this model must score inference
# data, the label should probably be excluded here too - confirm intent.
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(cluster_features)

# Attach the cluster labels to the normalized dataset.
X_processed['Cluster'] = clusters

# Cluster sizes, ordered by cluster id.
print(pd.Series(clusters).value_counts().sort_index())

# Persist the normalized data together with its cluster labels.
X_processed.to_csv('saved/clustered_data.csv', index=False)

X_processed.head()

0     5753
1    13935
2     7606
3     7999
4     6448
Name: count, dtype: int64


Unnamed: 0,target_default,score_1,score_2,score_3,score_4,score_5,score_6,risk_rate,last_amount_borrowed,last_borrowed_in_months,...,score_1_bin_x_fraud_score_bin,score_2_bin_x_fraud_score_bin,state_x_real_state,state_x_shipping_state,state_real_state_avg_score_1,state_shipping_state_avg_score_2,score_1_sq,score_2_sq,facebook_profile_sq,Cluster
0,0.0,0.0,0.294118,0.353535,0.561754,0.259535,0.585853,0.444444,0.714039,0.6,...,0.0,0.0,0.892495,0.896202,0.632372,0.499833,0.0,0.086505,1.0,1
1,0.0,0.5,0.470588,0.373737,0.391232,0.942678,0.384395,0.266667,0.0,0.0,...,0.0,0.0,0.953347,0.958765,0.619683,0.493784,0.25,0.221453,0.0,2
2,1.0,0.5,0.264706,0.363636,0.497919,0.351904,0.640621,0.322222,0.20559,0.6,...,0.0,0.0,0.653144,0.65387,0.640734,0.474501,0.25,0.070069,0.0,2
3,0.0,0.0,0.617647,0.515152,0.554508,0.987699,0.419965,0.355556,0.0,0.0,...,0.0,0.0,0.470588,0.470648,0.638629,0.504545,0.0,0.381488,0.0,4
4,0.0,0.333333,0.029412,0.505051,0.442036,0.532537,0.704816,0.2,0.0,0.0,...,0.0,0.0,0.957404,0.95714,0.649197,0.50303,0.111111,0.000865,1.0,1
