## Data Exploration and Preprocessing

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score

import joblib

# Seed handed to estimators as `random_state` so stochastic steps are repeatable.
RANDOM_STATE = 42

In [2]:
# Load the training data from a CSV in the working directory into `df`,
# the frame that the following cells inspect and split into features/target.
df = pd.read_csv('train_data.csv')
# Preview the first rows as this cell's displayed output.
df.head()

Unnamed: 0,Index,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,0,0,0.450397,0.504034,0.506986,0.59464,0.59464,0.998906,0.797293,0.809239,...,0.780554,0.004919,0.623634,0.594641,0.838869,0.279036,0.026788,0.565144,1,0.032464
1,1,0,0.530005,0.572885,0.574763,0.605695,0.605558,0.999058,0.797512,0.809399,...,0.819963,0.005968,0.624171,0.60569,0.841869,0.27904,0.026801,0.565205,1,0.032442
2,2,0,0.57115,0.620148,0.624177,0.612275,0.612282,0.999163,0.797654,0.809533,...,0.839128,0.006022,0.625306,0.612271,0.843294,0.278927,0.026816,0.565276,1,0.033034
3,3,0,0.483401,0.556694,0.536164,0.602445,0.602445,0.999035,0.797458,0.80938,...,0.806477,0.002177,0.62161,0.602444,0.841891,0.293391,0.027063,0.56619,1,0.015406
4,4,0,0.510359,0.537287,0.552546,0.600023,0.600023,0.999009,0.797406,0.809313,...,0.799277,0.001124,0.623993,0.600019,0.840313,0.279878,0.02688,0.565549,1,0.028858


In [3]:
# Print the frame summary: number of rows, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5807 entries, 0 to 5806
Data columns (total 97 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Index                                                     5807 non-null   int64  
 1   Bankrupt?                                                 5807 non-null   int64  
 2    ROA(C) before interest and depreciation before interest  5807 non-null   float64
 3    ROA(A) before interest and % after tax                   5807 non-null   float64
 4    ROA(B) before interest and depreciation after tax        5807 non-null   float64
 5    Operating Gross Margin                                   5807 non-null   float64
 6    Realized Sales Gross Margin                              5807 non-null   float64
 7    Operating Profit Rate                                    5807 non-null   float64
 8    Pre-tax net Inter

In [4]:
# Plain-text summary statistics (count, mean, std, min, quartiles, max) per numeric column.
print(df.describe())

             Index    Bankrupt?  \
count  5807.000000  5807.000000   
mean   2903.000000     0.034097   
std    1676.480838     0.181493   
min       0.000000     0.000000   
25%    1451.500000     0.000000   
50%    2903.000000     0.000000   
75%    4354.500000     0.000000   
max    5806.000000     1.000000   

        ROA(C) before interest and depreciation before interest  \
count                                        5807.000000          
mean                                            0.505416          
std                                             0.060808          
min                                             0.000000          
25%                                             0.476673          
50%                                             0.503096          
75%                                             0.535417          
max                                             1.000000          

        ROA(A) before interest and % after tax  \
count                         

In [5]:
# Overall dimensions of the raw frame as (rows, columns).
print(df.shape)
# Class counts of the target; in the recorded run the positive class (1) is a small minority.
print(df["Bankrupt?"].value_counts())

(5807, 97)
Bankrupt?
0    5609
1     198
Name: count, dtype: int64


In [6]:
# Feature matrix: every column except the row identifier and the label.
X = df.drop(["Index", "Bankrupt?"], axis=1)

# Keep the identifier and the label around as separate Series.
index_col, y = df["Index"], df["Bankrupt?"]

print(X.shape, y.value_counts())

(5807, 95) Bankrupt?
0    5609
1     198
Name: count, dtype: int64


In [7]:
# Filter out (near-)constant columns using a variance threshold of 1e-5.
var_thresh = VarianceThreshold(threshold=1e-5).fit(X)
X_var = var_thresh.transform(X)

# The boolean support mask maps the filtered array back to column names.
selected_cols_step1 = X.columns[var_thresh.get_support()]
print("After variance filter:", X_var.shape)

After variance filter: (5807, 94)


In [8]:
# Re-attach column names to the variance-filtered array.
X_var_df = pd.DataFrame(data=X_var, columns=selected_cols_step1)

# Random forest used only to rank features against the target;
# "balanced_subsample" re-weights the classes within each bootstrap sample.
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
rf.fit(X_var_df, y)

# Importances labelled by feature name, most important first.
importances = pd.Series(
    rf.feature_importances_, index=X_var_df.columns
).sort_values(ascending=False)

# Number of top-ranked features to keep (tunable, roughly 30–40).
TOP_K = 40
top_features = importances.index[:TOP_K].tolist()
print("Top features:", len(top_features))

Top features: 40


In [9]:
# Restrict the variance-filtered frame to the top-ranked features (independent copy).
X_reduced = X_var_df.loc[:, top_features].copy()
print("Reduced feature matrix:", X_reduced.shape)

Reduced feature matrix: (5807, 40)


In [10]:
# Persist the selected feature names so they can be reloaded later.
joblib.dump(top_features, "top_features_for_clustering.joblib")

# Two-step pipeline. The first step is a no-op placeholder ("passthrough"):
# column subsetting is done by hand on the DataFrame before scaling.
preprocess_for_clustering = Pipeline(
    [
        ("select_features", "passthrough"),
        ("scaler", StandardScaler()),
    ]
)

In [11]:
# Bundle the feature list with the scaler, fitted here on the reduced matrix only.
preprocess_object = {
    "feature_names": top_features,
    # .fit() returns the estimator itself, so the stored value is the fitted scaler step.
    "scaler": preprocess_for_clustering.named_steps["scaler"].fit(X_reduced),
}

# Write both pieces to a single artefact (the returned path list is the cell output).
joblib.dump(preprocess_object, "preprocess_for_clustering.joblib")

['preprocess_for_clustering.joblib']

In [12]:
# Reload the saved preprocessing bundle and unpack its two entries.
preprocess_obj = joblib.load("preprocess_for_clustering.joblib")
feature_names, scaler = preprocess_obj["feature_names"], preprocess_obj["scaler"]

def transform_for_clustering(
    df_raw: pd.DataFrame,
    features=None,
    fitted_scaler=None,
) -> np.ndarray:
    """
    Build the scaled, reduced feature matrix used for clustering / cluster-ID.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Raw frame containing at least the selected feature columns. Any other
        columns (e.g. "Index", "Bankrupt?", a previously added "cluster"
        column) are ignored, so both train and unlabeled frames are accepted.
    features : sequence of str, optional
        Column names to keep, in the order the scaler was fitted on. Defaults
        to the notebook-level ``feature_names`` loaded from the joblib bundle.
    fitted_scaler : object with ``transform``, optional
        Already-fitted scaler. Defaults to the notebook-level ``scaler``.

    Returns
    -------
    np.ndarray
        Whatever ``fitted_scaler.transform`` returns for the selected columns
        (a 2-D array with one column per feature for the bundled scaler).

    Raises
    ------
    KeyError
        If any selected feature column is absent from ``df_raw``.
    """
    # Fall back to the objects loaded at notebook level when not overridden.
    if features is None:
        features = feature_names
    if fitted_scaler is None:
        fitted_scaler = scaler

    # Fail early with an explicit list instead of an opaque pandas indexing error.
    missing = [col for col in features if col not in df_raw.columns]
    if missing:
        raise KeyError(
            f"df_raw is missing {len(missing)} selected feature column(s): {missing}"
        )

    # Selecting by name makes an explicit drop of "Index"/"Bankrupt?" unnecessary
    # (the old unconditional drop of "Index" crashed on frames without that column).
    X_sel = df_raw.loc[:, list(features)].copy()

    return fitted_scaler.transform(X_sel)

In [13]:
# Run the training frame through the saved feature subset + scaler.
X_cluster = transform_for_clustering(df)
# Sanity check: one row per input row, one column per selected feature.
print("X_cluster shape:", X_cluster.shape)  # expect (5807, 40)

X_cluster shape: (5807, 40)


In [14]:
from sklearn.cluster import KMeans

# Number of K-Means clusters (tunable).
K = 7
kmeans = KMeans(
    n_clusters=K,
    init="k-means++",
    n_init=20,  # keep the best of 20 random initialisations
    random_state=RANDOM_STATE,  # shared notebook seed instead of a repeated literal
)

# Fit on the scaled feature matrix and attach each row's cluster id to `df`.
# Note: this adds a "cluster" column to `df` in place; later cells rely on it.
cluster_labels = kmeans.fit_predict(X_cluster)
df["cluster"] = cluster_labels

# NOTE(review): in the recorded run three clusters hold only 1–3 rows, which
# suggests a few extreme rows are capturing centroids — consider outlier
# handling or a different K before relying on these groups; verify.
df["cluster"].value_counts().sort_index()

cluster
0     728
1     701
2    2059
3       1
4    2313
5       2
6       3
Name: count, dtype: int64

In [15]:
from sklearn.metrics import silhouette_score

# Mean silhouette coefficient of the K-Means assignment on the scaled features.
sil_score = silhouette_score(X_cluster, labels=cluster_labels)
print("Silhouette score:", sil_score)

Silhouette score: 0.138487318271109


In [16]:
# Persist the fitted model and the labelled training frame.
joblib.dump(kmeans, "kmeans_clusters.joblib")
df.to_csv("train_with_clusters.csv", index=False)

# One CSV per cluster id, plus a one-line size / bankrupt-count report.
for c in range(K):
    df_cluster_c = df.loc[df["cluster"].eq(c)]
    df_cluster_c.to_csv(f"cluster_{c}.csv", index=False)
    print(f"Cluster {c}: {df_cluster_c.shape[0]} rows,",
          f"{df_cluster_c['Bankrupt?'].sum()} bankrupt")

Cluster 0: 728 rows, 132 bankrupt
Cluster 1: 701 rows, 0 bankrupt
Cluster 2: 2059 rows, 6 bankrupt
Cluster 3: 1 rows, 1 bankrupt
Cluster 4: 2313 rows, 56 bankrupt
Cluster 5: 2 rows, 0 bankrupt
Cluster 6: 3 rows, 3 bankrupt
