In [68]:
import pandas as pd
import numpy as np
import joblib
import os


### Custom model class (sometimes used for clusters that contain no bankrupt samples)

In [61]:
class ConstantModel:
    """Trivial classifier that always answers "class 0".

    Exposes the two methods the prediction loop relies on (`predict` and
    `predict_proba`) and ignores the feature values entirely; only the number
    of rows in `X` matters.

    NOTE(review): presumably this must be defined before the saved subgroup
    bundles are loaded so that any pickled instance can be restored — confirm
    against the training notebook.
    """

    def predict(self, X):
        """Return a float array of zeros, one entry per row of `X`."""
        n_rows = len(X)
        return np.zeros(n_rows)

    def predict_proba(self, X):
        """Return an (n_rows, 2) array: column 0 is all 1.0, column 1 all 0.0."""
        n_rows = len(X)
        proba = np.zeros((n_rows, 2))
        proba[:, 0] = 1.0  # full probability mass on class 0
        return proba

### =================
### Load test dataset
### =================

In [69]:
# Read the test set from the working directory and report how many rows it has.
print("\nLoading test dataset...")
test_df = pd.read_csv("test_data.csv")
n_test_rows = test_df.shape[0]
print("Test rows:", n_test_rows)


Loading test dataset...
Test rows: 1012


### =============================
### LOAD CLUSTER ASSIGNMENT MODEL
### =============================

In [70]:
# Stage 1: assign every test row to a cluster.
# The saved bundle is a dict holding the fitted model under "model" and the
# list of input columns it expects under "feature_cols".
stage1 = joblib.load("models/cluster_id_model.joblib")
cluster_model, cluster_features = stage1["model"], stage1["feature_cols"]

print("\nðŸ”¹ Running Stage-1 Feature Preprocessing & Cluster Assignment...")

# The model exposes its steps by name; the scaler and the classifier ("rf")
# are applied one after the other.
stage1_steps = cluster_model.named_steps

# "Index" is dropped if present (errors="ignore") before selecting the inputs.
X_test_stage1 = test_df.drop(columns=["Index"], errors="ignore")[cluster_features]
scaled_test = stage1_steps["scaler"].transform(X_test_stage1)
test_df["Assigned_Cluster"] = stage1_steps["rf"].predict(scaled_test)

print("\nCluster Distribution in Test Data:")
cluster_counts = test_df["Assigned_Cluster"].value_counts()
print(cluster_counts, "\n")


ðŸ”¹ Running Stage-1 Feature Preprocessing & Cluster Assignment...

Cluster Distribution in Test Data:
Assigned_Cluster
2    371
3    354
0    248
1     39
Name: count, dtype: int64 



### =============================================
### Load Available Subgroup Models Automatically
### =============================================

In [71]:
# Discover every Stage-2 (subgroup) model saved under models/ and index it by
# the cluster id embedded in its file name ("subgroup..._cluster<N>_..._model.joblib").
print("\nLoading subgroup models available")
subgroup_models = {}

# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and the printed "Active Models" list) reproducible across machines and runs.
for file in sorted(os.listdir("models")):
    if not (file.startswith("subgroup") and file.endswith("_model.joblib")):
        continue

    # The cluster id is the token between "_cluster" and the next "_".
    # A file that matches the prefix/suffix but lacks that token (or has a
    # non-numeric one) is skipped with a notice instead of aborting the cell.
    try:
        cid = int(file.split("_cluster")[1].split("_")[0])
    except (IndexError, ValueError):
        print(f"Skipping {file}: no cluster id found in file name")
        continue

    # NOTE: joblib.load unpickles arbitrary objects — only load trusted files.
    subgroup_models[cid] = joblib.load(os.path.join("models", file))
    print(f"Loaded Model for Cluster {cid}")

active = list(subgroup_models.keys())
print("\nActive Models â†’", active)


Loading subgroup models available
Loaded Model for Cluster 0
Loaded Model for Cluster 2
Loaded Model for Cluster 5

Active Models â†’ [0, 2, 5]


### ============================================
### Predict Bankruptcy for Each Assigned Cluster
### ============================================

In [72]:
# Stage 2: for every assigned cluster, score its rows with the matching
# subgroup model (if one was loaded) and fill `prob` / `pred` in place.
#
# Supported bundle layouts:
#   Format-1: {"pipeline": ..., "threshold": ..., "feature_names": ...}
#   Format-2: {"model": ..., "selected_features": ...}
#   Format-3: a bare model object
# Clusters without a model are left at probability 0 / prediction 0.

# Bookkeeping columns that live in test_df but are not model inputs: the row
# id, the Stage-1 cluster label, and the final label written by a later cell
# (present if this cell is re-run). They are excluded whenever a bundle does
# not supply its own feature list.
NON_FEATURE_COLS = ("Index", "Assigned_Cluster", "Bankrupt?")

pred = np.zeros(len(test_df))
prob = np.zeros(len(test_df))

print("\n==================== CLUSTER-WISE PREDICTION ====================")

# `.indices` gives *positional* row numbers per group. `.groups` would give
# index *labels*, which only coincide with positions for a default RangeIndex
# and would mis-address `pred` / `prob` / `.iloc` otherwise.
cluster_positions = test_df.groupby("Assigned_Cluster").indices

for cid in sorted(cluster_positions):
    idx = cluster_positions[cid]
    print(f"\nProcessing Cluster {cid} â†’ {len(idx)} rows")

    # No Stage-2 model for this cluster: forced safe assignment (0).
    if cid not in subgroup_models:
        print(f"No model for cluster {cid} â†’ marking all = 0")
        pred[idx] = 0
        prob[idx] = 0
        continue

    bundle = subgroup_models[cid]
    fallback_feats = [c for c in test_df.columns if c not in NON_FEATURE_COLS]
    thr_note = ""

    if isinstance(bundle, dict) and "pipeline" in bundle:
        # Format-1: pipeline + tuned threshold + explicit feature list.
        model = bundle["pipeline"]
        thr = bundle.get("threshold", 0.5)
        feats = bundle["feature_names"]
        thr_note = f"  (Threshold={thr})"
    elif isinstance(bundle, dict) and "model" in bundle:
        # Format-2: model + optional selected feature list, default threshold.
        model = bundle["model"]
        thr = 0.5
        feats = bundle.get("selected_features", fallback_feats)
    else:
        # Format-3: the bundle is the model itself.
        model = bundle
        thr = 0.5
        feats = fallback_feats

    X_sub = test_df.iloc[idx][feats]
    prob[idx] = model.predict_proba(X_sub)[:, 1]  # column 1 = positive class
    pred[idx] = (prob[idx] >= thr).astype(int)

    print(f"Cluster {cid}: Predicted Bankrupt â†’ {pred[idx].sum()}{thr_note}")



Processing Cluster 0 â†’ 248 rows
Cluster 0: Predicted Bankrupt â†’ 44.0

Processing Cluster 1 â†’ 39 rows
No model for cluster 1 â†’ marking all = 0

Processing Cluster 2 â†’ 371 rows
Cluster 2: Predicted Bankrupt â†’ 2.0  (Threshold=0.01)

Processing Cluster 3 â†’ 354 rows
No model for cluster 3 â†’ marking all = 0


### =====================================
### 20% RULE — Keep Only the Riskiest Firms
### =====================================

In [73]:
# Cap the number of firms flagged as bankrupt at a fixed share of the test set.
# If the models flag more than the cap, keep only the rows with the highest
# predicted probability; otherwise keep the model predictions unchanged.
MAX_BANKRUPT_FRACTION = 0.20  # upper bound on the share of rows labelled 1

max_allowed = int(MAX_BANKRUPT_FRACTION * len(test_df))
total_pred = pred.sum()

print(f"\nTotal Predicted = {total_pred}  |  Allowed â‰¤ {max_allowed}")

if total_pred > max_allowed:
    print("Exceeds limit â€” Keeping HIGH-RISK only (top probability sorting)")
    ranked = np.argsort(prob)  # ascending: riskiest rows are at the end
    # Slice from an explicit start: `ranked[-max_allowed:]` would select
    # *every* row when max_allowed == 0 (because -0 == 0).
    top = ranked[len(ranked) - max_allowed:]
    final = np.zeros(len(test_df), dtype=int)
    final[top] = 1
else:
    print("Below risk limit â€” Keeping model predictions as-is")
    final = pred.copy()

test_df["Bankrupt?"] = final.astype(int)


Total Predicted = 46.0  |  Allowed â‰¤ 202
Below risk limit â€” Keeping model predictions as-is


### ============================
### Export Final Submission File
### ============================

In [75]:
# Write the submission file: one row per firm with its id and final label.
submission_cols = ["Index", "Bankrupt?"]
output = test_df[submission_cols]
output.to_csv("Group4_Generalization.csv", index=False)

print("\n==============================================================")
print("   FINAL OUTPUT SAVED â†’ Group4_Generalization.csv")
print("==============================================================\n")

# Quick visual sanity check of the first few exported rows.
preview = output.head()
print(preview)


   FINAL OUTPUT SAVED â†’ Group4_Generalization.csv

   Index  Bankrupt?
0      0          0
1      1          0
2      2          0
3      3          0
4      4          0
