In [14]:
import pandas as pd
import numpy as np
import joblib
import os

### =================
### Load test dataset
### =================

In [15]:
# Read the hold-out file into memory; every later cell works off `test_df`.
print("\nLoading test dataset...")
test_df = pd.read_csv("test_data.csv")

# Report the row count as a quick check that the file was read.
print("Test rows:", test_df.shape[0])


Loading test dataset...
Test rows: 1012


### =============================
### LOAD CLUSTER ASSIGNMENT MODEL
### =============================

In [16]:
# The saved bundle is a dict holding the fitted pipeline and the list of
# feature columns to feed it.
# NOTE: joblib.load unpickles arbitrary objects — only load trusted files.
cluster_joblib = joblib.load("models/cluster_id_model.joblib")
cluster_model, cluster_feats = (
    cluster_joblib["model"],          # Pipeline (scaler + RF)
    cluster_joblib["feature_cols"],   # 95 Stage-1 features (per original author's note)
)

print("\nAssigning clusters to test data...")

# Drop the row identifier if present, then keep only the listed feature columns
# in the stored order.
X_cluster = test_df.drop(columns=["Index"], errors="ignore")[cluster_feats]

# Run the two named pipeline steps explicitly: scale, then classify.
scaler_step = cluster_model.named_steps["scaler"]
rf_step = cluster_model.named_steps["rf"]
scaled_in = scaler_step.transform(X_cluster)
test_df["Assigned_Cluster"] = rf_step.predict(scaled_in)

# Show how many rows landed in each cluster.
print("\nCluster distribution:")
print(test_df["Assigned_Cluster"].value_counts(), "\n")


Assigning clusters to test data...

Cluster distribution:
Assigned_Cluster
2    371
3    354
0    248
1     39
Name: count, dtype: int64 



### ==============================
### Load Available Subgroup Models
### ==============================

In [17]:
# Stand-in definition so that unpickling can resolve the `ConstantModel` name
# (presumably referenced by one of the saved subgroup models — confirm).
class ConstantModel:
    """Placeholder estimator that predicts 0 for every input row."""

    def predict(self, X):
        """Return a float array of zeros with one entry per element of ``X``.

        Parameters
        ----------
        X : sized collection
            Only ``len(X)`` is used; the contents are ignored.

        Returns
        -------
        numpy.ndarray
            1-D array of ``len(X)`` zeros (float64).
        """
        n_rows = len(X)
        return np.zeros(n_rows)


In [18]:
# Maps cluster id -> object loaded from the matching joblib file in ./models.
subgroup_models = {}

# os.listdir returns entries in arbitrary order; sorting makes the load log
# and the dict insertion order reproducible between runs and machines.
for f in sorted(os.listdir("models")):
    if not (f.startswith("subgroup") and f.endswith("_model.joblib")):
        continue

    # File names are expected to look like "<prefix>_cluster<id>_model.joblib".
    # A name that does not follow that shape is reported and skipped instead
    # of aborting the whole loop with IndexError / ValueError.
    try:
        cid = int(f.split("_cluster")[1].split("_")[0])
    except (IndexError, ValueError):
        print(f"⚠ Could not load {f} → cannot parse cluster id from file name")
        continue

    try:
        # NOTE: joblib.load unpickles arbitrary objects — only load trusted files.
        subgroup_models[cid] = joblib.load(f"models/{f}")
        print(f"✔ Loaded model for cluster {cid}")
    except Exception as e:
        # Best-effort load: a broken file leaves its cluster without a model,
        # which the prediction cell treats as "no model available".
        print(f"⚠ Could not load {f} → {e}")

print("\nActive Models:", list(subgroup_models.keys()))


✔ Loaded model for cluster 0
✔ Loaded model for cluster 2
✔ Loaded model for cluster 3
✔ Loaded model for cluster 5

Active Models: [0, 2, 3, 5]


### ============================================
### Predict Bankruptcy for Each Assigned Cluster
### ============================================

In [19]:
# Decision threshold applied to the cluster-0 stacking model's probabilities.
CLUSTER0_THRESHOLD = 0.55


def _find_estimator(bundle):
    """Return the object in ``bundle`` that exposes a ``predict`` method.

    ``bundle`` may be the estimator itself or a dict that wraps it; for a dict
    the first value with a ``predict`` attribute is returned.

    Raises
    ------
    TypeError
        If no object with ``predict`` can be found.
    """
    if hasattr(bundle, "predict"):
        return bundle
    if isinstance(bundle, dict):
        for value in bundle.values():
            if hasattr(value, "predict"):
                return value
    raise TypeError("Saved bundle does not contain an object with a predict() method")


# Output arrays, one slot per row of test_df (by position, not by index label).
pred = np.zeros(len(test_df), dtype=int)
prob = np.zeros(len(test_df), dtype=float)
print("\n================= FINAL CLUSTER PREDICTION REPORT =================\n")

# Cluster labels as a plain array so row selection below is purely positional
# and stays correct even if test_df does not carry a default RangeIndex.
assigned = test_df["Assigned_Cluster"].to_numpy()

for cid in range(6):   # handles clusters 0–5

    # Integer positions of the rows assigned to this cluster.
    idx = np.flatnonzero(assigned == cid)

    print(f"\n============== CLUSTER {cid} ==============")
    print(f"Total rows in this cluster = {len(idx)}")

    # CASE 1 → No subgroup model exists (clusters 1 & 4 OR missing)
    if cid not in subgroup_models:
        print(f"⚠ No model found for cluster {cid} → marking bankrupt = 0 for all")
        pred[idx] = 0
        prob[idx] = 0
        continue

    # CASE 2 → Model exists but no rows were assigned: nothing to predict.
    # Skipping avoids calling predict_proba on an empty frame.
    if len(idx) == 0:
        print(f"Cluster {cid} → no rows assigned, nothing to predict")
        continue

    bundle = subgroup_models[cid]
    rows = test_df.iloc[idx]

    # ---------------- CLUSTER 0 (Parth) ----------------
    if cid == 0 and "model" in bundle:
        print("Cluster 0 → Using Parth subgroup stacking model")
        model = bundle["model"]
        feats = bundle["selected_features"]
        X_sub = rows[feats]
        sub_prob = model.predict_proba(X_sub)[:, 1]
        prob[idx] = sub_prob
        pred[idx] = (sub_prob >= CLUSTER0_THRESHOLD).astype(int)
        print(f"Cluster 0 → bankrupt predicted = {pred[idx].sum()}")

    # ---------------- CLUSTER 2 (Dhanisha) ----------------
    elif cid == 2 and "pipeline" in bundle:
        print("Cluster 2 → Using PCA+Oversampling tuned model (with threshold)")
        model = bundle["pipeline"]
        feats = bundle["feature_names"]
        thr   = bundle["threshold"]
        X_sub = rows[feats]
        sub_prob = model.predict_proba(X_sub)[:, 1]
        prob[idx] = sub_prob
        pred[idx] = (sub_prob >= thr).astype(int)
        print(f"Cluster 2 → bankrupt = {pred[idx].sum()} (thr={thr})")

    # ---------------- CLUSTER 3 (Lasya) ----------------
    elif cid == 3:
        print("Cluster 3 → Using saved stacking model with stored best threshold")

        # Use the bundle already loaded for this cluster (pipeline + best_threshold)
        # rather than reading the joblib file from disk a second time.
        pipeline = bundle["pipeline"]
        best_t   = bundle["best_threshold"]

        # Ensure 'cluster_id' column exists because pipeline.feature_names_in_ expects it
        if "cluster_id" not in rows.columns:
            rows = rows.assign(cluster_id=rows["Assigned_Cluster"])

        # Extract the exact features used during training. This must run whether
        # or not 'cluster_id' had to be added above.
        feat_3 = list(pipeline.feature_names_in_)

        # Select matching features
        X_sub = rows[feat_3]

        # Predict
        sub_prob = pipeline.predict_proba(X_sub)[:, 1]
        prob[idx] = sub_prob
        pred[idx] = (sub_prob >= best_t).astype(int)

        print(f"Cluster 3 → bankrupt predicted = {pred[idx].sum()} (thr={best_t})")

    # ---------------- CLUSTER 5 (Premlata – Constant Model) ----------------
    elif cid == 5:
        print("Cluster 5 → ConstantModel detected (no predict_proba)")

        # Resolve the estimator from the loaded bundle so this cell does not
        # depend on a variable left over from an earlier kernel session.
        cm_model = _find_estimator(bundle)

        X_sub = rows.drop(columns=["Bankrupt?"], errors="ignore")

        # NOTE(review): the original comment said this model always predicts 1,
        # but the ConstantModel stand-in defined in this notebook returns zeros —
        # confirm which constant is intended.
        sub_pred = np.asarray(cm_model.predict(X_sub)).astype(int)
        pred[idx] = sub_pred
        # No predict_proba available: report a manual 0.99 for predicted
        # positives and 0 otherwise, so prob never contradicts pred.
        prob[idx] = np.where(sub_pred == 1, 0.99, 0.0)

        print(f"Cluster 5 → bankrupt predicted = {pred[idx].sum()}")


    # ---------------- Unexpected case ----------------
    else:
        print("Unexpected model format — marking all 0")
        pred[idx] = 0
        prob[idx] = 0





Total rows in this cluster = 248
Cluster 0 → Using Parth subgroup stacking model
Cluster 0 → bankrupt predicted = 42

Total rows in this cluster = 39
⚠ No model found for cluster 1 → marking bankrupt = 0 for all

Total rows in this cluster = 371
Cluster 2 → Using PCA+Oversampling tuned model (with threshold)
Cluster 2 → bankrupt = 2 (thr=0.01)

Total rows in this cluster = 354
Cluster 3 → Using saved stacking model with stored best threshold
Cluster 3 → bankrupt predicted = 5 (thr=0.9)

Total rows in this cluster = 0
⚠ No model found for cluster 4 → marking bankrupt = 0 for all

Total rows in this cluster = 0
Cluster 5 → ConstantModel detected (no predict_proba)
Cluster 5 → bankrupt predicted = 0


### ============================
### Export Final Submission File
### ============================

In [20]:
# Attach the predictions to the test frame (adds or overwrites "Bankrupt?").
test_df["Bankrupt?"] = pred.astype(int)

# The submission file carries only the row identifier and the predicted label.
out = test_df.loc[:, ["Index", "Bankrupt?"]]
out.to_csv("Group4_Generalization.csv", index=False)

# Confirmation banner followed by a preview of the first rows written.
print(
    "\n==============================================================",
    "     FINAL SUBMISSION SAVED → Group4_Generalization.csv",
    "==============================================================",
    sep="\n",
)
print(out.head())


     FINAL SUBMISSION SAVED → Group4_Generalization.csv
   Index  Bankrupt?
0      0          0
1      1          0
2      2          0
3      3          0
4      4          0


In [21]:
import pandas as pd

# Read the exported file back so the summary reflects what was written to disk.
df_gen = pd.read_csv("Group4_Generalization.csv")

# Count and share of rows flagged as bankrupt (label column is 0/1).
bankrupt_flags = df_gen["Bankrupt?"]
total_bankrupt = bankrupt_flags.sum()
percent_bankrupt = 100 * bankrupt_flags.mean()

print("Total predicted bankrupt:", total_bankrupt)
print("Percentage bankrupt:", percent_bankrupt, "%")


Total predicted bankrupt: 49
Percentage bankrupt: 4.841897233201581 %
