In [None]:
import os

import joblib
import numpy as np
import pandas as pd

# -----------------------------
# STEP-1 Load test dataset
# -----------------------------

In [None]:
# Read the test set from the working directory and report its (rows, columns).
test_df = pd.read_csv(filepath_or_buffer="test_data.csv")
print("\nTEST SHAPE:", test_df.shape, sep=" ")

# --------------------------------------
# STEP-2 Load cluster-ID model and predict clusters
# --------------------------------------

In [None]:
# The cluster bundle is a mapping that carries both the model and the list of
# columns it is fed ("feature_cols").
cluster_bundle = joblib.load("models/cluster_id_model.joblib")
cluster_feats, cluster_model = (
    cluster_bundle[key] for key in ("feature_cols", "model")
)

# Assign every test row to a cluster; the prediction is stored on test_df
# itself as the "cluster_id_pred" column.
X_cluster = test_df[cluster_feats]
test_df["cluster_id_pred"] = cluster_model.predict(X_cluster)

# Show how many rows landed in each predicted cluster.
print(
    "\nCluster distribution:\n",
    test_df["cluster_id_pred"].value_counts(),
)

# -------------------------------------
# STEP-3 Load subgroup stacking models
# -------------------------------------

In [None]:
# Predicted cluster id -> path of the joblib bundle holding that cluster's
# bankruptcy model. The entries for clusters 1 and 3 are commented out, so no
# model is loaded for them.
subgroup_model_paths = {
    0: "models/subgroup0_stacking_bundle.joblib",
    #1: "models/subgroup1_stacking_bundle.joblib",
    2: "models/subgroup2_cluster2_model.joblib",
    #3: "models/subgroup3_stacking_bundle.joblib",
}

# cluster id -> {"model": <model object>, "features": <columns fed to it>}
# Only clusters whose bundle file is present on disk end up in this dict.
subgroups = {}

for cid, path in subgroup_model_paths.items():
    # `os` comes from the notebook's import cell; without it this check raises
    # NameError on a fresh kernel.
    if not os.path.exists(path):
        print(f"⚠ No model for cluster {cid} — using constant 0 predictor")
        continue

    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only load bundles from a trusted source.
    bundle = joblib.load(path)
    subgroups[cid] = {
        "model": bundle["model"],
        "features": bundle["selected_features"],
    }
    print(f"✔ Loaded Subgroup Model for Cluster {cid}")

# --------------------------------------
# STEP-4 Predict bankruptcy per subgroup
# --------------------------------------

In [None]:
# Decision constants for this stage.
DECISION_THRESHOLD = 0.5       # score at/above which a firm is flagged bankrupt
MAX_BANKRUPT_FRACTION = 0.20   # at most this share of test rows may be flagged

n_rows = len(test_df)
y_pred = np.zeros(n_rows, dtype=int)   # raw 0/1 labels before the cap
probs_all = np.zeros(n_rows)           # per-row scores, used to rank under the cap

# `.indices` yields *positional* row numbers per cluster, which is what the
# numpy arrays above need. (`.groups` yields index labels, which only coincide
# with positions when test_df has a default RangeIndex.)
for cid, pos in test_df.groupby("cluster_id_pred").indices.items():
    if cid not in subgroups:
        # No model for this cluster: both arrays already hold 0 for these rows.
        print(f"Cluster {cid}: No model → forced bankrupt=0")
        continue

    model = subgroups[cid]["model"]
    feats = subgroups[cid]["features"]
    rows = test_df.iloc[pos]
    X_sub = rows[feats]

    if hasattr(model, "predict_proba"):
        # Assumes a binary classifier whose second column is the positive
        # (bankrupt) class — TODO confirm against the training notebook.
        p = model.predict_proba(X_sub)[:, 1]
    else:
        # Fallback: hard predictions are used directly as the score.
        p = model.predict(X_sub)

    probs_all[pos] = p
    y_pred[pos] = (probs_all[pos] >= DECISION_THRESHOLD).astype(int)

    print(f"Cluster {cid}: Pred bankrupt = {y_pred[pos].sum()}")

# ================================================================
#  🔥 20% Bankruptcy Output Rule (Auto-Enforced)
# ================================================================
max_allowed = int(MAX_BANKRUPT_FRACTION * n_rows)   # cap on flagged rows
n_flagged = int(y_pred.sum())

if n_flagged > max_allowed:
    print(f"\n⚠ {n_flagged} bankrupt predicted > limit {max_allowed}")
    print(
        "Applying confidence reduction → keeping only top "
        f"{MAX_BANKRUPT_FRACTION:.0%} risky firms."
    )

    # Rank by descending score and slice from the front. With max_allowed == 0
    # `[:0]` is empty, whereas an ascending sort sliced with `[-0:]` would
    # select every row. The stable sort makes tie-breaking deterministic.
    top_idx = np.argsort(-probs_all, kind="stable")[:max_allowed]
    y_final = np.zeros(n_rows, dtype=int)
    y_final[top_idx] = 1
else:
    print(f"\n🟢 Bankruptcy count {n_flagged}/{max_allowed} within allowed limit.")
    y_final = y_pred.copy()

test_df["Bankrupt?"] = y_final

print("\nFINAL BANKRUPTCY COUNT:", test_df["Bankrupt?"].sum(),
      f" (Capped at {max_allowed})")


# --------------------------------------
# STEP-5 Create submission file
# --------------------------------------

In [None]:
# Keep only the row identifier and the final label, then write them out
# without the DataFrame index.
submission = test_df.loc[:, ["Index", "Bankrupt?"]]
submission.to_csv(path_or_buf="Group4_Generalization.csv", index=False)

# Confirmation banner followed by a preview of the first rows.
print(
    "\n=================================================",
    " ðŸ“„ FINAL OUTPUT SAVED â†’ Group4_Generalization.csv",
    "=================================================",
    sep="\n",
)
print(submission.head())