In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve
from optbinning import OptimalBinning

# ============================================================
# 1. LOAD THE 3 PD_CALIBRATED FILES
# ============================================================

# Parquet files to load, one per segment (seg1..seg3).
segment_files = [
    r'C:\Users\PC\Documents\GitHub\Khoa-luan\Log reg\seg1_pd_calibrated.parquet',
    r"C:\Users\PC\Documents\GitHub\Khoa-luan\Log reg\seg2_pd_calibrated.parquet",
    r"C:\Users\PC\Documents\GitHub\Khoa-luan\Log reg\seg3_pd_calibrated.parquet"
]

# Read every segment file, in the order listed above.
df_list = [pd.read_parquet(segment_path) for segment_path in segment_files]

# Stack the segments row-wise; ignore_index=True gives the combined frame a
# fresh 0..n-1 index instead of repeating each file's own index.
df_total = pd.concat(df_list, axis=0, ignore_index=True)

print("Loaded:", df_total.shape)


# ============================================================
# 2. COMPUTE SCORE FROM PD
# ============================================================

# Scorecard scaling parameters.
PDO = 20      # the score drops by this many points each time pd/(1-pd) doubles
ODDS0 = 50    # value of pd/(1-pd) at which the score equals SCORE0
SCORE0 = 600  # score produced when pd/(1-pd) == ODDS0

# Derived constants of the linear map: score = Offset - Factor * ln(pd / (1 - pd)).
# NOTE(review): with these formulas the anchor is expressed in default (bad:good)
# odds, i.e. SCORE0 is reached at PD = 50/51. Scorecards are often anchored on
# good:bad odds instead -- confirm this is the intended calibration.
Factor = PDO / np.log(2)
Offset = SCORE0 + Factor * np.log(ODDS0)

def pd_to_score(pd):
    """Map a probability of default to a score on the log-odds scale.

    The input is clipped to [1e-6, 1 - 1e-6] so the logarithm stays finite,
    then transformed as ``Offset - Factor * ln(pd / (1 - pd))``; a higher
    probability therefore yields a lower score.

    Parameters
    ----------
    pd : scalar or array-like of probabilities (anything ``np.clip`` accepts).
        The name shadows the module-level pandas alias inside this function;
        it is kept unchanged for caller compatibility.

    Returns
    -------
    Score(s) with the same shape as the input.
    """
    bounded = np.clip(pd, 1e-6, 1-1e-6)
    default_odds = bounded / (1-bounded)
    return Offset - Factor * np.log(default_odds)

# Convert every row's PD into a score and store it as a new SCORE column.
score_values = pd_to_score(df_total["PD"])
df_total["SCORE"] = score_values

# Independent copy used for the BIN / RATING columns and the final export,
# so those later additions do not touch df_total.
df_score_output = df_total.copy(deep=True)


# ============================================================
# 3. OVERALL GINI (TRAIN / OOS / OOT)
# ============================================================

def calculate_gini(y, p):
    """Return the Gini coefficient, computed as ``2 * AUC - 1``.

    Parameters
    ----------
    y : true labels, passed as the first argument to ``roc_auc_score``.
    p : predicted scores/probabilities, passed as the second argument.

    Returns
    -------
    The Gini value derived from the ROC AUC of ``p`` against ``y``.
    """
    return 2 * roc_auc_score(y, p) - 1

# Report the Gini of the calibrated PD separately for each DATA_TYPE value.
for dtype in ("TRAIN", "OOS", "OOT"):
    is_split = df_total["DATA_TYPE"].eq(dtype)
    split_rows = df_total.loc[is_split]
    gini = calculate_gini(split_rows["y"], split_rows["PD"])
    print(f"{dtype}: GINI = {gini:.4f}")


# ============================================================
# 4. KS - CUT-OFF ON TRAIN
# ============================================================

# Rows whose DATA_TYPE is TRAIN; the KS statistic and the binning below use only these.
train_df = df_total.loc[df_total["DATA_TYPE"].eq("TRAIN")]

# ROC curve of PD against the default flag on the TRAIN rows.
fpr, tpr, thr = roc_curve(train_df["y"], train_df["PD"])

# KS is the largest gap between TPR and FPR; the cut-off is the ROC threshold
# at the position where that gap is reached.
tpr_fpr_gap = tpr - fpr
ks_position = np.argmax(tpr_fpr_gap)
ks = tpr_fpr_gap[ks_position]
ks_cut = thr[ks_position]

print("\nKS =", ks, "| Cut-off =", ks_cut)


# ============================================================
# 5. OPTBINNING - FIT ON TRAIN PD
# ============================================================

# Fit an optimal binning of PD against the default flag on the TRAIN rows only,
# constrained to an ascending trend and at most 10 bins.
optb = OptimalBinning(dtype="numerical", monotonic_trend="ascending", max_n_bins=10)
optb.fit(train_df["PD"], train_df["y"])

# Split points separating consecutive bins, as found by the fit.
cuts = optb.splits
print("Cutpoints:", cuts)

# Assign every row (all data types) to a bin index using the TRAIN cutpoints.
# OptimalBinning's numerical bins are left-closed / right-open, [a, b), so
# right=False is required for a PD exactly equal to a cutpoint to land in the
# same bin the fit placed it in. With right=True such a value would fall into
# the lower bin instead.
df_score_output["BIN"] = np.digitize(df_score_output["PD"], cuts, right=False)


# ============================================================
# 6. MAP BIN -> RATING
# ============================================================

# Rating grades, listed in the order they are handed out (lowest mean PD first).
rating_labels = ["AAA","AA+","AA","AA-","A+","A","A-","BBB","BB","B"]

# Mean PD of each observed bin, sorted ascending.
bin_mean_pd = df_score_output.groupby("BIN")["PD"].mean().sort_values()

# The bin with the lowest mean PD receives the first label, the next one the
# second label, and so on.
bin_to_rating = {}
for position, bin_id in enumerate(bin_mean_pd.index):
    bin_to_rating[bin_id] = rating_labels[position]

df_score_output["RATING"] = df_score_output["BIN"].map(bin_to_rating)


# ============================================================
# 7. SUMMARY - BY SCORE RANGE
# ============================================================

# One summary row per rating grade, in rating_labels order. Every row must hold
# exactly one value per entry of summary_columns.
summary_columns = ["RATING", "SCORE_RANGE", "COUNT", "PCT", "MEAN_PD", "DEFAULT_RATE"]
total_count = len(df_score_output)

summary_rows = []
for rating in rating_labels:
    df_r = df_score_output[df_score_output["RATING"] == rating]
    if df_r.empty:
        # Placeholder for a grade with no rows. It previously carried 7 values
        # for 6 columns, which made the DataFrame constructor below raise as
        # soon as any grade was empty.
        summary_rows.append([rating, None, 0, 0, None, None])
        continue

    score_min = df_r["SCORE"].min()
    score_max = df_r["SCORE"].max()
    pd_mean = df_r["PD"].mean()    # average predicted PD within the grade
    def_rate = df_r["y"].mean()    # mean of the y column within the grade

    summary_rows.append([
        rating,
        f"{score_min:.2f} ‚Üí {score_max:.2f}",
        len(df_r),
        len(df_r) / total_count,   # share of all rows in df_score_output
        pd_mean,
        def_rate
    ])

df_rating_summary = pd.DataFrame(summary_rows, columns=summary_columns)

print(df_rating_summary)


# ============================================================
# 8. EXPORT MASTER FILE
# ============================================================

# Write the fully scored frame to a parquet file in the current working
# directory, without the index, then print a confirmation message.
output_file = "FINAL_MODEL_SCORED_ALL.parquet"
df_score_output.to_parquet(output_file, index=False)
print("\n‚úî Xu·∫•t file FINAL_MODEL_SCORED_ALL.parquet th√†nh c√¥ng!")

Loaded: (1740237, 5)
TRAIN: GINI = 0.6676
OOS: GINI = 0.6858
OOT: GINI = 0.6850

KS = 0.516161998500056 | Cut-off = 0.07725416741398945




Cutpoints: [0.00195221 0.00361638 0.00902699 0.02013405 0.03356657 0.06590865
 0.12449665 0.18416011 0.27048425]
  RATING      SCORE_RANGE   COUNT       PCT   MEAN_PD  DEFAULT_RATE
0    AAA  892.84 ‚Üí 938.79  146583  0.084232  0.001123      0.001044
1    AA+  875.00 ‚Üí 892.83  108375  0.062276  0.002837      0.003091
2     AA  848.45 ‚Üí 875.00  242294  0.139230  0.005905      0.006199
3    AA-  824.97 ‚Üí 848.44  246007  0.141364  0.014125      0.013975
4     A+  809.83 ‚Üí 824.97  178195  0.102397  0.026183      0.025741
5      A  789.38 ‚Üí 809.83  211833  0.121727  0.048107      0.047434
6     A-  769.16 ‚Üí 789.38  256994  0.147678  0.093657      0.092940
7    BBB  755.82 ‚Üí 769.16  162797  0.093549  0.151477      0.152914
8     BB  741.51 ‚Üí 755.82  107816  0.061955  0.220227      0.214950
9      B  677.17 ‚Üí 741.50   79343  0.045593  0.375097      0.375723

‚úî Xu·∫•t file FINAL_MODEL_SCORED_ALL.parquet th√†nh c√¥ng!


In [2]:
# # ============================================================
# # STEP 11 - KS STATISTIC (FIND CUT-OFF)
# # ============================================================
# from sklearn.metrics import roc_curve
# from optbinning import OptimalBinning

# print("\n[B∆Ø·ªöC 11] T√¨m cut-off b·∫±ng KS...")

# fpr, tpr, thresholds = roc_curve(y_train_full, pd_train)
# ks_values = tpr - fpr

# ks_max = ks_values.max()
# ks_cutoff = thresholds[np.argmax(ks_values)]

# print(f" ‚Üí KS max = {ks_max:.4f}")
# print(f" ‚Üí Cut-off t·ªëi ∆∞u = {ks_cutoff:.4f}")


# # ============================================================
# # STEP 12 - BUILD THE RATING SCALE (OPT BINNING)
# # ============================================================
# print("\n[B∆Ø·ªöC 12] X√¢y thang h·∫°ng b·∫±ng OptBinning...")

# optb = OptimalBinning(
#     name="RatingScale",
#     dtype="numerical",
#     monotonic_trend="ascending",   # PD tƒÉng ‚Üí r·ªßi ro tƒÉng
#     max_n_bins=10
# )

# # Fit tr√™n TRAIN
# optb.fit(pd_train, y_train_full)

# bin_table = optb.binning_table.build()
# print(bin_table)


# # ============================================================
# # STEP 13 - GET CUTPOINTS AND ASSIGN BINS
# # ============================================================
# print("\n[B∆Ø·ªöC 13] √Åp d·ª•ng rating cho to√†n b·ªô dataset...")

# cuts = optb.splits
# print("Cutpoints:", cuts)

# df_score_output["BIN"] = np.digitize(df_score_output["PD"], cuts, right=True)


# # ============================================================
# # STEP 14 - ASSIGN THE 10 RATINGS (AAA -> B)
# # ============================================================
# print("\n[B∆Ø·ªöC 14] Mapping BIN ‚Üí RATING...")

# bin_mean_pd = df_score_output.groupby("BIN")["PD"].mean().sort_values()


# rating_labels = ["AAA","AA+","AA","AA-","A+","A","A-","BBB","BB","B"]

# sorted_bins = bin_mean_pd.index.tolist()

# bin_to_rating = {b: rating_labels[i] for i, b in enumerate(sorted_bins)}

# df_score_output["RATING"] = df_score_output["BIN"].map(bin_to_rating)


# print(df_score_output[["SEGMENT", "PD", "SCORE", "BIN", "RATING"]].head())


# # ============================================================
# # STEP 15 - MONOTONICITY CHECK
# # ============================================================
# print("\n[B∆Ø·ªöC 15] Ki·ªÉm tra t√≠nh ƒë∆°n ƒëi·ªáu c·ªßa PD theo Rating...")

# rating_pd = df_score_output.groupby("RATING")["PD"].mean().reindex(rating_labels)
# print(rating_pd)

# is_monotonic = rating_pd.is_monotonic_increasing
# print(" ‚Üí PD c√≥ ƒë∆°n ƒëi·ªáu kh√¥ng? ‚Üí", is_monotonic)


# # ============================================================
# # STEP 16 - CHECK RISK DIFFERENTIATION BETWEEN RATINGS
# # ============================================================
# print("\n[B∆Ø·ªöC 16] Ki·ªÉm tra ƒë·ªô t√°ch b·∫°ch gi·ªØa c√°c rating...")

# rating_diff = rating_pd.diff()
# print(rating_diff)


# # ============================================================
# # STEP 17 - CHECK CONCENTRATION LEVEL
# # ============================================================
# print("\n[B∆Ø·ªöC 17] Ki·ªÉm tra m·ª©c ƒë·ªô t·∫≠p trung kh√°ch h√†ng...")

# rating_dist = df_score_output["RATING"].value_counts(normalize=True)
# print(rating_dist)


# # ============================================================
# # STEP 18 - COMPUTE HHI
# # ============================================================
# print("\n[B∆Ø·ªöC 18] T√≠nh Herfindahl-Hirschman Index (HHI)...")

# HHI = np.sum(rating_dist ** 2)
# print(f" ‚Üí HHI = {HHI:.6f}")

# if HHI < 0.10:
#     print(" ‚Üí Ph√¢n b·ªë rating R·∫§T T·ªêT (r·∫•t ƒë·ªÅu).")
# elif HHI < 0.18:
#     print(" ‚Üí Ph√¢n b·ªë rating CH·∫§P NH·∫¨N ƒê∆Ø·ª¢C.")
# else:
#     print(" ‚Üí T·∫≠p trung QU√Å M·ª®C ‚Üí n√™n xem l·∫°i c√°ch binning.")


# # ============================================================
# # STEP 19 - EXPORT THE FINAL TABLE
# # ============================================================
# print("\n[B∆Ø·ªöC 19] Xu·∫•t b·∫£ng cu·ªëi c√πng...")

# df_final_rating = df_score_output.copy()
# display(df_final_rating.head())
    
# print("‚Üí Ho√†n t·∫•t to√†n b·ªô Rating Pipeline!")


In [3]:
# # ============================================================
# # FINAL RATING SCALE - SUMMARY TABLE (BY SCORE RANGE)
# # ============================================================

# print("\n[B·∫¢NG T·ªîNG H·ª¢P THANG H·∫†NG ‚Äî KHO·∫¢NG ƒêI·ªÇM]")

# summary_list = []

# for stt, rating in enumerate(rating_labels, start=1):

#     df_r = df_score_output[df_score_output["RATING"] == rating]

#     if df_r.empty:
#         summary_list.append([
#             stt,
#             rating,
#             "",
#             0,
#             0,
#             None,
#             None
#         ])
#         continue

#     # üîπ KHO·∫¢NG ƒêI·ªÇM (SCORE RANGE)
#     score_min = df_r["SCORE"].min()
#     score_max = df_r["SCORE"].max()
#     score_range = f"{score_min:.2f} ‚Üí {score_max:.2f}"

#     # S·ªë l∆∞·ª£ng
#     count = len(df_r)
#     pct = count / len(df_score_output)

#     # PD ∆∞·ªõc t√≠nh trung b√¨nh
#     pd_est = df_r["PD"].mean()

#     # T·ª∑ l·ªá v·ª° n·ª£ th·ª±c t·∫ø
#     default_rate = df_r["y"].mean() if "y" in df_r.columns else None

#     summary_list.append([
#         stt,
#         rating,
#         score_range,
#         count,
#         pct,
#         pd_est,
#         default_rate
#     ])

# # T·∫°o DataFrame
# df_rating_summary = pd.DataFrame(summary_list, columns=[
#     "STT",
#     "H·∫°ng",
#     "Kho·∫£ng ƒëi·ªÉm (Score)",
#     "S·ªë l∆∞·ª£ng quan s√°t",
#     "S·ªë l∆∞·ª£ng quan s√°t (%)",
#     "PD ∆∞·ªõc t√≠nh trung b√¨nh",
#     "T·ª∑ l·ªá v·ª° n·ª£ th·ª±c t·∫ø"
# ])

# # Format
# df_rating_summary["S·ªë l∆∞·ª£ng quan s√°t (%)"] = df_rating_summary["S·ªë l∆∞·ª£ng quan s√°t (%)"].apply(lambda x: f"{x:.2%}")
# df_rating_summary["PD ∆∞·ªõc t√≠nh trung b√¨nh"] = df_rating_summary["PD ∆∞·ªõc t√≠nh trung b√¨nh"].apply(lambda x: None if pd.isna(x) else round(x, 6))
# df_rating_summary["T·ª∑ l·ªá v·ª° n·ª£ th·ª±c t·∫ø"] = df_rating_summary["T·ª∑ l·ªá v·ª° n·ª£ th·ª±c t·∫ø"].apply(lambda x: None if pd.isna(x) else round(x, 6))

# display(df_rating_summary)

# print("\n‚Üí ƒê√£ d·ª±ng xong b·∫£ng thang h·∫°ng cu·ªëi c√πng (kho·∫£ng ƒëi·ªÉm).")
