In [8]:
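# - Fit logistic regression models with statsmodels.Logit
# - Build the interaction variables from a predefined list of variable pairs (taken from the provided figure)
# - Report for each interaction term: p-value, coefficient, AUC, F1, Recall, Precision, and the F1-optimal threshold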

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import statsmodels.api as sm

# === Step 1: Load Data ===
# Column used as the response by the models fitted further down.
target_col = "loan_status"

df = pd.read_csv("Final_v3.csv")

# === Step 2: Preprocessing ===
from sklearn.preprocessing import LabelEncoder

# Swap the state labels for the integer codes assigned by the encoder. The
# fitted encoder stays available as `le` in case the codes need mapping back.
le = LabelEncoder()
state_codes = le.fit_transform(df["addr_state"])
df["addr_state"] = state_codes

# Mean-impute the unemployment rate. This is the only column imputed here.
unemployment = df["unemployment_rate"]
df["unemployment_rate"] = unemployment.fillna(unemployment.mean())

# === Step 3: Define Interaction Items ===
# Keyed by the name of the derived column, so every interaction name is unique
# by construction; the value is the pair of source columns to multiply.
_interaction_pairs = {
    "int_rate_term": ("avg_interest_rate", "term"),
    "unemp_dti": ("unemployment_rate", "dti"),
    "inflation_install": ("implicit_regional_price_deflator", "installment"),
    "edu_inc": (
        "Population 25 years and over with Bachelor degree or higher",
        "annual_inc",
    ),
    "poverty_term": ("Poverty Rate", "term"),
    "price_purpose": ("regional_price_parities_rpps_all_items", "purpose"),
    "uninsured_dti": ("Percent Uninsured", "dti"),
    "income_bal": ("state_real_per_capita_personal_income", "revol_bal"),
    "inq_gdp": ("inq_last_6mths", "state_gdp_growth__summary_"),
    "emp_growth": ("emp_length", "state_total_employment_growth"),
    "inq_dti": ("inq_last_6mths", "dti"),
    "install_util": ("installment", "revol_util"),
    "emp_inc": ("emp_length", "annual_inc"),
    "loan_bal": ("loan_amnt", "revol_bal"),
    "term_util": ("term", "revol_util"),
}

# Flatten to (first column, second column, derived column name) triples,
# preserving the declaration order above.
interaction_items = [
    (first_col, second_col, derived_name)
    for derived_name, (first_col, second_col) in _interaction_pairs.items()
]

results = []

# Candidate cut-offs 0.10, 0.15, ..., 0.85 (the same 16 points the float-step
# np.arange(0.1, 0.9, 0.05) grid targets). linspace + round yields exact
# two-decimal values, so the reported threshold is 0.15 rather than
# 0.15000000000000002.
thresholds = np.round(np.linspace(0.10, 0.85, 16), 2)


def _best_f1_threshold(y_true, y_prob, candidates):
    """Return the candidate threshold that maximises F1.

    Args:
        y_true: True labels, passed straight to ``f1_score``.
        y_prob: Predicted probabilities; values ``>= threshold`` are
            classified as 1, everything else as 0.
        candidates: Iterable of thresholds to try, in order.

    Returns:
        The first threshold reaching the highest F1 as a plain ``float``,
        or 0.5 when no candidate yields an F1 above zero.
    """
    best_f1, best_thresh = 0.0, 0.5
    for t in candidates:
        f1 = f1_score(y_true, (y_prob >= t).astype(int), zero_division=0)
        # Strict comparison keeps the earliest threshold on ties.
        if f1 > best_f1:
            best_f1, best_thresh = f1, float(t)
    return best_thresh


# === Step 4: Loop Through Interactions ===
# NOTE(review): each model is scored on the same rows it was fitted on, and the
# threshold is tuned on those rows as well, so AUC/F1/Recall/Precision are
# in-sample figures. No train/test split is applied in this loop.
for var1, var2, new_var in interaction_items:
    try:
        # Feature construction lives inside the try so that a missing or
        # non-numeric column skips this interaction (as the message below
        # says) instead of aborting the remaining ones.
        df_copy = df[[var1, var2, target_col]].copy()
        df_copy[new_var] = df[var1] * df[var2]

        # Design matrix: intercept, both main effects and their product.
        X = sm.add_constant(df_copy[[var1, var2, new_var]])
        y = df_copy[target_col]

        model = sm.Logit(y, X).fit(disp=0)
        y_pred_prob = model.predict(X)

        # Classify with the F1-optimal threshold found on the grid.
        best_thresh = _best_f1_threshold(y, y_pred_prob, thresholds)
        y_pred_final = (y_pred_prob >= best_thresh).astype(int)

        results.append({
            "交互变量": new_var,
            "变量1": var1,
            "变量2": var2,
            # Series.get returns None when the term is absent from the fit.
            "P值_交互项": model.pvalues.get(new_var),
            "系数_交互项": model.params.get(new_var),
            "AUC": roc_auc_score(y, y_pred_prob),
            # zero_division=0 on every metric: a model predicting no
            # positives scores 0 quietly instead of emitting warnings.
            "F1": f1_score(y, y_pred_final, zero_division=0),
            "Recall": recall_score(y, y_pred_final, zero_division=0),
            "Precision": precision_score(y, y_pred_final, zero_division=0),
            "最佳阈值": best_thresh,
        })
    except Exception as e:
        # Best-effort: report the failing interaction and move on.
        print(f"[跳过] 交互项 {new_var} 出错: {e}")

# === Step 5: Show Results ===
# Collect the per-interaction rows and show them ranked by F1, best first.
result_df = pd.DataFrame(results)
if result_df.empty:
    # With no successful fit the frame has no columns at all, so sorting by
    # "F1" would raise KeyError; report the situation instead.
    print("[提示] 没有成功拟合的交互项，无结果可显示")
else:
    display(result_df.sort_values(by="F1", ascending=False))


Unnamed: 0,交互变量,变量1,变量2,P值_交互项,系数_交互项,AUC,F1,Recall,Precision,最佳阈值
0,int_rate_term,avg_interest_rate,term,1.559675e-09,-0.035232,0.595759,0.320405,0.430242,0.255244,0.15
4,poverty_term,Poverty Rate,term,0.6832692,-0.002427,0.591963,0.320405,0.430242,0.255244,0.15
14,term_util,term,revol_util,3.544991e-07,-0.031122,0.605892,0.320401,0.430242,0.255238,0.2
10,inq_dti,inq_last_6mths,dti,8.597274e-05,-0.023848,0.583155,0.301014,0.697928,0.191887,0.15
1,unemp_dti,unemployment_rate,dti,0.1777282,0.008758,0.566434,0.293374,0.716435,0.184453,0.15
6,uninsured_dti,Percent Uninsured,dti,0.05092669,-0.012694,0.567031,0.293038,0.711419,0.184522,0.15
3,edu_inc,Population 25 years and over with Bachelor deg...,annual_inc,0.06107061,-0.020205,0.551463,0.291962,0.876155,0.175167,0.15
12,emp_inc,emp_length,annual_inc,0.1390408,-0.015596,0.551231,0.291411,0.87062,0.174992,0.15
13,loan_bal,loan_amnt,revol_bal,0.002178419,-0.0257,0.559324,0.288381,0.779223,0.17693,0.15
11,install_util,installment,revol_util,0.0002154332,-0.023325,0.540546,0.287946,0.812502,0.174978,0.15
