In [14]:
import polars as pl
import numpy as np
import pandas as pd

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [15]:
# 这里需要处理所有6类数据

def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # implement here all desired dtypes for tables
    # the following is just an example
    for col in df.columns:
        # last letter of column name will help you determine the type
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
         
        '''   
        if col[-1] in ("L, M, T"):
            # 如果类型是布尔值，不变
            # 否则，改名col + "_string"
            if df[col].dtype != pl.Boolean:
                df = df.with_columns(pl.col(col).alias(col + "_string"))
        '''        
            
        # 处理D(日期) 转为时间戳
        if col[-1] == 'D':
            # 0 if null, otherwise timestamp.  
            df = df.with_columns(
                pl.when(pl.col(col).is_null())
                    .then(pl.lit(0))
                    .otherwise(pl.col(col).str.to_date( ).dt.timestamp())
                    .alias(col)
            )
            #print(df[col])

    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    #label_encoder = LabelEncoder() # 使用标签编码？似乎不使用效果更好
    
    for col in df.columns:  
        if df[col].dtype.name == 'bool':
            continue
        if (df[col].dtype.name in ['object', 'string']) or (col[-1] in ("L", "M", "T")):
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
            
            #df[col] = label_encoder.fit_transform(df[col])
            
            
    return df

def remove_letters(text):
    text = str(text)
    return int("".join(filter(str.isdigit, text)))

def employment_length(text):
    text = str(text)
    if text == "LESS_ONE":
        return 1
    if text == "MORE_ONE":
        return 2
    if text == "MORE_FIVE":
        return 3
    return 0
    

In [16]:
# 提交的时候要用到这个base_folder
base_folder = "/kaggle/input/home-credit-credit-risk-model-stability/csv_files/"
#  先考虑训练集

# base
train_basetable = pl.read_csv("train/train_base.csv")

# static
train_static = pl.concat(
    [
        pl.read_csv("train/train_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv("train/train_static_0_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)

# static_cb
train_static_cb = pl.read_csv("train/train_static_cb_0.csv").pipe(set_table_dtypes)

# person 这里添加了新文件；数字对应depth
# 例：person1 depth=1; person2 depth=2
train_person_1 = pl.read_csv("train/train_person_1.csv").pipe(set_table_dtypes)
#train_person_2 = pl.read_csv("train/train_person_2.csv").pipe(set_table_dtypes)

# credit_bureau 这里添加了新文件；数字对应depth
#train_credit_bureau_b_1 = pl.read_csv("train/train_credit_bureau_b_1.csv").pipe(set_table_dtypes) 
train_credit_bureau_b_2 = pl.read_csv("train/train_credit_bureau_b_2.csv").pipe(set_table_dtypes) 

In [17]:
test_basetable = pl.read_csv("test/test_base.csv")
test_static = pl.concat(
    [
        pl.read_csv("test/test_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv("test/test_static_0_1.csv").pipe(set_table_dtypes),
        pl.read_csv("test/test_static_0_2.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
test_static_cb = pl.read_csv("test/test_static_cb_0.csv").pipe(set_table_dtypes)
test_person_1 = pl.read_csv("test/test_person_1.csv").pipe(set_table_dtypes) 
test_credit_bureau_b_2 = pl.read_csv("test/test_credit_bureau_b_2.csv").pipe(set_table_dtypes) 

In [18]:
# (1) 先处理train_person_1

train_person_1_feats = train_person_1.group_by("case_id").agg(
    # 1.0 baseline
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed"),
    # 1.1 出生日期 birthdate
    pl.col("birth_259D").first().alias("birth_259D_first"),
    # 1.2 孩子数量 # children
    pl.col("childnum_185L").first().alias("childnum_185L_first"),
    # 1.3 联系人地址邮编 zip code
    pl.col("contaddr_district_15M").first().alias("contaddr_district_15M_string"),
    # 1.4 can match contact address
    pl.col("contaddr_matchlist_1032L").first().alias("contaddr_matchlist_1032L_ismatch"),
    # 1.5 联系地址与工作地址是否相同 addr same with work addr
    pl.col("contaddr_smempladdr_334L").first().alias("contaddr_smempladdr_334L_issame"),
    # 1.7 contact address
    pl.col("contaddr_zipcode_807M").first().alias("contaddr_zipcode_807M_string"),
    # 1.8 education level
    pl.col("education_927M").first().map_elements(remove_letters, pl.Int32).alias("education_927M_first"),
    # 1.9 employment start date
    pl.col("empl_employedfrom_271D").first().alias("empl_employedfrom_271D_first"),
    # 1.10 employment length
    pl.col("empl_employedtotal_800L").first().alias("empl_employedtotal_800L_string"),
    # 1.11 employment industry
    pl.col("empl_industry_691L").first().alias("empl_industry_691L_string"),
    # 1.12 employment district
    pl.col("empladdr_district_926M").first().alias("empladdr_district_926M_string"),
    # 1.13 employment zipcode
    pl.col("empladdr_zipcode_114M").first().alias("empladdr_zipcode_114M_string"),
    # 1.14 family state
    pl.col("familystate_447L").first().alias("familystate_447L_string"),
    # 1.15 gender
    pl.col("gender_992L").first().alias("gender_992L_string"),
    # 1.16 type of housing
    pl.col("housingtype_772L").first().alias("housingtype_772L_string"),
    # 1.17 type of income
    pl.col("incometype_1044T").first().alias("incometype_1044T_string"),
    # 1.18 is reference?
    pl.col("isreference_387L").first().alias("isreference_387L_string"),
    # 1.19 primary language
    pl.col("language1_981M").first().alias("language1_981M_string"),
    # 1.20 amount of main income
    pl.col("mainoccupationinc_384A").first().alias("mainoccupationinc_384A_number"),
    # 1.21 marital status
    pl.col("maritalst_703L").first().alias("maritalst_703L_string"),
    # 1.22 Flag indicating if client is using a flexible product with additional safeguard garanty.
    pl.col("safeguarantyflag_411L").first().alias("safeguarantyflag_411L_first")
)

#
#pl.col("").first().alias(""),

# Here num_group1=0 has special meaning, it is the person who applied for the loan.
train_person_1_feats_2 = train_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

train_person_1_feats.shape

(1526659, 24)

In [19]:
# Here we have num_goup1 and num_group2, so we need to aggregate again.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

In [20]:
# We will process in this examples only A-type and M-type columns, so we need to select them.
# 这里D类型也包括在内
selected_static_cols = []
for col in train_static.columns:
    if col[-1] in ("A", "M", "D", "L", "M", "T"):
        selected_static_cols.append(col)
print(selected_static_cols)

selected_static_cb_cols = []
for col in train_static_cb.columns:
    if col[-1] in ("A", "M", "D", "L", "M", "T"):
        selected_static_cb_cols.append(col)
print(selected_static_cb_cols)

data = train_basetable.join(
    train_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    train_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    train_person_1_feats, how="left", on="case_id"
).join(
    train_person_1_feats_2, how="left", on="case_id"
).join(
    train_credit_bureau_b_2_feats, how="left", on="case_id"
)

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_629L', 'applicationscnt_867L', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'bankacctype_710L', 'cardtype_51L', 'clientscnt12m_3712952L', 'clientscnt3m_3712950L', 'clientscnt6m_3712949L', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_136L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'cntincpaycont9m_3716944L', 'cntpmts24_3658933L', 'commnoinclast6m_3546845L', 'credamount_770A', 'credtype_322L', 'currdebt_22A', 'currdebtcredtyperange_828A', 'datefirstoffer_1144D', 'datelastinstal40dpd_247D', 'datelastunpaid_3546854D', 'daysoverduetolerancedd_3976961L', 'deferredmnthsnum_166L', 'd

In [21]:
test_person_1_feats = test_person_1.group_by("case_id").agg(
    # 1.0 baseline
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed"),
    # 1.1 出生日期 birthdate
    pl.col("birth_259D").first().alias("birth_259D_first"),
    # 1.2 孩子数量 # children
    pl.col("childnum_185L").first().alias("childnum_185L_first"),
    # 1.3 联系人地址邮编 zip code
    pl.col("contaddr_district_15M").first().alias("contaddr_district_15M_string"),
    # 1.4 can match contact address
    pl.col("contaddr_matchlist_1032L").first().alias("contaddr_matchlist_1032L_ismatch"),
    # 1.5 联系地址与工作地址是否相同 addr same with work addr
    pl.col("contaddr_smempladdr_334L").first().alias("contaddr_smempladdr_334L_issame"),
    # 1.7 contact address
    pl.col("contaddr_zipcode_807M").first().alias("contaddr_zipcode_807M_string"),
    # 1.8 education level
    pl.col("education_927M").first().map_elements(remove_letters, pl.Int32).alias("education_927M_first"),
    # 1.9 employment start date
    pl.col("empl_employedfrom_271D").first().alias("empl_employedfrom_271D_first"),
    # 1.10 employment length
    pl.col("empl_employedtotal_800L").first().alias("empl_employedtotal_800L_string"),
    # 1.11 employment industry
    pl.col("empl_industry_691L").first().alias("empl_industry_691L_string"),
    # 1.12 employment district
    pl.col("empladdr_district_926M").first().alias("empladdr_district_926M_string"),
    # 1.13 employment zipcode
    pl.col("empladdr_zipcode_114M").first().alias("empladdr_zipcode_114M_string"),
    # 1.14 family state
    pl.col("familystate_447L").first().alias("familystate_447L_string"),
    # 1.15 gender
    pl.col("gender_992L").first().alias("gender_992L_string"),
    # 1.16 type of housing
    pl.col("housingtype_772L").first().alias("housingtype_772L_string"),
    # 1.17 type of income
    pl.col("incometype_1044T").first().alias("incometype_1044T_string"),
    # 1.18 is reference?
    pl.col("isreference_387L").first().alias("isreference_387L_string"),
    # 1.19 primary language
    pl.col("language1_981M").first().alias("language1_981M_string"),
    # 1.20 amount of main income
    pl.col("mainoccupationinc_384A").first().alias("mainoccupationinc_384A_number"),
    # 1.21 marital status
    pl.col("maritalst_703L").first().alias("maritalst_703L_string"),
    # 1.22 Flag indicating if client is using a flexible product with additional safeguard garanty.
    pl.col("safeguarantyflag_411L").first().alias("safeguarantyflag_411L_first")
)

test_person_1_feats_2 = test_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

data_submission = test_basetable.join(
    test_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    test_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    test_person_1_feats, how="left", on="case_id"
).join(
    test_person_1_feats_2, how="left", on="case_id"
).join(
    test_credit_bureau_b_2_feats, how="left", on="case_id"
)

In [22]:

case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)

cols_pred = []
for col in data.columns:
    #if col[-1].isupper() and col[:-1].islower():
    #    cols_pred.append(col)
    if col in ["case_id", "WEEK_NUM", "target"]:
        continue
    cols_pred.append(col)

print(cols_pred)

def from_polars_to_pandas(case_ids: pl.DataFrame) -> pl.DataFrame:
    return (
        data.filter(pl.col("case_id").is_in(case_ids))[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))[cols_pred].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))["target"].to_pandas()
    )

base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)

['date_decision', 'MONTH', 'amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_629L', 'applicationscnt_867L', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'bankacctype_710L', 'cardtype_51L', 'clientscnt12m_3712952L', 'clientscnt3m_3712950L', 'clientscnt6m_3712949L', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_136L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'cntincpaycont9m_3716944L', 'cntpmts24_3658933L', 'commnoinclast6m_3546845L', 'credamount_770A', 'credtype_322L', 'currdebt_22A', 'currdebtcredtyperange_828A', 'datefirstoffer_1144D', 'datelastinstal40dpd_247D', 'datelastunpaid_3546854D', 'daysoverduetolerancedd_3976961L', '

In [23]:
print(f"Train: {X_train.shape}")
print(f"Valid: {X_valid.shape}")
print(f"Test: {X_test.shape}")
X_train



# ***********************************************************
# 
# ！！！！！！！！！！！！！！！！！！！！！！！！！
# 数据处理部分结束
# 在这之后就可以修改了
# ！！！！！！！！！！！！！！！！！！！！！！！！！
# 
# ***********************************************************


Train: (915995, 225)
Valid: (305332, 225)
Test: (305332, 225)


Unnamed: 0,date_decision,MONTH,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,...,housingtype_772L_string,incometype_1044T_string,isreference_387L_string,language1_981M_string,mainoccupationinc_384A_number,maritalst_703L_string,safeguarantyflag_411L_first,person_housetype,pmts_pmtsoverdue_635A_max,pmts_dpdvalue_108P_over31
0,2019-01-03,201901,,1917.6000,0.0,0.0,0.0,0.0,0.0,0.0,...,,SALARIED_GOVT,,P10_39_147,10800.0,,True,,,
1,2019-01-04,201901,,4937.0000,0.0,0.0,0.0,0.0,0.0,0.0,...,,EMPLOYED,,P10_39_147,14000.0,,True,,,
2,2019-01-02,201901,,3600.0000,0.0,0.0,1.0,0.0,8.0,2.0,...,,PRIVATE_SECTOR_EMPLOYEE,,P209_127_106,64000.0,,True,,,
3,2019-01-03,201901,,3110.8000,0.0,0.0,0.0,0.0,0.0,0.0,...,PARENTAL,,False,a55475b1,,SINGLE,,,,
4,2019-01-03,201901,,1218.0000,0.0,0.0,0.0,0.0,1.0,1.0,...,,SALARIED_GOVT,,P209_127_106,46000.0,,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915990,2020-10-05,202010,104557.25,8218.0000,5293.2,0.0,0.0,0.0,0.0,0.0,...,,SALARIED_GOVT,,P10_39_147,40000.0,,True,,,
915991,2020-10-05,202010,176561.36,3675.4001,0.0,0.0,0.0,0.0,0.0,0.0,...,,RETIRED_PENSIONER,,P209_127_106,40000.0,,True,OWNED,,
915992,2020-10-05,202010,14232.40,7788.8003,0.0,0.0,0.0,0.0,0.0,0.0,...,,PRIVATE_SECTOR_EMPLOYEE,,P209_127_106,30000.0,,False,,,
915993,2020-10-05,202010,197371.58,1195.4000,2827.2,0.0,0.0,36.0,0.0,0.0,...,,RETIRED_PENSIONER,,P209_127_106,30000.0,,False,,,


In [24]:
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}

model_gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)]
)


def frequency_encoding(train, valid, test, column):
    # 计算训练集中每个类别的出现频率
    frequency = train[column].value_counts() / len(train)
    # 映射这些频率到训练集、验证集和测试集
    train[column] = train[column].map(frequency).astype(float)
    valid[column] = valid[column].map(frequency).fillna(0).astype(float)  # 使用0填充验证集中未见过的类别
    test[column] = test[column].map(frequency).fillna(0).astype(float)   # 使用0填充测试集中未见过的类别
    return train, valid, test


# 对每个类别特征应用频率编码
for column in X_train.columns:
    if X_train[column].dtype == 'object' :
        X_train, X_valid, X_test = frequency_encoding(X_train, X_valid, X_test, column)


X_train_encoded = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)
X_valid_encoded = X_valid.apply(pd.to_numeric, errors='coerce').fillna(0)
X_test_encoded = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)


# 创建 XGBoost 模型实例
model_XGBoost = XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1, max_depth=3)

model_XGBoost.fit(X_train_encoded, y_train)

# 训练随机森林模型
model_random_forest = RandomForestClassifier(n_estimators=1000, max_depth=3, random_state=42, n_jobs=-1)
model_random_forest.fit(X_train_encoded, y_train)


# 训练岭回归模型
model_ridge = Ridge(alpha=1.0, random_state=42)

model_ridge.fit(X_train_encoded, y_train)

# 进行预测
for base, X_encoded in [(base_train, X_train_encoded), (base_valid, X_valid_encoded), (base_test, X_test_encoded)]:
    y_pred_random_forest = model_random_forest.predict_proba(X_encoded)[:, 1]
    
    y_pred_ridge = model_ridge.predict(X_encoded)
    y_pred_XGBoost = model_XGBoost.predict_proba(X_encoded)[:, 1]
    base["score"] = y_pred_random_forest + y_pred_ridge + y_pred_XGBoost
    
for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    y_pred = model_gbm.predict(X, num_iteration=model_gbm.best_iteration)
    base["score"] += y_pred
    base["score"] /= 4
print(f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}') 
print(f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}') 
print(f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}')

def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_valid)
stability_score_test = gini_stability(base_test)

print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the valid set is: {stability_score_valid}') 
print(f'The stability score on the test set is: {stability_score_test}')  



Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.770421
[100]	valid_0's auc: 0.789826
[150]	valid_0's auc: 0.797236
[200]	valid_0's auc: 0.801872
[250]	valid_0's auc: 0.805283
[300]	valid_0's auc: 0.80746
[350]	valid_0's auc: 0.809216
[400]	valid_0's auc: 0.810628
[450]	valid_0's auc: 0.811484
[500]	valid_0's auc: 0.812413
[550]	valid_0's auc: 0.813244
[600]	valid_0's auc: 0.814062
[650]	valid_0's auc: 0.8147
[700]	valid_0's auc: 0.815301
[750]	valid_0's auc: 0.815831
[800]	valid_0's auc: 0.81633
[850]	valid_0's auc: 0.816546
[900]	valid_0's auc: 0.816859
Early stopping, best iteration is:
[933]	valid_0's auc: 0.817001


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


The AUC score on the train set is: 0.8110331084948619
The AUC score on the valid set is: 0.8028152321701041
The AUC score on the test set is: 0.8033491868874653
The stability score on the train set is: 0.5951721788322244
The stability score on the valid set is: 0.5757576290872999
The stability score on the test set is: 0.5600917079154909


In [26]:
def frequency_encoding(train, col):
    # 计算训练集中每个类别的出现频率
    freq = train[col].value_counts() / len(train)
    return freq

X_submission = data_submission[cols_pred].to_pandas()
X_submission = convert_strings(X_submission)
categorical_cols = X_train.select_dtypes(include=['category']).columns

X_submission_encode = X_submission.copy()
# 首先对训练数据进行频率编码，并保存每列的频率映射

frequency_mappings = {}
for col in categorical_cols:
    frequency_mappings[col] = frequency_encoding(X_train, col)

# 应用频率编码到训练集和提交数据集
def apply_freq_encoding(data, col, freq_mapping):
    if col in data.columns:
        # 映射频率，并填充那些未在训练集中出现过的类别
        data[col] = data[col].map(freq_mapping).fillna(0)  # 先填充，避免类型问题
    return data



for col in categorical_cols:
    X_train = apply_freq_encoding(X_train, col, frequency_mappings[col])
    X_submission_encode = apply_freq_encoding(X_submission_encode, col, frequency_mappings[col])

# 使用 pd.to_numeric 尝试转换所有列，确保所有数据都是数值类型
X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)
X_submission_encode = X_submission_encode.apply(pd.to_numeric, errors='coerce').fillna(0)

      
# 确保X_submission的列与X_train对齐
X_submission_encode = X_submission_encode.reindex(columns=X_train.columns, fill_value=0)



y_submission_pred_random_forest = model_random_forest.predict_proba(X_submission_encode)[:, 1]
y_submission_pred_gbm = model_gbm.predict(X_submission)
y_submission_pred_ridge = model_ridge.predict(X_submission_encode)
y_submission_pred_XGBoost = model_XGBoost.predict_proba(X_submission_encode)[:, 1]
y_submission_pred = (y_submission_pred_random_forest + y_submission_pred_ridge + y_submission_pred_XGBoost + y_submission_pred_gbm) / 4

In [27]:
submission = pd.DataFrame({
    "case_id": data_submission["case_id"].to_numpy(),
    "score": y_submission_pred
}).set_index('case_id')
submission.to_csv("./submission_catboost_frequent_encode_255.csv")