In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import shap
import datetime
from sklearn.feature_selection import VarianceThreshold



In [None]:
def convert_work_year(x):
    """Convert a textual employment-length label into a number of years.

    Recognized labels are ``"< 1 year"`` (0), ``"1 year"`` (1),
    ``"10+ years"`` (10) and ``"<N> years"`` (N). Leading and trailing
    whitespace is ignored.

    Parameters
    ----------
    x : object
        Raw cell value; anything that is not a non-blank string is treated
        as missing.

    Returns
    -------
    int or None
        The year count, or ``None`` when the value is missing or does not
        match any recognized label.
    """
    if not isinstance(x, str):
        return None

    # Normalize once so that padded labels such as " 3 years " are parsed
    # instead of crashing in int("") or being silently dropped.
    label = x.strip()
    if not label:
        return None

    fixed_labels = {"< 1 year": 0, "1 year": 1, "10+ years": 10}
    if label in fixed_labels:
        return fixed_labels[label]

    if label.endswith(" years"):
        try:
            return int(label.split(" ")[0])
        except ValueError:
            # Non-numeric prefix (e.g. "many years"): treat as unparseable.
            return None

    return None

# English three-letter month abbreviations mapped to their month number.
_MONTH_NUMBERS = {
    "Jan": 1,
    "Feb": 2,
    "Mar": 3,
    "Apr": 4,
    "May": 5,
    "Jun": 6,
    "Jul": 7,
    "Aug": 8,
    "Sep": 9,
    "Oct": 10,
    "Nov": 11,
    "Dec": 12,
}


def convert_month(x):
    """Split a ``"<a>-<b>"`` month label into a ``(year, month)`` pair.

    Three layouts are recognized:

    * ``"<digits>-<Mon>"`` -> year ``2000 + digits``
    * ``"<Mon>-00"``       -> year ``2000``
    * ``"<Mon>-<digits>"`` -> year ``1900 + digits``

    ``<Mon>`` must be one of the abbreviations in ``_MONTH_NUMBERS``
    (case-sensitive). Whitespace around either token is ignored.

    Parameters
    ----------
    x : object
        Raw cell value.

    Returns
    -------
    tuple
        ``(year, month)`` as integers, or ``(None, None)`` when ``x`` is not
        a string, does not consist of exactly two dash-separated tokens, or
        contains a token that cannot be parsed.
    """
    if not isinstance(x, str):
        return None, None

    tokens = [token.strip() for token in x.split("-")]
    if len(tokens) != 2:
        return None, None
    first, second = tokens

    try:
        if first.isdigit():
            year, month_name = 2000 + int(first), second
        elif second == "00":
            year, month_name = 2000, first
        else:
            year, month_name = 1900 + int(second), first
    except ValueError:
        # Year token is not a plain integer; report it like any other
        # malformed value instead of raising.
        return None, None

    month = _MONTH_NUMBERS.get(month_name)
    if month is None:
        return None, None
    return year, month

def preprocess_dataset(dataset_path):
    """Load the raw CSV and return a standardized, SMOTE-resampled table.

    Processing order:

    1. Fill missing numeric values with the column median.
    2. Label-encode ``employer_type``, ``industry`` and ``class`` in place and
       append one-hot columns named ``<column>_<i>`` for each of them.
    3. Expand ``issue_date`` (``%Y/%m/%d``) into year / month / day and the
       number of days elapsed until now, then drop the original column.
    4. Add ``total_loan_per_year`` and ``monthly_payment_per_thousand``.
    5. Convert ``work_year`` with ``convert_work_year`` and interpolate gaps.
    6. Split ``earlies_credit_mon`` with ``convert_month`` into
       ``earlies_credit_year`` / ``earlies_credit_month`` and drop the original.
    7. Drop zero-variance features, standardize the rest, and oversample
       with SMOTE (``random_state=42``).

    Parameters
    ----------
    dataset_path : str or path-like
        Location of the CSV file; it must contain the columns named above
        plus ``total_loan``, ``year_of_loan``, ``monthly_payment`` and
        ``isDefault``.

    Returns
    -------
    pandas.DataFrame
        The resampled feature columns followed by ``isDefault``.
    """
    train_data = pd.read_csv(dataset_path)

    # The frame still contains string columns at this point; without
    # numeric_only=True pandas >= 2.0 raises TypeError when computing medians.
    train_data = train_data.fillna(train_data.median(numeric_only=True))

    categorical_columns = ["employer_type", "industry", "class"]

    label_encoder = LabelEncoder()
    for column in categorical_columns:
        train_data[column] = label_encoder.fit_transform(train_data[column])

    # One-hot columns are appended next to the label-encoded originals.
    ohe = OneHotEncoder()
    one_hot_frames = []
    for column in categorical_columns:
        encoded = ohe.fit_transform(train_data[[column]]).toarray()
        one_hot_frames.append(
            pd.DataFrame(
                encoded,
                columns=[f"{column}_{i}" for i in range(encoded.shape[1])],
                index=train_data.index,
            )
        )
    train_data = pd.concat([train_data] + one_hot_frames, axis=1)

    # Date features: parse the column once and reuse the parsed series.
    issue_date = pd.to_datetime(train_data["issue_date"], format="%Y/%m/%d")
    train_data["issue_year"] = issue_date.dt.year
    train_data["issue_month"] = issue_date.dt.month
    train_data["issue_day"] = issue_date.dt.day
    train_data["issue_date_days"] = (datetime.datetime.now() - issue_date).dt.days

    # Feature engineering
    train_data["total_loan_per_year"] = (
        train_data["total_loan"] / train_data["year_of_loan"]
    )
    train_data["monthly_payment_per_thousand"] = train_data["monthly_payment"] / (
        train_data["total_loan"] / 1000
    )

    # Convert in place. The column must not be concatenated back onto the
    # frame: a second "work_year" column makes the label-based selection
    # below return every duplicate once per match.
    # limit_direction="both" also fills gaps at the very start of the column,
    # which plain forward interpolation would leave as NaN.
    train_data["work_year"] = (
        train_data["work_year"]
        .apply(convert_work_year)
        .interpolate(limit_direction="both")
    )
    train_data = train_data.drop("issue_date", axis=1)

    credit_parts = pd.DataFrame(
        train_data["earlies_credit_mon"].apply(convert_month).tolist(),
        columns=["earlies_credit_year", "earlies_credit_month"],
        index=train_data.index,
    )
    train_data[["earlies_credit_year", "earlies_credit_month"]] = credit_parts
    train_data = train_data.drop("earlies_credit_mon", axis=1)

    # Feature selection: drop columns whose variance is zero.
    features = train_data.drop("isDefault", axis=1)
    selector = VarianceThreshold(threshold=0.0)
    selector.fit(features)
    selected_features = features.columns[selector.get_support()]
    train_data = train_data[["isDefault"] + list(selected_features)]

    # Standardization
    features = train_data.drop("isDefault", axis=1)
    scaler = StandardScaler()
    scaled_features = pd.DataFrame(
        scaler.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )
    train_data = pd.concat([scaled_features, train_data["isDefault"]], axis=1)

    # Class imbalance handling.
    # NOTE(review): the whole table is resampled here, so any later
    # train/test split will contain synthetic rows on both sides - confirm
    # this is intended.
    sm = SMOTE(random_state=42)
    train_X, train_y = sm.fit_resample(
        train_data.drop("isDefault", axis=1), train_data["isDefault"]
    )
    train_data = pd.concat(
        [pd.DataFrame(train_X), pd.Series(train_y, name="isDefault")], axis=1
    )

    return train_data

In [None]:
# Reuse the cached preprocessed table when it exists; otherwise build it from
# the raw CSV and write the cache for the next run.
dataset_path = "./train_public.csv"
if os.path.exists("./preprocessed_train_public.csv"):
    train_data = pd.read_csv("./preprocessed_train_public.csv")
else:
    train_data = preprocess_dataset(dataset_path)
    train_data.to_csv("./preprocessed_train_public.csv", index=False)