In [1]:
import os

import pandas as pd

In [2]:
def filter_files(prefix, suffix, files):
    """Select the names in ``files`` that begin with ``prefix`` and end with ``suffix``.

    Args:
        prefix: String each kept name must start with.
        suffix: String each kept name must end with.
        files: Iterable of file-name strings.

    Returns:
        A new list with the matching names, in their original order.
    """
    selected = []
    for name in files:
        if not name.startswith(prefix):
            continue
        if name.endswith(suffix):
            selected.append(name)
    return selected

In [None]:
# Directory holding the per-model result workbooks.
results_dir = "./results/"

# os.listdir returns entries in arbitrary order; sort them so the listing (and
# everything derived from it, e.g. dict insertion order) is the same on every run.
files = sorted(os.listdir(results_dir))
files

In [None]:
# Keep only the result workbooks: names starting with "res" and ending in "_2ap.xlsx".
res_2ap = filter_files(prefix="res", suffix="_2ap.xlsx", files=files)
res_2ap

In [None]:
import re


def extract_model_name(filename):
    """Pull the underscore-delimited model name out of a result file name.

    The pattern ``_(\\w+)_`` is searched in ``filename``. Because ``\\w`` also
    matches underscores and the quantifier is greedy, the captured text runs up
    to the last underscore of that word-character run, so
    ``"res_extra_trees_2ap.xlsx"`` yields ``"extra_trees"``.

    Args:
        filename: File name to inspect.

    Returns:
        The captured text, or None when the pattern does not occur.
    """
    found = re.search(r"_(\w+)_", filename)
    if found is None:
        return None
    return found.group(1)


# Load every sheet of each matching workbook, keyed by the model name parsed
# from its file name. sheet_name=None makes read_excel return a dict that maps
# each sheet name to its DataFrame.
data_dict = {}
for f in res_2ap:
    model_name = extract_model_name(f)
    if model_name is None:
        # A name the pattern cannot parse would otherwise be stored under the
        # key None and pollute every later per-model lookup.
        print(f"Skipping {f!r}: no model name found in the file name")
        continue
    # os.path.join works whether or not results_dir ends with a separator.
    data_dict[model_name] = pd.read_excel(
        os.path.join(results_dir, f), sheet_name=None
    )

len(data_dict)

In [None]:
# Inspect the "best_params" sheet loaded from the catboost results workbook.
data_dict["catboost"]["best_params"]

In [None]:
# Inspect the "f1" sheet loaded from the catboost results workbook.
data_dict["catboost"]["f1"]

In [None]:
# NOTE: despite the "r2" in these names, the score is the mean of the "test"
# column of each model's "f1" sheet. The names are kept because later cells
# read mean_r2_dict / best_model / best_params.
mean_r2_dict = {
    model: sheets["f1"]["test"].mean() for model, sheets in data_dict.items()
}

# Start below any possible score so a model is still selected when every mean
# is 0 (a threshold of 0 would leave best_model / best_params as None).
best_mean_r2 = float("-inf")
best_model = None
for model, mean_r2 in mean_r2_dict.items():
    # Strict ">" keeps the first model on ties; NaN means never win.
    if mean_r2 > best_mean_r2:
        best_mean_r2 = mean_r2
        best_model = model

# Build the parameter dict once, for the winner only: the "best_params" sheet
# is turned into a {Parameter: Value} mapping.
best_params = (
    None
    if best_model is None
    else data_dict[best_model]["best_params"].set_index("Parameter")["Value"].to_dict()
)

mean_r2_dict

In [9]:
# mean_r2_dict["lightgbm"] = 0.92

In [10]:
# Replace missing values (NaN/NA, as detected by pd.isna) with None, in place.
# The replacement mapping is built completely before the dict is updated.
best_params.update({k: None for k, v in best_params.items() if pd.isna(v)})

In [None]:
from modeling_old import clear_column_names


def load_data(path="./df_2ap_final.csv"):
    """Read a CSV with a two-row header and split it into features and target.

    Args:
        path: CSV file to read. Its first two rows are used as a two-level
            column header and no column is used as the index.

    Returns:
        A tuple ``(X, y)``: ``y`` is the ``("mcs_nss", "_")`` column and ``X``
        is the frame without that column, with its column labels replaced by
        the result of ``clear_column_names(X)``.
    """
    target = ("mcs_nss", "_")
    frame = pd.read_csv(path, header=[0, 1], index_col=None)

    features = frame.drop(columns=[target])
    features.columns = clear_column_names(features)

    return features, frame[target]


# First call uses load_data's default path; the second reads the separate test CSV.
X, y = load_data(path="./df_2ap_final.csv")
X_test, y_test = load_data(path="./df_2_2ap_test_final.csv")
X.shape, X_test.shape

In [12]:
from sklearn.preprocessing import LabelEncoder

# Encoder mapping the target labels to integer codes; kept in `le` after fitting.
le = LabelEncoder()

# Fit on the target, encode it, and wrap the resulting array in a pandas Series.
y_encoded = pd.Series(le.fit_transform(y))

In [13]:
# from modeling_old import regressor_final

# r = regressor_final(model_type=best_model, params=best_params, suffix="2ap")
# r.fit(X, y)

In [14]:
# r.plot_fit_error(X, y)

In [None]:
# all_best_params = {}
# for model, data in data_dict.items():
#     best_params = data["best_params"].set_index("Parameter")["Value"].to_dict()
#     for k, v in best_params.items():
#         if pd.isna(v):
#             best_params[k] = None
#     all_best_params[model] = best_params

# all_best_params

# Hand-written hyper-parameters for each model type, keyed by model name.
all_best_params = {
    "catboost": dict(
        iterations=200,
        depth=8,
        learning_rate=0.1,
        l2_leaf_reg=3,
        loss_function="MultiClass",
        bootstrap_type="MVS",
    ),
    "extra_trees": dict(n_estimators=200, max_features=None, max_depth=20),
    "xgboost": dict(
        objective="multi:softmax",
        num_class=15,
        n_estimators=200,
        max_depth=None,
        learning_rate=0.1,
    ),
    # "mlp": dict(
    #     hidden_size1=256,
    #     hidden_size2=64,
    #     weight_decay=0.0001,
    #     output_size=14,
    # ),
    "lightgbm": dict(
        objective="multiclass",
        metric="multi_logloss",
        n_estimators=200,
        learning_rate=0.2,
        max_depth=10,
        num_leaves=31,
        min_data_in_leaf=20,
        feature_fraction=0.9,
        early_stopping_round=None,
    ),
    "random_forest": dict(n_estimators=50, max_features="sqrt", max_depth=None),
}

# (model name, params) pairs in alphabetical order of model name.
all_best_params_sorted = [
    (name, all_best_params[name]) for name in sorted(all_best_params)
]

# Display the selected model name next to the alphabetically sorted (model, params) pairs.
best_model, all_best_params_sorted

In [22]:
# X["mcs_nss"]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib_inline import backend_inline
from modeling_old import model_selection, process_X
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

backend_inline.set_matplotlib_formats("svg")

# Subplot grid: two confusion matrices per row.
num_models = len(all_best_params)
cols = 2
rows = (num_models + cols - 1) // cols  # ceiling division

fig = plt.figure(figsize=(10, 6 * rows))

# Loop-invariant: tick labels are the distinct values of the un-encoded target.
class_labels = np.unique(y)

for i, (model_type, params) in enumerate(all_best_params_sorted):
    model, _ = model_selection(
        model_type=model_type, params=params, random_state=42, input_size=X.shape[1]
    )

    # Process into a per-model variable instead of reassigning the shared X.
    # Reassigning X fed each model the output already processed for the
    # previous model type and made the cell non-repeatable.
    # NOTE(review): this assumes process_X returns a new object and does not
    # modify its argument in place -- confirm in modeling_old.
    X_model = process_X(X, model_type=model_type)
    model.fit(X_model, y_encoded)

    # Predictions on the training data itself (this is a train-set matrix).
    y_pred = model.predict(X_model)

    cm = confusion_matrix(y_encoded, y_pred)

    # One subplot per model; x tick labels rotated 45 degrees via the display's
    # own option rather than the implicit "current axes" of pyplot.
    ax = fig.add_subplot(rows, cols, i + 1)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
    disp.plot(ax=ax, cmap=plt.cm.Blues, values_format="d", xticks_rotation=45)

    # mean_r2_dict holds the per-model mean of the "f1" sheet's "test" column.
    ax.set_title(f"{model.__class__.__name__} MEAN F1: {mean_r2_dict[model_type]:.2f}")

fig.tight_layout()

# Make sure the output directory exists so saving does not fail on a fresh checkout.
os.makedirs("./fig", exist_ok=True)
fig.savefig("./fig/q2_2ap_train_confusion_matrix.svg")
plt.show()