In [1]:
import os

import pandas as pd

In [2]:
# filter files starting with a given prefix and ending with a given suffix
def filter_files(prefix, suffix, files):
    return [f for f in files if f.startswith(prefix) and f.endswith(suffix)]

In [3]:
results_dir = "./results/"
files = os.listdir(results_dir)
files

['results_random_forest_3ap.xlsx',
 'results_xgboost_3ap.xlsx',
 'results_mlp_3ap.xlsx',
 'results_catboost_2ap.xlsx',
 'y_2_3ap_test_pred.csv',
 'results_extra_trees_2ap.xlsx',
 'results_extra_trees_3ap.xlsx',
 'results_lightgbm_3ap.xlsx',
 'results_xgboost_2ap.xlsx',
 'results_mlp_2ap.xlsx',
 'y_2_2ap_test_pred.csv',
 'results_lightgbm_2ap.xlsx',
 'results_random_forest_2ap.xlsx',
 'results_catboost_3ap.xlsx']

In [4]:
res_2ap = filter_files("res", "_2ap.xlsx", files)
res_2ap

['results_catboost_2ap.xlsx',
 'results_extra_trees_2ap.xlsx',
 'results_xgboost_2ap.xlsx',
 'results_mlp_2ap.xlsx',
 'results_lightgbm_2ap.xlsx',
 'results_random_forest_2ap.xlsx']

In [5]:
import re


def extract_model_name(filename):
    match = re.search(r"_(\w+)_", filename)
    return match.group(1) if match else None


data_dict = {}
for f in res_2ap:
    data_dict[extract_model_name(f)] = pd.read_excel(results_dir + f, sheet_name=None)

len(data_dict)

6

In [6]:
data_dict["catboost"]["best_params"]

Unnamed: 0.1,Unnamed: 0,Parameter,Value,Search List
0,0,iterations,200,"[50, 100, 200]"
1,1,depth,8,"[4, 6, 8]"
2,2,learning_rate,0.1,"[0.01, 0.1, 0.2]"
3,3,l2_leaf_reg,3,"[1, 3, 5]"
4,4,loss_function,MultiClass,MultiClass
5,5,bootstrap_type,MVS,"['Bernoulli', 'MVS']"


In [7]:
data_dict["catboost"]["f1"]

Unnamed: 0.1,Unnamed: 0,train,test
0,0,1,0.935555
1,1,1,0.803005
2,2,1,0.934498
3,3,1,0.876058
4,4,1,0.879342


In [8]:
best_mean_r2 = 0
best_model = None
best_params = None
mean_r2_dict = {}
for model, data in data_dict.items():
    mean_r2 = data["f1"]["test"].mean()
    mean_r2_dict[model] = mean_r2
    # print(f"{model}: {mean_r2}")
    if mean_r2 > best_mean_r2:
        best_mean_r2 = mean_r2
        best_model = model
        best_params = data["best_params"].set_index("Parameter")["Value"].to_dict()

mean_r2_dict

{'catboost': 0.8856915790131717,
 'extra_trees': 0.9197021302537365,
 'xgboost': 0.9063442300492917,
 'mlp': 0.9014655234109729,
 'lightgbm': 0.9015726138162133,
 'random_forest': 0.8953561514053654}

In [9]:
# mean_r2_dict["lightgbm"] = 0.92

In [10]:
for k, v in best_params.items():
    if pd.isna(v):
        best_params[k] = None

# del best_params["early_stopping_round"]

# best_params["early_stopping_round"] = None

In [11]:
from modeling_old import clear_column_names


def load_data(path="./df_2ap_final.csv"):
    df = pd.read_csv(path, header=[0, 1], index_col=None)

    y = df[("mcs_nss", "_")]
    X = df.drop(columns=[("mcs_nss", "_")])

    X.columns = clear_column_names(X)

    return X, y


X, y = load_data()
X_test, y_test = load_data("./df_2_2ap_test_final.csv")
X.shape, X_test.shape

((392, 212), (64, 212))

In [12]:
from sklearn.preprocessing import LabelEncoder

# 创建 LabelEncoder 实例
le = LabelEncoder()

# 转换目标变量
y_encoded = le.fit_transform(y)

# to pd.Series
y_encoded = pd.Series(y_encoded)

In [13]:
# from modeling_old import regressor_final

# r = regressor_final(model_type=best_model, params=best_params, suffix="2ap")
# r.fit(X, y)

In [14]:
# r.plot_fit_error(X, y)

In [15]:
# all_best_params = {}
# for model, data in data_dict.items():
#     best_params = data["best_params"].set_index("Parameter")["Value"].to_dict()
#     for k, v in best_params.items():
#         if pd.isna(v):
#             best_params[k] = None
#     all_best_params[model] = best_params

# all_best_params

all_best_params = {
    "catboost": {
        "iterations": 200,
        "depth": 8,
        "learning_rate": 0.1,
        "l2_leaf_reg": 3,
        "loss_function": "MultiClass",
        "bootstrap_type": "MVS",
    },
    "extra_trees": {"n_estimators": 200, "max_features": None, "max_depth": 20},
    "xgboost": {
        "objective": "multi:softmax",
        "num_class": 15,
        "n_estimators": 200,
        "max_depth": None,
        "learning_rate": 0.1,
    },
    # "mlp": {
    #     "hidden_size1": 256,
    #     "hidden_size2": 64,
    #     "weight_decay": 0.0001,
    #     "output_size": 14,
    # },
    "lightgbm": {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "n_estimators": 200,
        "learning_rate": 0.2,
        "max_depth": 10,
        "num_leaves": 31,
        "min_data_in_leaf": 20,
        "feature_fraction": 0.9,
        "early_stopping_round": None,
    },
    "random_forest": {"n_estimators": 50, "max_features": "sqrt", "max_depth": None},
}

all_best_params_sorted = sorted(all_best_params.items(), key=lambda x: x[0])

best_model, all_best_params_sorted

('extra_trees',
 [('catboost',
   {'iterations': 200,
    'depth': 8,
    'learning_rate': 0.1,
    'l2_leaf_reg': 3,
    'loss_function': 'MultiClass',
    'bootstrap_type': 'MVS'}),
  ('extra_trees',
   {'n_estimators': 200, 'max_features': None, 'max_depth': 20}),
  ('lightgbm',
   {'objective': 'multiclass',
    'metric': 'multi_logloss',
    'n_estimators': 200,
    'learning_rate': 0.2,
    'max_depth': 10,
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'feature_fraction': 0.9,
    'early_stopping_round': None}),
  ('random_forest',
   {'n_estimators': 50, 'max_features': 'sqrt', 'max_depth': None}),
  ('xgboost',
   {'objective': 'multi:softmax',
    'num_class': 15,
    'n_estimators': 200,
    'max_depth': None,
    'learning_rate': 0.1})])

In [16]:
from modeling_old import model_selection

model, _ = model_selection(
    model_type=best_model,
    params=all_best_params[best_model],
    random_state=42,
    input_size=X.shape[1],
)

model.fit(X, y_encoded)

y_test_pred = model.predict(X_test)
y_test_pred = le.inverse_transform(y_test_pred)

# save predictions
y_test_pred = pd.Series(y_test_pred, name="mcs_nss")

df = pd.DataFrame()
df[["mcs", "nss"]] = y_test_pred.str.split("_", expand=True).astype(int)

df.to_csv("./results/y_2_2ap_test_pred.csv", index=False)

In [None]:
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
# from matplotlib_inline import backend_inline
# from modeling_old import model_selection, process_X
# from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# backend_inline.set_matplotlib_formats("svg")

# # 设置图形的布局
# num_models = len(all_best_params)
# cols = 2  # 每行2个子图
# rows = (num_models + cols - 1) // cols  # 计算行数

# plt.figure(figsize=(10, 6 * rows))

# for i, (model_type, params) in enumerate(all_best_params_sorted):
#     model, _ = model_selection(
#         model_type=model_type, params=params, random_state=42, input_size=X.shape[1]
#     )

#     X = process_X(X, model_type=model_type)
#     model.fit(X, y_encoded)

#     y_pred = model.predict(X)

#     # 计算混淆矩阵
#     cm = confusion_matrix(y_encoded, y_pred)

#     # 创建子图
#     ax = plt.subplot(rows, cols, i + 1)
#     disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y))
#     disp.plot(ax=ax, cmap=plt.cm.Blues, values_format="d")

#     # 设置横轴标度旋转45度
#     plt.xticks(rotation=45)

#     ax.set_title(f"{model.__class__.__name__} MEAN F1: {mean_r2_dict[model_type]:.2f}")

# # 调整布局
# plt.tight_layout()

# # 保存图形
# plt.savefig("./fig/q2_2ap_train_confusion_matrix.svg")
# plt.show()