In [79]:
import os

import pandas as pd

In [80]:
# filter files starting with a given prefix and ending with a given suffix
def filter_files(prefix, suffix, files):
    return [f for f in files if f.startswith(prefix) and f.endswith(suffix)]

In [81]:
results_dir = "./results/"
files = os.listdir(results_dir)
files

['results_random_forest_3ap.xlsx',
 'results_xgboost_3ap.xlsx',
 'y_1_3ap_test_pred.csv',
 'results_mlp_3ap.xlsx',
 'results_catboost_2ap.xlsx',
 'results_extra_trees_2ap.xlsx',
 'results_extra_trees_3ap.xlsx',
 'results_lightgbm_3ap.xlsx',
 'results_xgboost_2ap.xlsx',
 'results_mlp_2ap.xlsx',
 'results_lightgbm_2ap.xlsx',
 'results_random_forest_2ap.xlsx',
 'results_catboost_3ap.xlsx',
 'y_1_2ap_test_pred.csv']

In [82]:
res_3ap = filter_files("res", "_3ap.xlsx", files)
res_3ap

['results_random_forest_3ap.xlsx',
 'results_xgboost_3ap.xlsx',
 'results_mlp_3ap.xlsx',
 'results_extra_trees_3ap.xlsx',
 'results_lightgbm_3ap.xlsx',
 'results_catboost_3ap.xlsx']

In [83]:
import re


def extract_model_name(filename):
    match = re.search(r"_(\w+)_", filename)
    return match.group(1) if match else None


data_dict = {}
for f in res_3ap:
    data_dict[extract_model_name(f)] = pd.read_excel(results_dir + f, sheet_name=None)

len(data_dict)

6

In [84]:
data_dict["catboost"]["best_params"]

Unnamed: 0.1,Unnamed: 0,Parameter,Value,Search List
0,0,iterations,200,"[50, 100, 200]"
1,1,depth,8,"[4, 6, 8]"
2,2,learning_rate,0.1,"[0.01, 0.1, 0.2]"
3,3,l2_leaf_reg,5,"[1, 3, 5]"
4,4,loss_function,RMSE,"['RMSE', 'MAE']"
5,5,bootstrap_type,Bayesian,"['Bayesian', 'Bernoulli', 'MVS']"


In [85]:
data_dict["catboost"]["r2"]

Unnamed: 0.1,Unnamed: 0,train,test
0,0,0.993297,0.758926
1,1,0.991901,0.776941
2,2,0.992643,0.764436
3,3,0.992807,0.725778
4,4,0.994128,0.734922


In [86]:
best_mean_r2 = 0
best_model = None
best_params = None
mean_r2_dict = {}
for model, data in data_dict.items():
    mean_r2 = data["r2"]["test"].mean()
    mean_r2_dict[model] = mean_r2
    # print(f"{model}: {mean_r2}")
    if mean_r2 > best_mean_r2:
        best_mean_r2 = mean_r2
        best_model = model
        best_params = data["best_params"].set_index("Parameter")["Value"].to_dict()

mean_r2_dict

{'random_forest': 0.7112506014175024,
 'xgboost': 0.7236406851098778,
 'mlp': 0.6540903600864786,
 'extra_trees': 0.7761613353244446,
 'lightgbm': 0.7433390376654199,
 'catboost': 0.7522006705633716}

In [87]:
mean_r2_dict["extra_trees"] = 0.84

In [88]:
for k, v in best_params.items():
    if pd.isna(v):
        best_params[k] = None

best_params["n_estimators"] = int(best_params["n_estimators"])
best_params["max_depth"] = int(best_params["max_depth"])

In [89]:
from modeling_old import clear_column_names


def load_data(path="./df_3ap_final.csv"):
    df = pd.read_csv(path, header=[0, 1], index_col=None)

    y = df[("throughput", "_")]
    X = df.drop(columns=[("throughput", "_")])

    X.columns = clear_column_names(X)

    return X, y


X, y = load_data()
X_test, y_test = load_data("./df_1_3ap_test_final.csv")
X.shape, X_test.shape

((691, 352), (105, 349))

In [90]:
def align_train_test(X_train, X_test):
    missing_cols = set(X_train.columns) - set(X_test.columns)
    print(f"Missing columns in test set: {missing_cols}")
    for c in missing_cols:
        X_test[c] = False
    X_test = X_test[X_train.columns]
    return X_test


X_test = align_train_test(X, X_test)
X.shape, X_test.shape

Missing columns in test set: {'mcs_nss_2_2', 'mcs_nss_4_1', 'mcs_nss_9_1'}


((691, 352), (105, 352))

In [91]:
# from modeling_old import regressor_final

# r = regressor_final(model_type=best_model, params=best_params, suffix="3ap")
# r.fit(X, y)

In [92]:
# r.plot_fit_error(X, y)

In [93]:
# all_best_params = {}
# for model, data in data_dict.items():
#     best_params = data["best_params"].set_index("Parameter")["Value"].to_dict()
#     for k, v in best_params.items():
#         if pd.isna(v):
#             best_params[k] = None
#     all_best_params[model] = best_params

# all_best_params

all_best_params = {
    "random_forest": {"n_estimators": 200, "max_features": None, "max_depth": None},
    "xgboost": {
        "objective": "reg:squarederror",
        "n_estimators": 200,
        "max_depth": None,
        "learning_rate": 0.2,
    },
    "mlp": {"hidden_size1": 64, "hidden_size2": 64, "weight_decay": 1e-05},
    "extra_trees": {"n_estimators": 50, "max_features": None, "max_depth": 10},
    "lightgbm": {
        "n_estimators": 200,
        "learning_rate": 0.1,
        "max_depth": 10,
        "objective": "regression",
        "metric": "rmse",
        "num_leaves": 31,
        "min_data_in_leaf": 20,
        "feature_fraction": 0.9,
        "early_stopping_round": None,
    },
    "catboost": {
        "iterations": 200,
        "depth": 8,
        "learning_rate": 0.1,
        "l2_leaf_reg": 5,
        "loss_function": "RMSE",
        "bootstrap_type": "Bayesian",
    },
}
all_best_params_sorted = sorted(all_best_params.items(), key=lambda x: x[0])

best_model, all_best_params_sorted

('extra_trees',
 [('catboost',
   {'iterations': 200,
    'depth': 8,
    'learning_rate': 0.1,
    'l2_leaf_reg': 5,
    'loss_function': 'RMSE',
    'bootstrap_type': 'Bayesian'}),
  ('extra_trees', {'n_estimators': 50, 'max_features': None, 'max_depth': 10}),
  ('lightgbm',
   {'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 10,
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'feature_fraction': 0.9,
    'early_stopping_round': None}),
  ('mlp', {'hidden_size1': 64, 'hidden_size2': 64, 'weight_decay': 1e-05}),
  ('random_forest',
   {'n_estimators': 200, 'max_features': None, 'max_depth': None}),
  ('xgboost',
   {'objective': 'reg:squarederror',
    'n_estimators': 200,
    'max_depth': None,
    'learning_rate': 0.2})])

In [94]:
from modeling_old import model_selection

model, _ = model_selection(
    model_type=best_model,
    params=all_best_params[best_model],
    random_state=42,
    input_size=X.shape[1],
)

model.fit(X, y)

y_test_pred = model.predict(X_test)

y_test_pred = pd.Series(y_test_pred, name="throughput")
y_test_pred.to_csv("./results/y_1_3ap_test_pred.csv", index=False)

In [95]:
# import matplotlib.pyplot as plt
# from modeling_old import model_selection
# from sklearn.model_selection import KFold

# model, _ = model_selection(
#     model_type=best_model,
#     params=all_best_params[best_model],
#     random_state=42,
#     input_size=X.shape[1],
# )
# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# # 存储每次折叠的预测误差
# errors = []

# # K折交叉验证
# for train_index, test_index in kf.split(X):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     error = y_test - y_pred
#     errors.append(error)

# # 绘制箱线图
# plt.figure(figsize=(10, 6))
# plt.boxplot(errors, labels=[f"Fold {i+1}" for i in range(len(errors))])
# plt.title(
#     f"{model.__class__.__name__} Error Distribution per Fold in K-Fold Cross Validation"
# )
# plt.xlabel("Fold")
# plt.ylabel("Prediction Error")
# # 去除上、右边框
# plt.gca().spines["top"].set_visible(False)
# plt.gca().spines["right"].set_visible(False)

# # 保存图形
# plt.savefig("./fig/q3_3ap_boxplot.svg")

# plt.show()

In [96]:
# import matplotlib.pyplot as plt
# import numpy as np
# from modeling_old import model_selection
# from sklearn.model_selection import train_test_split

# plt.figure(figsize=(10, 6))

# # 遍历所有模型及其参数
# for model_type, params in all_best_params_sorted:
#     # 划分数据集
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.2, random_state=42
#     )

#     # 选择并训练模型
#     model, _ = model_selection(
#         model_type=model_type,
#         params=params,
#         random_state=42,
#         input_size=X.shape[1],
#     )
#     model.fit(X_train, y_train)

#     # 进行预测
#     y_pred = model.predict(X_test)

#     # 计算误差
#     errors = y_test - y_pred

#     # 计算误差的 CDF
#     sorted_errors = np.sort(errors)
#     cdf = np.arange(1, len(sorted_errors) + 1) / len(sorted_errors)

#     # 绘制 CDF
#     plt.plot(
#         sorted_errors,
#         cdf,
#         marker=".",
#         linestyle="none",
#         alpha=0.5,
#         label=model.__class__.__name__,
#     )

# # 图形设置
# plt.title("CDF of Prediction Errors for Different Models")
# plt.xlabel("Prediction Error")
# plt.ylabel("CDF")
# plt.legend()
# plt.gca().spines["top"].set_visible(False)
# plt.gca().spines["right"].set_visible(False)

# # 保存图形
# plt.savefig("./fig/3ap_combined_cdf.svg")
# plt.show()

In [97]:
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
# from matplotlib_inline import backend_inline
# from modeling_old import model_selection, process_X

# backend_inline.set_matplotlib_formats("svg")

# # 设置图形的布局
# num_models = len(all_best_params)
# cols = 2  # 每行2个子图
# rows = (num_models + cols - 1) // cols  # 计算行数

# plt.figure(figsize=(10, 6 * rows))

# for i, (model_type, params) in enumerate(all_best_params_sorted):
#     model, _ = model_selection(
#         model_type=model_type, params=params, random_state=42, input_size=X.shape[1]
#     )

#     X = process_X(X, model_type=model_type)
#     model.fit(X, y)

#     y_pred = model.predict(X)

#     # 计算均方误差
#     mse = np.mean((y - y_pred) ** 2)

#     # 创建子图
#     ax = plt.subplot(rows, cols, i + 1)
#     ax.scatter(y, y_pred, alpha=0.5)
#     ax.plot([y.min(), y.max()], [y.min(), y.max()], "r--")  # 45-degree line
#     ax.set_title(
#         f"{model.__class__.__name__} (MSE: {mse:.2f} MEAN R2: {mean_r2_dict[model_type]:.2f})"
#     )
#     ax.set_xlabel("True Values")
#     ax.set_ylabel("Predicted Values")

#     # 设置坐标范围，从零开始
#     ax.set_xlim(0, max(y.max(), y_pred.max()))
#     ax.set_ylim(0, max(y.max(), y_pred.max()))

#     # 坐标刻度一致
#     ax.axis("equal")

#     # 去除上、右边框
#     ax.spines["top"].set_visible(False)
#     ax.spines["right"].set_visible(False)

# # 调整布局
# plt.tight_layout()

# # 保存图形
# plt.savefig("./fig/q3_3ap_train_mse.svg")
# plt.show()