# In [None]:
import pandas as pd
import smogn
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def find_optimal_k(file_path, y_column, rg_mtrx, *, k_values=range(1, 11),
                   test_size=0.2, random_state=42):
    """Pick the SMOTER neighbour count ``k`` that gives the lowest test MSE.

    The CSV is loaded once and split into train/test partitions once. For
    every candidate ``k`` only the training partition is resampled with
    SMOTER; a default ``LGBMRegressor`` is fitted on the resampled rows and
    scored on the untouched held-out rows. Resampling before splitting would
    place synthetic rows in the test set and would score every ``k`` on a
    different test set, making the comparison unreliable.

    Args:
        file_path: Path of the CSV file to load.
        y_column: Name of the target column.
        rg_mtrx: Manual relevance control points, forwarded to
            ``smogn.smoter`` as ``rel_ctrl_pts_rg``.
        k_values: Candidate ``k`` values to try (default ``1..10``).
        test_size: Hold-out fraction passed to ``train_test_split``.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(best_k, best_performance)`` where ``best_performance`` is the
        lowest mean squared error seen. If no candidate improves on
        ``inf`` (e.g. ``k_values`` is empty) the result is ``(None, inf)``.
    """
    frame = pd.read_csv(file_path)
    # Apply the same NaN policy the smoter call uses (drop columns, then
    # rows) up front, so the held-out rows are cleaned exactly like the
    # training rows and both partitions share the same feature columns.
    frame = frame.dropna(axis=1).dropna(axis=0).reset_index(drop=True)

    # Split once: every k is evaluated on the same real, unresampled rows.
    train_frame, test_frame = train_test_split(
        frame, test_size=test_size, random_state=random_state
    )
    y_test = test_frame[y_column]

    best_k = None
    best_performance = float('inf')

    for k in k_values:
        resampled = _smoter_resample(train_frame, y_column, k, rg_mtrx)
        X_train = resampled.drop(y_column, axis=1)
        y_train = resampled[y_column]
        # Select test features by name in the training column order, in
        # case resampling returned the columns in a different order.
        X_test = test_frame[X_train.columns]

        lgbm_model = lgb.LGBMRegressor()
        lgbm_model.fit(X_train, y_train)

        performance = mean_squared_error(y_test, lgbm_model.predict(X_test))

        # Keep the first k that achieves the lowest error so far.
        if performance < best_performance:
            best_k, best_performance = k, performance

    return best_k, best_performance


def process_data(file_path: str, y_column: str, k_value, rg_mtrx) -> pd.DataFrame:
    """Load the CSV at ``file_path`` and return its SMOTER-resampled frame.

    Args:
        file_path: Path of the CSV file to load.
        y_column: Name of the target column.
        k_value: Neighbour count forwarded to ``smogn.smoter`` as ``k``.
        rg_mtrx: Manual relevance control points, forwarded as
            ``rel_ctrl_pts_rg``.

    Returns:
        Whatever ``smogn.smoter`` returns for the loaded data.
    """
    return _smoter_resample(pd.read_csv(file_path), y_column, k_value, rg_mtrx)


def _smoter_resample(data, y_column, k_value, rg_mtrx):
    """Run ``smogn.smoter`` on ``data`` with this file's fixed settings."""
    # NOTE(review): smogn appears to address rows by position using index
    # labels, so a fresh 0..n-1 index is supplied; this is a no-op for a
    # frame straight from read_csv - confirm against the installed smogn.
    return smogn.smoter(
        data=data.reset_index(drop=True),
        y=y_column,
        k=k_value,
        pert=0.1,
        samp_method='balance',
        drop_na_col=True,
        drop_na_row=True,
        replace=False,
        rel_thres=0.10,
        rel_method='manual',
        rel_ctrl_pts_rg=rg_mtrx,
    )

# Example usage: search for the best SMOTER k on a CSV dataset.
file_path = "path/to/your/data.csv"
y_column = '预测列名'
# Placeholder control points - define them for your own data. The trailing
# `...` is a literal Ellipsis element; replace it with real rows before running.
rg_mtrx = [
    [50, 1, 0],
    [45, 1, 0],
    [40, 1, 0],
    ...,
]

best_k, best_performance = find_optimal_k(
    file_path=file_path,
    y_column=y_column,
    rg_mtrx=rg_mtrx,
)
print(f"Best k: {best_k} ")