In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.decomposition import PCA

In [2]:
noise_list = [0,0.05,0.25,0.55,1]
MSE_train_list, RMSE_train_list, MAE_train_list, R2_train_list = [], [], [], []
MSE_test_list, RMSE_test_list, MAE_test_list, R2_test_list = [], [], [], []

In [3]:
# 读取CSV文件，不指定数据类型，让pandas自动识别
data = pd.read_csv('preprocess2.csv')

# 处理缺失值（填充或删除含有缺失值的行）
data = data.dropna()
data = data.tail(10000)

# 将分类变量转化为虚拟变量（独热编码），排除目标变量
categorical_cols = [col for col in data.columns if data[col].dtype == 'object']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# 定义需要预测的目标变量
target = '紫外线（指数）'
results = {}

# 设置交叉验证的折数和参数范围
kf = KFold(n_splits=5, shuffle=True, random_state=1)
neighbors = list(range(1, 31))  # 邻居数从1到30


# 将当前目标变量y，其余作为特征X
X = data.drop(columns=[target])
y = data[target]
# 分割数据集为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
pca = PCA(n_components=0.95)  # 保留95%的方差
X_train_scaled = pca.fit_transform(X_train)
X_test_scaled = pca.transform(X_test)

# 假设我们添加高斯噪声
for noise_level in noise_list:  # 噪声水平，根据数据范围和需求调整

    # 向训练数据添加噪声
    X_train_noisy = X_train_scaled + noise_level * np.random.normal(size=X_train_scaled.shape)

    # 确保噪声数据在原数据的范围内，例如通过clip方法
    X_train_noisy = np.clip(X_train_noisy, X_train_scaled.min(), X_train_scaled.max())

    # 存储不同n_neighbors的分数
    scores = []

    # 对每个邻居数进行交叉验证
    for n in neighbors:
        knn_regressor = KNeighborsRegressor(n_neighbors=n)
        scores.append(cross_val_score(knn_regressor, X_train_noisy, y_train, cv=kf, scoring='neg_mean_squared_error').mean())

    # 找到最佳分数和对应的n_neighbors
    best_score_index = scores.index(max(scores))
    best_n_neighbors = neighbors[best_score_index]
    best_score = max(scores) * -1  # 因为分数是负的MSE

    # 使用最佳n_neighbors重新训练模型
    best_knn_regressor = KNeighborsRegressor(n_neighbors=best_n_neighbors)
    best_knn_regressor.fit(X_train_noisy, y_train)


    # 在原始测试集上进行预测
    y_pred_perturbed_test = best_knn_regressor.predict(X_test_scaled)
    y_pred_perturbed_train = best_knn_regressor.predict(X_train_noisy)

    # 计算扰动后的评估指标
    test_mse_perturbed = mean_squared_error(y_test, y_pred_perturbed_test)
    test_rmse_perturbed = np.sqrt(test_mse_perturbed)
    test_mae_perturbed = mean_absolute_error(y_test, y_pred_perturbed_test)
    test_r2_perturbed = r2_score(y_test, y_pred_perturbed_test)
    train_r2_perturbed = r2_score(y_train, y_pred_perturbed_train)


    # 打印结果
    results = {
        '最佳n_neighbors': best_n_neighbors,
        '训练集扰动后的R²分数': train_r2_perturbed,
        '测试集扰动后的MSE': test_mse_perturbed,
        '测试集扰动后的RMSE': test_rmse_perturbed,
        '测试集扰动后的MAE': test_mae_perturbed,
        '测试集扰动后的R²分数': test_r2_perturbed
    }


    R2_train_list.append((noise_level,train_r2_perturbed))
    MSE_test_list.append((noise_level,test_mse_perturbed))
    RMSE_test_list.append((noise_level,test_rmse_perturbed))
    MAE_test_list.append((noise_level,test_mae_perturbed))
    R2_test_list.append((noise_level,test_r2_perturbed))

    print(f"\n{target}的评价指标:")
    for metric, value in results.items():
        print(f"{metric}: {value:.3f}")


紫外线（指数）的评价指标:
最佳n_neighbors: 2.000
训练集扰动后的R²分数: 0.974
测试集扰动后的MSE: 0.510
测试集扰动后的RMSE: 0.714
测试集扰动后的MAE: 0.183
测试集扰动后的R²分数: 0.936

紫外线（指数）的评价指标:
最佳n_neighbors: 2.000
训练集扰动后的R²分数: 0.973
测试集扰动后的MSE: 0.603
测试集扰动后的RMSE: 0.777
测试集扰动后的MAE: 0.235
测试集扰动后的R²分数: 0.924

紫外线（指数）的评价指标:
最佳n_neighbors: 2.000
训练集扰动后的R²分数: 0.965
测试集扰动后的MSE: 0.648
测试集扰动后的RMSE: 0.805
测试集扰动后的MAE: 0.273
测试集扰动后的R²分数: 0.919

紫外线（指数）的评价指标:
最佳n_neighbors: 3.000
训练集扰动后的R²分数: 0.917
测试集扰动后的MSE: 1.001
测试集扰动后的RMSE: 1.001
测试集扰动后的MAE: 0.471
测试集扰动后的R²分数: 0.874

紫外线（指数）的评价指标:
最佳n_neighbors: 8.000
训练集扰动后的R²分数: 0.799
测试集扰动后的MSE: 1.582
测试集扰动后的RMSE: 1.258
测试集扰动后的MAE: 0.803
测试集扰动后的R²分数: 0.801


In [5]:
print(f'MSE result:{MSE_test_list}')
print(f'RMSE result:{RMSE_test_list}')
print(f'MAE result:{MAE_test_list}')
print(f'R2 Score result or train set:{R2_train_list}')
print(f'R2 Score result or test set:{R2_test_list}')

MSE result:[(0, 0.509875), (0.05, 0.603125), (0.25, 0.64775), (0.55, 1.0014444444444444), (1, 1.5818828125)]
RMSE result:[(0, 0.7140553199857838), (0.05, 0.7766112283504534), (0.25, 0.8048291744215043), (0.55, 1.0007219616079406), (1, 1.2577292286100374)]
MAE result:[(0, 0.18275), (0.05, 0.23475), (0.25, 0.2725), (0.55, 0.471), (1, 0.8028125)]
R2 Score result or train set:[(0, 0.9737514860195947), (0.05, 0.9729409157740414), (0.25, 0.9653549090740307), (0.55, 0.9167331911707749), (1, 0.7990612359370328)]
R2 Score result or test set:[(0, 0.9358992169112669), (0.05, 0.9241759552823885), (0.25, 0.9185657617146813), (0.55, 0.8740997830669279), (1, 0.8011278704861828)]


In [6]:
R2_train_list,MSE_test_list, RMSE_test_list, MAE_test_list, R2_test_list = [], [], [], [],[]

In [7]:
from sklearn.model_selection import GridSearchCV

scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

for noise_level in noise_list:  # 噪声水平，根据数据范围和需求调整
    # 向训练数据添加噪声
    X_train_noisy = X_train_scaled + noise_level * np.random.normal(size=X_train_scaled.shape)

    # 确保噪声数据在原数据的范围内，例如通过clip方法
    X_train_noisy = np.clip(X_train_noisy, X_train_scaled.min(), X_train_scaled.max())

    # 使用GridSearchCV选择最佳n_neighbors和权重参数
    param_grid = {
        'n_neighbors': range(1, 21),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    }


    grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_noisy, y_train)

    best_knn_regressor = grid_search.best_estimator_

    # 进行预测
    y_train_pred = best_knn_regressor.predict(X_train_noisy)
    y_pred = best_knn_regressor.predict(X_test_scaled)

    # 计算训练集和测试集的MSE、RMSE、MAE和R²分数
    train_r2 = best_knn_regressor.score(X_train_scaled, y_train)
    test_mse = mean_squared_error(y_test, y_pred)
    test_rmse = np.sqrt(test_mse)
    test_mae = mean_absolute_error(y_test, y_pred)
    test_r2 = best_knn_regressor.score(X_test_scaled, y_test)

    results = {
        '测试集MSE': test_mse,
        '测试集RMSE': test_rmse,
        '测试集MAE': test_mae,
        '训练集R²分数': train_r2,
        '测试集R²分数': test_r2
    }


    R2_train_list.append((noise_level, train_r2))
    MSE_test_list.append((noise_level, test_mse))
    RMSE_test_list.append((noise_level, test_rmse))
    MAE_test_list.append((noise_level, test_mae))
    R2_test_list.append((noise_level, test_r2))

    # 打印结果
    best_params = best_knn_regressor.get_params()
    print(best_params)
    print(f"\n目标的评价指标:")
    for metric, value in results.items():
        print(f"{metric}: {value:.3f}")


{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'manhattan', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}

目标的评价指标:
测试集MSE: 1.001
测试集RMSE: 1.001
测试集MAE: 0.454
训练集R²分数: 0.986
测试集R²分数: 0.874
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'manhattan', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 4, 'p': 2, 'weights': 'distance'}

目标的评价指标:
测试集MSE: 1.088
测试集RMSE: 1.043
测试集MAE: 0.557
训练集R²分数: 0.946
测试集R²分数: 0.863
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'manhattan', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 10, 'p': 2, 'weights': 'distance'}

目标的评价指标:
测试集MSE: 1.884
测试集RMSE: 1.373
测试集MAE: 0.932
训练集R²分数: 0.811
测试集R²分数: 0.763
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'euclidean', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 20, 'p': 2, 'weights': 'distance'}

目标的评价指标:
测试集MSE: 2.562
测试集RMSE: 1.601
测试集MAE: 1.171
训练集R²分数: 0.693
测试集R²分数: 0.678
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'euclidean', 'met

In [8]:
print(f'MSE result:{MSE_test_list}')
print(f'RMSE result:{RMSE_test_list}')
print(f'MAE result:{MAE_test_list}')
print(f'R2 Score result or train set:{R2_train_list}')
print(f'R2 Score result or test set:{R2_test_list}')

MSE result:[(0, 1.001363813712909), (0.05, 1.0881535203540627), (0.25, 1.8844491946329192), (0.55, 2.561937254397314), (1, 3.210108016133624)]
RMSE result:[(0, 1.0006816745163813), (0.05, 1.0431459726970442), (0.25, 1.372752415635434), (0.55, 1.60060527751139), (1, 1.791677430826661)]
MAE result:[(0, 0.45406773538822026), (0.05, 0.5569750656263976), (0.25, 0.932358117687272), (0.55, 1.1705150263538437), (1, 1.373485709857196)]
R2 Score result or train set:[(0, 0.9858665162606821), (0.05, 0.9456169308800786), (0.25, 0.8108619514616444), (0.55, 0.6926220913867733), (1, 0.6008401823264653)]
R2 Score result or test set:[(0, 0.8741099198514976), (0.05, 0.8631988374102337), (0.25, 0.7630896414476048), (0.55, 0.677916775227131), (1, 0.5964296393555584)]


In [9]:
R2_train_list,MSE_test_list, RMSE_test_list, MAE_test_list, R2_test_list = [], [], [], [],[]

In [10]:
scaler_X = StandardScaler()
X_train_scaled= scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# 假设我们添加高斯噪声
for noise_level in noise_list:  # 噪声水平，根据数据范围和需求调整

    # 向训练数据添加噪声
    X_train_noisy = X_train_scaled + noise_level * np.random.normal(size=X_train_scaled.shape)

    # 确保噪声数据在原数据的范围内，例如通过clip方法
    X_train_noisy = np.clip(X_train_noisy, X_train_scaled.min(), X_train_scaled.max())

    # 存储不同n_neighbors的分数
    scores = []

    # 对每个邻居数进行交叉验证
    # 使用GridSearchCV选择最佳n_neighbors和权重参数
    param_grid = {
        'n_neighbors': range(1, 21),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    }


    grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_noisy, y_train)
    best_knn_regressor = grid_search.best_estimator_
    
    # 在原始测试集上进行预测
    y_pred_perturbed_test = best_knn_regressor.predict(X_test_scaled)
    y_pred_perturbed_train = best_knn_regressor.predict(X_train_noisy)

    # 计算扰动后的评估指标
    test_mse_perturbed = mean_squared_error(y_test, y_pred_perturbed_test)
    test_rmse_perturbed = np.sqrt(test_mse_perturbed)
    test_mae_perturbed = mean_absolute_error(y_test, y_pred_perturbed_test)
    test_r2_perturbed = r2_score(y_test, y_pred_perturbed_test)

    train_mse_perturbed = mean_squared_error(y_train, y_pred_perturbed_train)
    train_rmse_perturbed = np.sqrt(train_mse_perturbed)
    train_mae_perturbed = mean_absolute_error(y_train, y_pred_perturbed_train)
    train_r2_perturbed = r2_score(y_train, y_pred_perturbed_train)


    # 打印结果
    results = {
        '最佳n_neighbors': best_n_neighbors,
        '训练集扰动后的MSE': train_mse_perturbed,
        '训练集扰动后的RMSE': train_rmse_perturbed,
        '训练集扰动后的MAE': train_mae_perturbed,
        '训练集扰动后的R²分数': train_r2_perturbed,
        '测试集扰动后的MSE': test_mse_perturbed,
        '测试集扰动后的RMSE': test_rmse_perturbed,
        '测试集扰动后的MAE': test_mae_perturbed,
        '测试集扰动后的R²分数': test_r2_perturbed
    }

    MSE_train_list.append((noise_level,train_mse_perturbed))
    RMSE_train_list.append((noise_level,train_rmse_perturbed))
    MAE_train_list.append((noise_level,train_mae_perturbed))
    R2_train_list.append((noise_level,train_r2_perturbed))

    MSE_test_list.append((noise_level,test_mse_perturbed))
    RMSE_test_list.append((noise_level,test_rmse_perturbed))
    MAE_test_list.append((noise_level,test_mae_perturbed))
    R2_test_list.append((noise_level,test_r2_perturbed))

    print(f"\n{target}的评价指标:")
    for metric, value in results.items():
        print(f"{metric}: {value:.3f}")


紫外线（指数）的评价指标:
最佳n_neighbors: 8.000
训练集扰动后的MSE: 0.114
训练集扰动后的RMSE: 0.337
训练集扰动后的MAE: 0.052
训练集扰动后的R²分数: 0.986
测试集扰动后的MSE: 1.001
测试集扰动后的RMSE: 1.001
测试集扰动后的MAE: 0.454
测试集扰动后的R²分数: 0.874

紫外线（指数）的评价指标:
最佳n_neighbors: 8.000
训练集扰动后的MSE: 0.000
训练集扰动后的RMSE: 0.000
训练集扰动后的MAE: 0.000
训练集扰动后的R²分数: 1.000
测试集扰动后的MSE: 1.113
测试集扰动后的RMSE: 1.055
测试集扰动后的MAE: 0.591
测试集扰动后的R²分数: 0.860

紫外线（指数）的评价指标:
最佳n_neighbors: 8.000
训练集扰动后的MSE: 0.000
训练集扰动后的RMSE: 0.000
训练集扰动后的MAE: 0.000
训练集扰动后的R²分数: 1.000
测试集扰动后的MSE: 1.751
测试集扰动后的RMSE: 1.323
测试集扰动后的MAE: 0.883
测试集扰动后的R²分数: 0.780

紫外线（指数）的评价指标:
最佳n_neighbors: 8.000
训练集扰动后的MSE: 0.000
训练集扰动后的RMSE: 0.000
训练集扰动后的MAE: 0.000
训练集扰动后的R²分数: 1.000
测试集扰动后的MSE: 2.593
测试集扰动后的RMSE: 1.610
测试集扰动后的MAE: 1.199
测试集扰动后的R²分数: 0.674

紫外线（指数）的评价指标:
最佳n_neighbors: 8.000
训练集扰动后的MSE: 0.000
训练集扰动后的RMSE: 0.000
训练集扰动后的MAE: 0.000
训练集扰动后的R²分数: 1.000
测试集扰动后的MSE: 3.331
测试集扰动后的RMSE: 1.825
测试集扰动后的MAE: 1.407
测试集扰动后的R²分数: 0.581


In [11]:
print(f'MSE result:{MSE_test_list}')
print(f'RMSE result:{RMSE_test_list}')
print(f'MAE result:{MAE_test_list}')
print(f'R2 Score result or train set:{R2_train_list}')
print(f'R2 Score result or test set:{R2_test_list}')

MSE result:[(0, 1.001363813712909), (0.05, 1.1132827750748377), (0.25, 1.7511579333975646), (0.55, 2.593152587210412), (1, 3.3308617738259283)]
RMSE result:[(0, 1.0006816745163813), (0.05, 1.0551221612092307), (0.25, 1.3233132408457058), (0.55, 1.6103268572592373), (1, 1.8250648683884987)]
MAE result:[(0, 0.45406773538822026), (0.05, 0.5913673352684506), (0.25, 0.882917003911968), (0.55, 1.1989030697078091), (1, 1.4067964689967307)]
R2 Score result or train set:[(0, 0.9858665162606821), (0.05, 1.0), (0.25, 1.0), (0.55, 1.0), (1, 1.0)]
R2 Score result or test set:[(0, 0.8741099198514976), (0.05, 0.8600396221005246), (0.25, 0.7798468353168299), (0.55, 0.6739924265579571), (1, 0.5812486431722117)]
