In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the preprocessed dataset; column dtypes are left for pandas to infer.
data = pd.read_csv('preprocess2.csv')

# Column to predict. It is defined before the encoding step so that it can be
# kept out of the one-hot encoding below.
target = '紫外线（指数）'
if target not in data.columns:
    # Fail here with a clear message instead of later, when the target column
    # is dropped from the feature matrix.
    raise KeyError(f"target column {target!r} not found in preprocess2.csv")

# Drop every row containing a missing value, then keep at most the last
# 10,000 remaining rows.
# NOTE(review): the 10,000-row cap presumably bounds runtime -- confirm.
data = data.dropna()
data = data.tail(10000)

# One-hot encode the object-dtype feature columns (first level dropped).
# The target is excluded explicitly: if it were encoded, the column named
# `target` would disappear and could no longer be selected as the response.
categorical_cols = [
    col for col in data.columns
    if col != target and data[col].dtype == 'object'
]
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Placeholder for the metrics dictionary; it is reassigned once the model has
# been evaluated.
results = {}

# Cross-validation setup: 5 shuffled folds and candidate tree depths 1..20.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
max_depths = list(range(1, 21))

# Features are every column except the target; the target is the response.
X = data.drop(columns=[target])
y = data[target]

# Hold out 20% of the rows as the test set. The unscaled split is kept under
# its own names because the cross-validation below needs the raw features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features using statistics of the training rows only.
scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train_raw)
X_test = scaler_X.transform(X_test_raw)

# Reduce dimensionality with PCA, keeping enough components to explain 95% of
# the variance of the standardized training features.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Mean cross-validated score (negative MSE) for each candidate depth.
# Scaling and PCA are part of the pipeline, so they are refitted on the
# training portion of every fold; the validation fold never influences the
# preprocessing, which it would if the pre-transformed X_train_pca were used.
scores = []
for max_depth in max_depths:
    depth_pipeline = make_pipeline(
        StandardScaler(),
        PCA(n_components=0.95),
        # Fixed seed so the fitted tree is repeatable from run to run.
        DecisionTreeRegressor(max_depth=max_depth, random_state=42),
    )
    fold_scores = cross_val_score(
        depth_pipeline,
        X_train_raw,
        y_train,
        cv=kf,
        scoring='neg_mean_squared_error',
    )
    scores.append(fold_scores.mean())

# The scores are negative MSE, so the largest score is the smallest error.
# argmax returns the first maximum, i.e. the shallowest depth on a tie.
best_score_index = int(np.argmax(scores))
best_max_depth = max_depths[best_score_index]
best_score = -scores[best_score_index]  # flip the sign back to a plain MSE

# Train the final tree at the selected depth on the whole training split,
# using the scaler/PCA features fitted on that same split.
best_dt_regressor = DecisionTreeRegressor(max_depth=best_max_depth, random_state=42)
best_dt_regressor.fit(X_train_pca, y_train)

# Predictions of the selected tree on the held-out and the training features.
y_pred = best_dt_regressor.predict(X_test_pca)
y_train_pred = best_dt_regressor.predict(X_train_pca)


def _error_metrics(y_true, y_hat):
    """Return the (MSE, RMSE, MAE) triple for one set of predictions."""
    mse = mean_squared_error(y_true, y_hat)
    return mse, np.sqrt(mse), mean_absolute_error(y_true, y_hat)


# Error metrics and R^2 on the training split.
train_mse, train_rmse, train_mae = _error_metrics(y_train, y_train_pred)
train_r2 = best_dt_regressor.score(X_train_pca, y_train)

# The same metrics on the test split.
test_mse, test_rmse, test_mae = _error_metrics(y_test, y_pred)
test_r2 = best_dt_regressor.score(X_test_pca, y_test)

# Collect everything in one dictionary: the fitted model, the cross-validated
# MSE of the selected depth, then the train and test metrics.
results = {'最佳dt_regressor': best_dt_regressor, '最佳MSE': best_score}
results.update({
    '训练集MSE': train_mse,
    '训练集RMSE': train_rmse,
    '训练集MAE': train_mae,
    '训练集R²分数': train_r2,
})
results.update({
    '测试集MSE': test_mse,
    '测试集RMSE': test_rmse,
    '测试集MAE': test_mae,
    '测试集R²分数': test_r2,
})

# Print a header naming the target, then one "name: value" line per entry.
print(f"\n{target}的评价指标:")
print("\n".join(f"{metric}: {value}" for metric, value in results.items()))

# Print summary statistics of the encoded dataset so unusual values can be
# spotted by eye.
summary_stats = data.describe()
print("\n数据描述统计:")
print(summary_stats)

# Print the shapes of the PCA-transformed training and test feature matrices.
print("\n训练集和测试集的分布差异:")
for shape_label, matrix in (
    ("训练集的形状:", X_train_pca),
    ("测试集的形状:", X_test_pca),
):
    print(shape_label, matrix.shape)


In [None]:
# Figure 1: predicted vs. true values on the test set. The dashed diagonal
# marks perfect predictions (prediction == true value).
fig_fit, ax_fit = plt.subplots(figsize=(10, 6))
ax_fit.scatter(y_test, y_pred, alpha=0.5)
true_lo, true_hi = y_test.min(), y_test.max()
ax_fit.plot([true_lo, true_hi], [true_lo, true_hi], 'k--', lw=2)
ax_fit.set_xlabel('True Values')
ax_fit.set_ylabel('Predictions')
ax_fit.set_title('True Values vs Predictions')
plt.show()

# Figure 2: histogram (with a KDE curve) of the test-set residuals,
# i.e. true value minus prediction.
residuals = y_test - y_pred
fig_res, ax_res = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, kde=True, ax=ax_res)
ax_res.set_xlabel('Residuals')
ax_res.set_ylabel('Frequency')
ax_res.set_title('Residuals Distribution')
plt.show()