In [31]:
# 文件路径配置
file_paths = {
    "data_file": r'F:\大模型项目\Paper_1\第二部分_微调测试数据集\微调数据集\ML二轮训练数据\yield_data.csv',
    "save_xgb_model": r'F:\大模型项目\Paper_1\第二部分_微调测试数据集\微调数据集\ML二轮训练数据\yield_xgboost_model.pkl',  # XGBoost模型保存路径
    "save_rf_model": r'F:\大模型项目\Paper_1\第二部分_微调测试数据集\微调数据集\ML二轮训练数据\yield_random_forest_model.pkl',  # 随机森林模型保存路径
    "save_ann_model": r'F:\大模型项目\Paper_1\第二部分_微调测试数据集\微调数据集\ML二轮训练数据\yield_ann_model.h5', # ANN模型保存路径
    "test_data_file": r'F:\大模型项目\Paper_1\第二部分_微调测试数据集\微调数据集\二轮微调数据集\test_csv\yield_data.csv'  # 测试集文件路径
}

XGBoost_train

In [32]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import joblib

# 1. 加载数据
df = pd.read_csv(file_paths["data_file"])

# 2. 提取特征和目标变量
X = df.iloc[:, 6:26]  # 第7到第26列
y = df.iloc[:, 26]    # 第27列

# 3. 分割数据集（训练集和验证集）
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. 训练 XGBoost 模型
model = XGBRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# 5. 在训练集验证集上评估模型
y_val_pred = model.predict(X_val)
r2_val = r2_score(y_val, y_val_pred)
rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)

print(f'Validation R2: {r2_val}')
print(f'Validation RMSE: {rmse_val}')

# 6. 保存模型
joblib.dump(model, file_paths["save_xgb_model"])

Validation R2: 0.7470069043399985
Validation RMSE: 6.508778712718615




['F:\\大模型项目\\Paper_1\\第二部分_微调测试数据集\\微调数据集\\ML二轮训练数据\\yield_xgboost_model.pkl']

XGBoost_test

In [33]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# 1. 加载测试集数据
df_test = pd.read_csv(file_paths["test_data_file"])

# 2. 提取特征和目标变量
X_test = df_test.iloc[:, 6:26]  # 第7到第26列
y_test = df_test.iloc[:, 26]    # 第27列

# 3. 加载保存的 XGBoost 模型
model = joblib.load(file_paths["save_xgb_model"])

# 4. 使用模型对测试集进行预测
y_test_pred = model.predict(X_test)

# 5. 计算 R² 和 RMSE
r2_test = r2_score(y_test, y_test_pred)
rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)

# 6. 输出结果
print(f'Test R2: {r2_test}')
print(f'Test RMSE: {rmse_test}')

Test R2: 0.3518341777452405
Test RMSE: 10.282157336093624




随机森林_train

In [34]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import joblib

# 1. 加载数据
df = pd.read_csv(file_paths["data_file"])

# 2. 提取特征和目标变量
X = df.iloc[:, 6:26]  # 第7到第26列
y = df.iloc[:, 26]    # 第27列

# 3. 分割数据集（训练集和验证集）
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. 训练随机森林模型
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# 5. 在训练集验证集上评估模型
y_val_pred = model.predict(X_val)
r2_val = r2_score(y_val, y_val_pred)
rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)

print(f'Validation R2: {r2_val}')
print(f'Validation RMSE: {rmse_val}')

# 6. 保存模型
joblib.dump(model, file_paths["save_rf_model"])

Validation R2: 0.7757592698431064
Validation RMSE: 6.1277695100859




['F:\\大模型项目\\Paper_1\\第二部分_微调测试数据集\\微调数据集\\ML二轮训练数据\\yield_random_forest_model.pkl']

随机森林_test

In [35]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# 1. 加载测试集数据
df_test = pd.read_csv(file_paths["test_data_file"])

# 2. 提取特征和目标变量
X_test = df_test.iloc[:, 6:26]  # 第7到第26列
y_test = df_test.iloc[:, 26]    # 第27列

# 3. 加载保存的随机森林模型
model = joblib.load(file_paths["save_rf_model"])

# 4. 使用模型对测试集进行预测
y_test_pred = model.predict(X_test)

# 5. 计算 R² 和 RMSE
r2_test = r2_score(y_test, y_test_pred)
rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)

# 6. 输出结果
print(f'Test R2: {r2_test}')
print(f'Test RMSE: {rmse_test}')

Test R2: 0.5648470148734626
Test RMSE: 8.42484976583659




ANN_train

In [36]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import joblib
from sklearn.preprocessing import StandardScaler

# 1. 加载数据
df = pd.read_csv(file_paths["data_file"])

# 2. 提取特征和目标变量
X = df.iloc[:, 6:26]  # 第7到第26列
y = df.iloc[:, 26]    # 第27列

# 3. 数据归一化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. 分割数据集（训练集和验证集）
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 5. 构建 ANN 模型
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))  # 输入层和第一隐藏层
model.add(Dense(64, activation='relu'))  # 第二隐藏层
model.add(Dense(1))  # 输出层，回归问题只有一个输出

# 6. 编译模型
model.compile(optimizer='adam', loss='mean_squared_error')

# 7. 训练模型
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val))

# 8. 在训练集验证集上评估模型
y_val_pred = model.predict(X_val)
r2_val = r2_score(y_val, y_val_pred)
rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)

print(f'Validation R2: {r2_val}')
print(f'Validation RMSE: {rmse_val}')

# 9. 保存模型
model.save(file_paths["save_ann_model"])

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 2026.7557 - val_loss: 1883.9921
Epoch 2/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1916.4375 - val_loss: 1865.3298
Epoch 3/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1983.8896 - val_loss: 1841.5869
Epoch 4/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1894.4092 - val_loss: 1811.6388
Epoch 5/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1927.3650 - val_loss: 1774.5920
Epoch 6/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1862.9287 - val_loss: 1730.7140
Epoch 7/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1851.0822 - val_loss: 1679.9182
Epoch 8/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1755.3142 - val_loss: 1623.0151
Epoch 9/100




Validation R2: -0.0026609601826481644
Validation RMSE: 12.957530049885365


ANN_test

In [37]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler

# 1. 加载测试集数据
df_test = pd.read_csv(file_paths["test_data_file"])

# 2. 提取特征和目标变量
X_test = df_test.iloc[:, 6:26]  # 第7到第26列
y_test = df_test.iloc[:, 26]    # 第27列

# 3. 数据归一化（训练时也使用了归一化，测试集也需要归一化）
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)

# 4. 加载保存的 ANN 模型
model = load_model(file_paths["save_ann_model"])

# 5. 使用模型对测试集进行预测
y_test_pred = model.predict(X_test_scaled)

# 6. 计算 R² 和 RMSE
r2_test = r2_score(y_test, y_test_pred)
rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)

# 7. 输出结果
print(f'Test R2: {r2_test}')
print(f'Test RMSE: {rmse_test}')



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Test R2: -2.321247381149133
Test RMSE: 23.275112782660692




In [1]:
import tensorflow as tf

# 检查 GPU 是否可用
if tf.config.list_physical_devices('GPU'):
    print("GPU 可用")
else:
    print("GPU 不可用")


GPU 不可用
