In [14]:
import pandas as pd
import os


def process_sheet(df):
    """处理单个sheet，添加下一时间步的酸钠值作为新特征"""
    # 创建下一时间步的酸钠列
    df['next_酸钠'] = df['酸钠'].shift(-1)
    df['next_残糖'] = df['残糖g/dl'].shift(-1)
    return df

def main():
    # 读取Excel文件
    excel_path = 'bio_train_trans.xlsx'  # 请替换为您的Excel文件路径
    
    # 读取所有sheet
    excel_file = pd.ExcelFile(excel_path)
    sheet_names = excel_file.sheet_names
    
    # 创建新的Excel文件
    output_path = 'processed_' + os.path.basename(excel_path)
    
    # 处理每个sheet
    with pd.ExcelWriter(output_path) as writer:
        for sheet_name in sheet_names:
            # 读取当前sheet
            df = pd.read_excel(excel_path, sheet_name=sheet_name)
            
            # 处理数据
            processed_df = process_sheet(df)
            
            # 保存到新的Excel文件
            processed_df.to_excel(writer, sheet_name=sheet_name, index=False)
    
    print(f"处理完成！结果已保存到: {output_path}")

if __name__ == "__main__":
    main()

处理完成！结果已保存到: processed_bio_train_trans.xlsx


In [15]:
# 合并所有的sheet

# 读取Excel文件中的所有工作表
excel_file = 'processed_bio_train_trans.xlsx'
all_sheets = pd.read_excel(excel_file, sheet_name=None)

# 初始化一个空的DataFrame来存储合并后的数据
merged_df = pd.DataFrame()

# 遍历所有工作表并合并
for sheet_name, df in all_sheets.items():
    # 假设发酵周期列名为'发酵周期'或'Time'，请根据实际情况调整
    if merged_df.empty:
        merged_df = df
    else:
        merged_df = pd.concat([merged_df, df], ignore_index=True)

# 按发酵周期排序（如果需要的话）
#merged_df = merged_df.sort_values('发酵周期/h')

# 保存合并后的数据到新的Excel文件
output_file = 'merged_bio_train.xlsx'
merged_df.to_excel(output_file, index=False)

In [16]:
# 使用随机森林填补缺失值
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


In [19]:
merged_df = pd.read_excel('merged_bio_train.xlsx')

# 填补缺失值
def fill_missing_with_rf(df):
    # 对每个包含缺失值的列进行填充
    for column in df.columns[df.isnull().any()]:
        # 分离包含缺失值和不包含缺失值的行
        known = df[df[column].notnull()]
        unknown = df[df[column].isnull()]
        
        if len(known) == 0 or len(unknown) == 0:
            continue
            
        # 准备特征（使用其他列作为特征）
        features = [x for x in df.columns if x != column]
        
        # 去除特征中包含缺失值的行
        known_features = known[features].fillna(known[features].mean())
        
        # 训练随机森林模型
        rf = RandomForestRegressor(n_estimators=100, random_state=42)
        rf.fit(known_features, known[column])
        
        # 预测缺失值
        unknown_features = unknown[features].fillna(known[features].mean())
        predictions = rf.predict(unknown_features)
        
        # 填充缺失值
        df.loc[df[column].isnull(), column] = predictions
    
    return df

# 应用随机森林填充缺失值
merged_df = fill_missing_with_rf(merged_df)

# 保存处理后的数据
output_file = 'filled_merged_bio_train.xlsx'
merged_df.to_excel(output_file, index=False)

In [22]:
# LSTM网络架构进行

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


In [29]:

# 数据预处理
def prepare_data(df, sequence_length=1):
    # 准备特征列和标签列
    feature_columns = df.columns.difference(['next_酸钠', 'next_残糖'])
    X = df[feature_columns].values
    y = df[['next_酸钠', 'next_残糖']].values
    
    # 数据标准化
    X_scaler = MinMaxScaler()
    y_scaler = MinMaxScaler()
    X_normalized = X_scaler.fit_transform(X)
    y_normalized = y_scaler.fit_transform(y)
    
    # 创建序列数据
    X_sequences = []
    y_sequences = []
    
    for i in range(len(X_normalized) - sequence_length):
        X_sequences.append(X_normalized[i:i+sequence_length])
        y_sequences.append(y_normalized[i+sequence_length])
    
    return (np.array(X_sequences), np.array(y_sequences), 
            X_scaler, y_scaler)

# 自定义数据集类
class FermentationDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# LSTM模型定义
class LSTMPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMPredictor, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, 
                           batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # 初始化隐藏状态
        h0 = torch.zeros(self.num_layers, x.size(0), 
                        self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), 
                        self.hidden_size).to(x.device)
        
        # LSTM前向传播
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# 训练函数
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=100):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}')

def main():
    # 读取数据
    df = pd.read_excel('filled_merged_bio_train.xlsx')
    
    # 数据预处理
    sequence_length = 3  # 可以调整序列长度
    X, y, X_scaler, y_scaler = prepare_data(df, sequence_length)
    
    # 划分训练集和验证集
    train_size = int(0.8 * len(X))
    X_train, X_val = X[:train_size], X[train_size:]
    y_train, y_val = y[:train_size], y[train_size:]
    
    # 创建数据集和数据加载器
    train_dataset = FermentationDataset(X_train, y_train)
    val_dataset = FermentationDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    
    # 设置设备
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"使用设备: {device}")
    
    # 初始化模型
    input_size = X.shape[2]  # 特征数量
    hidden_size = 64  # 增加隐藏层大小
    num_layers = 3   # 增加LSTM层数
    output_size = 2  # 预测两个值：next_酸钠和next_残糖
    
    model = LSTMPredictor(input_size, hidden_size, num_layers, output_size)
    model = model.to(device)
    
    # 定义损失函数和优化器
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    # 训练模型
    best_val_loss = float('inf')
    patience = 10
    counter = 0
    
    for epoch in range(200):  # 增加训练轮数
        # 训练阶段
        model.train()
        train_loss = 0
        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        # 验证阶段
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                batch_X = batch_X.to(device)
                batch_y = batch_y.to(device)
                outputs = model(batch_X)
                val_loss += criterion(outputs, batch_y).item()
        
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/200], 训练损失: {train_loss:.4f}, 验证损失: {val_loss:.4f}')
        
        # # 早停机制
        # if val_loss < best_val_loss:
        #     best_val_loss = val_loss
        #     counter = 0
        #     # 保存最佳模型
        #     torch.save({
        #         'epoch': epoch,
        #         'model_state_dict': model.state_dict(),
        #         'optimizer_state_dict': optimizer.state_dict(),
        #         'train_loss': train_loss,
        #         'val_loss': val_loss,
        #         'X_scaler': X_scaler,
        #         'y_scaler': y_scaler,
        #     }, 'best_fermentation_model.pth')
        # else:
        #     counter += 1
        #     if counter >= patience:
        #          print(f'早停: {patience} 轮验证损失没有改善')
        #          break
    
    print('训练完成！')
    # 添加可视化部分
    plt.style.use('seaborn')
    
    # 加载最佳模型
    checkpoint = torch.load('best_fermentation_model.pth')
    model.load_state_dict(checkpoint['model_state_dict'])
    
    # 可视化预测结果
    predictions, actual_values = visualize_results(model, val_loader, y_scaler, device)
    
    # 保存预测结果到CSV文件
    results_df = pd.DataFrame({
        '实际_酸钠': actual_values[:, 0],
        '预测_酸钠': predictions[:, 0],
        '实际_残糖': actual_values[:, 1],
        '预测_残糖': predictions[:, 1]
    })
    results_df.to_csv('prediction_results.csv', index=False)
    print("\n预测结果已保存到 'prediction_results.csv' 和 'prediction_results.png'")

if __name__ == '__main__':
    main() 

使用设备: cpu
Epoch [10/200], 训练损失: 0.0880, 验证损失: 0.0757
Epoch [20/200], 训练损失: 0.0727, 验证损失: 0.0748
Epoch [30/200], 训练损失: 0.0635, 验证损失: 0.0634
Epoch [40/200], 训练损失: 0.0535, 验证损失: 0.0912
Epoch [50/200], 训练损失: 0.0364, 验证损失: 0.0742
Epoch [60/200], 训练损失: 0.0525, 验证损失: 0.0706
Epoch [70/200], 训练损失: 0.0363, 验证损失: 0.0630
Epoch [80/200], 训练损失: 0.0430, 验证损失: 0.0657
Epoch [90/200], 训练损失: 0.0263, 验证损失: 0.0599
Epoch [100/200], 训练损失: 0.0318, 验证损失: 0.0840
Epoch [110/200], 训练损失: 0.0223, 验证损失: 0.0614
Epoch [120/200], 训练损失: 0.0176, 验证损失: 0.0619
Epoch [130/200], 训练损失: 0.0132, 验证损失: 0.0793
Epoch [140/200], 训练损失: 0.0126, 验证损失: 0.0596
Epoch [150/200], 训练损失: 0.0098, 验证损失: 0.0534
Epoch [160/200], 训练损失: 0.0107, 验证损失: 0.0508
Epoch [170/200], 训练损失: 0.0085, 验证损失: 0.0438
Epoch [180/200], 训练损失: 0.0071, 验证损失: 0.0423
Epoch [190/200], 训练损失: 0.0085, 验证损失: 0.0442
Epoch [200/200], 训练损失: 0.0073, 验证损失: 0.0462
训练完成！


  plt.style.use('seaborn')
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')
  plt.savefig('prediction_results.png')



评估指标:
酸钠 - RMSE: 9.2223, MAE: 8.1924, R2: 0.0520
残糖 - RMSE: 9.6078, MAE: 8.4730, R2: -0.0142

预测结果已保存到 'prediction_results.csv' 和 'prediction_results.png'


In [28]:
# 可视化数据
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score


In [27]:
def visualize_results(model, val_loader, y_scaler, device):
    """可视化预测结果"""
    model.eval()
    predictions = []
    actual_values = []
    
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X = batch_X.to(device)
            outputs = model(batch_X)
            # 将预测值移到CPU并转换为numpy数组
            predictions.append(outputs.cpu().numpy())
            actual_values.append(batch_y.numpy())
    
    # 合并所有批次的结果
    predictions = np.vstack(predictions)
    actual_values = np.vstack(actual_values)
    
    # 将标准化的值转换回原始范围
    predictions = y_scaler.inverse_transform(predictions)
    actual_values = y_scaler.inverse_transform(actual_values)
    # 创建图表
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
    
    # 绘制酸钠预测结果
    ax1.plot(actual_values[:, 0], label='实际值', color='blue', alpha=0.7)
    ax1.plot(predictions[:, 0], label='预测值', color='red', alpha=0.7)
    ax1.set_title('酸钠预测结果对比')
    ax1.set_xlabel('样本')
    ax1.set_ylabel('酸钠值')
    ax1.legend()
    ax1.grid(True)
    
    # 绘制残糖预测结果
    ax2.plot(actual_values[:, 1], label='实际值', color='blue', alpha=0.7)
    ax2.plot(predictions[:, 1], label='预测值', color='red', alpha=0.7)
    ax2.set_title('残糖预测结果对比')
    ax2.set_xlabel('样本')
    ax2.set_ylabel('残糖值')
    ax2.legend()
    ax2.grid(True)
    
    plt.tight_layout()
    plt.savefig('prediction_results.png')
    plt.close()
    # 计算评估指标
    mse = np.mean((predictions - actual_values) ** 2, axis=0)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(predictions - actual_values), axis=0)
    r2 = r2_score(actual_values, predictions, multioutput='raw_values')
    
    print("\n评估指标:")
    print(f"酸钠 - RMSE: {rmse[0]:.4f}, MAE: {mae[0]:.4f}, R2: {r2[0]:.4f}")
    print(f"残糖 - RMSE: {rmse[1]:.4f}, MAE: {mae[1]:.4f}, R2: {r2[1]:.4f}")
    
    return predictions, actual_values
