In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense, Conv1D, MaxPooling1D, Flatten

# 加载数据并填充缺失值
file_path = 'C:/Users/13593/Desktop/dsp/Data Analysis/Merged_Sorted_Data_Herd_Daily.xlsx'
data = pd.read_excel(file_path, sheet_name='Sheet1')
data_filled = data.fillna(data.median(numeric_only=True))

# 将列名中的空格替换为下划线
data_filled = data_filled.rename(columns=lambda x: x.replace(' ', '_'))

# 选择数值型特征列
numeric_cols = data_filled.select_dtypes(include=['float64', 'int64']).columns
X = data_filled[numeric_cols].values  # 将数据转化为数组
print(numeric_cols)

Index(['Lac_Avg_Days', 'Weight', 'Rumination_Minutes', 'Total_feed',
       'Average_cell_count', 'Day_production', 'Expected_Daily_Yield',
       'Fat_indication', 'Fat/Protein_Ratio', 'Protein_indication',
       'Concentrate_/_100_kg_Milk', 'Number_of_milkings',
       'Total_Amount_of_Milk_Produced', 'Amount_of_Milk_Separated'],
      dtype='object')


In [7]:
# 定义时间步长
timesteps = 10
n_features = X.shape[1]

# 将数据转换为LSTM输入格式 (样本数, 时间步数, 特征数)
X_sequence = []
for i in range(len(X) - timesteps):
    X_sequence.append(X[i:i + timesteps])

X_sequence = np.array(X_sequence)  # 3D数组 (样本数, 时间步长, 特征数)

# 将数据集划分为训练集和测试集
X_train, X_test = train_test_split(X_sequence, test_size=0.2, random_state=42)

# CNN + SAE 组合模型
input_seq = Input(shape=(timesteps, n_features))

# CNN 卷积层提取时间序列局部特征
x = Conv1D(filters=64, kernel_size=3, activation='relu')(input_seq)
x = MaxPooling1D(pool_size=2)(x)
x = Flatten()(x)

# 将 CNN 提取的特征通过 SAE 压缩为低维表示
encoded = Dense(64, activation='relu')(x)
encoded = RepeatVector(timesteps)(encoded)

# 解码器部分重构时间序列
decoded = LSTM(64, return_sequences=True, activation='relu')(encoded)
decoded = TimeDistributed(Dense(n_features))(decoded)

# 构建 CNN + SAE 模型
cnn_sae_model = Model(inputs=input_seq, outputs=decoded)
cnn_sae_model.compile(optimizer='adam', loss='mse')

# 训练 CNN + SAE 模型
cnn_sae_model.fit(X_train, X_train, epochs=50, batch_size=32, validation_split=0.2)

# 提取低维特征
encoder = Model(inputs=input_seq, outputs=encoded)
X_train_encoded = encoder.predict(X_train)
X_test_encoded = encoder.predict(X_test)

# 输出低维表示
print("Low-dimensional Representation from CNN + SAE (Train):", X_train_encoded.shape)
print("Low-dimensional Representation from CNN + SAE (Test):", X_test_encoded.shape)

Epoch 1/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - loss: 950471.0000 - val_loss: 535140.5625
Epoch 2/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 362634.0625 - val_loss: 151980.3125
Epoch 3/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 131236.8125 - val_loss: 83478.7500
Epoch 4/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 86115.2109 - val_loss: 40620.5430
Epoch 5/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 35112.8203 - val_loss: 25760.7480
Epoch 6/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 23158.3555 - val_loss: 18809.6387
Epoch 7/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 17233.5566 - val_loss: 15089.1709
Epoch 8/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 14358.9570 - val_los

In [8]:
# 准备目标变量 (预测未来牛奶产量)，假设第0列是牛奶产量
y_sequence = []
for i in range(len(X) - timesteps):
    y_sequence.append(X[i + timesteps, 5])  # 假设第0列是牛奶产量

y_sequence = np.array(y_sequence)  # 目标变量
y_train, y_test = train_test_split(y_sequence, test_size=0.2, random_state=42)

# 将 CNN + SAE 提取的低维特征转换为适合 MLP 的格式
X_train_encoded_flat = X_train_encoded.reshape(X_train_encoded.shape[0], -1)  # 扁平化
X_test_encoded_flat = X_test_encoded.reshape(X_test_encoded.shape[0], -1)  # 扁平化

# 构建 MLP 模型
mlp_model = Sequential()
mlp_model.add(Dense(128, activation='relu', input_shape=(X_train_encoded_flat.shape[1],)))  # 输入层 + 隐藏层1
mlp_model.add(Dense(64, activation='relu'))  # 隐藏层2
mlp_model.add(Dense(32, activation='relu'))  # 隐藏层3
mlp_model.add(Dense(1))  # 输出层，回归任务预测一个值（未来的牛奶产量）

# 编译 MLP 模型
mlp_model.compile(optimizer='adam', loss='mse')

# 训练 MLP 模型
mlp_model.fit(X_train_encoded_flat, y_train, epochs=50, batch_size=32, validation_data=(X_test_encoded_flat, y_test))

# 使用 MLP 模型进行预测
y_pred = mlp_model.predict(X_test_encoded_flat)

# 查看测试集的预测结果
print("Predicted Milk Production:", y_pred[:5])
print("True Milk Production:", y_test[:5])

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 44584.5469 - val_loss: 26.6172
Epoch 2/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 33.3589 - val_loss: 42.2378
Epoch 3/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 31.8766 - val_loss: 34.7798
Epoch 4/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 30.9711 - val_loss: 38.3751
Epoch 5/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 27.8899 - val_loss: 24.0881
Epoch 6/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 24.7809 - val_loss: 25.9887
Epoch 7/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 24.3322 - val_loss: 19.2699
Epoch 8/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 24.2178 - val_loss: 16.2717
Epoch 9/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━

In [9]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")


Mean Squared Error: 15.108878282454716
Mean Absolute Error: 3.0585262032516867
