<a href="https://colab.research.google.com/github/RicardoMiles/Dissertation-template/blob/main/augmented-data-202411221204.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/SummerProject/src')
!pwd

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/SummerProject/src


In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 定义包含CSV文件的文件夹路径
current_dir = os.getcwd()
root_dir = os.path.dirname(current_dir)
data_dir = os.path.join(root_dir, 'Data')
left_data = os.path.join(data_dir,'left')
right_data = os.path.join(data_dir,'right')

folder_path = left_data

# 获取left文件夹中所有CSV文件的列表
csv_files_left = [os.path.join(left_data, file) for file in os.listdir(left_data) if file.endswith('.csv')]
# 获取right文件夹中所有CSV文件的列表
csv_files_right = [os.path.join(right_data, file) for file in os.listdir(right_data) if file.endswith('.csv')]

print(f"Number of left files: {len(csv_files_left)}")
print(f"Number of right files: {len(csv_files_right)}")

Number of left files: 26
Number of right files: 26


In [3]:
from scipy import stats
# 定义特征提取函数
def extract_features(group):
    features = []
    # 遍历第三列到最后一列
    for col in group.columns[2:]:  # 从第三列开始提取特征
        data = group[col].values
        features.append(data.mean())  # 均值
        features.append(data.std())   # 标准差
        features.append(stats.skew(data))  # 偏度
        features.append(stats.kurtosis(data))  # 峰度
    return features

In [72]:
# 初始化存储所有特征和标签的列表
all_features = []
all_labels = []

# 初始化CSV文件计数器
csv_files_count = 0

# 处理left文件夹中的CSV文件
for file_name in csv_files_left:
    df = pd.read_csv(file_name)
    csv_files_count += 1  # 增加文件计数
    # 将数据按照epoch分组
    grouped = df.groupby(df.columns[1])  # 第二列是epoch分组的列
    for name, group in grouped:
        # 过滤掉time小于0的数据
        group = group[group[df.columns[0]] >= 0]  # 假设time列是第一列
        if len(group) == 81:  # 确保过滤后有81条数据
        # if len(group) == 101:
            # 提取特征
            features = group.iloc[:, 2:].values  # 第三列及以后是EEG通道信号
            all_features.append(features)
            # 获取标签
            all_labels.append('left')

# 处理right文件夹中的CSV文件
for file_name in csv_files_right:
    df_right = pd.read_csv(file_name)
    csv_files_count += 1  # 增加文件计数
    # 将数据按照epoch分组
    grouped_right = df_right.groupby(df_right.columns[1])  # 第二列是epoch分组的列
    for name, group in grouped_right:
        # 过滤掉time小于0的数据
        group = group[group[df_right.columns[0]] >= 0]  # 假设time列是第一列
        if len(group) == 81:  # 确保过滤后有81条数据
        # if len(group) == 101:
            # 提取特征
            features = group.iloc[:, 2:].values  # 第三列及以后是EEG通道信号
            all_features.append(features)
            # 获取标签
            all_labels.append('right')

# 将特征和标签转换为NumPy数组
features_array = np.array(all_features)
labels_array = np.array(all_labels)

# 打印处理的CSV文件数量
print(f"Total CSV files processed: {csv_files_count}")

Total CSV files processed: 52


In [106]:
import numpy as np
from sklearn.model_selection import train_test_split

# 定义衡量分布相似性的函数
def evaluate_split_similarity(X_train, X_test):
    train_mean = X_train.mean(axis=0)
    test_mean = X_test.mean(axis=0)
    # 使用均方差衡量分布差异
    return np.mean((train_mean - test_mean) ** 2)

best_seed = None
min_difference = float('inf')

for seed in range(100):  # 遍历多个种子
    # 将 X 和 y 替换为 features_array 和 labels_array
    X_train, X_test, y_train, y_test = train_test_split(features_array, labels_array, test_size=0.2, random_state=seed)
    diff = evaluate_split_similarity(X_train, X_test)
    if diff < min_difference:
        min_difference = diff
        best_seed = seed

print(f"Best seed: {best_seed}, Minimum difference: {min_difference}")


Best seed: 76, Minimum difference: 0.10239083541940588


In [73]:
print(f"Total samples: {features_array.shape[0]}")  # 样本数量
print(f"Timepoints per sample: {features_array.shape[1]}")  # 每个样本的时间点
print(f"Features per timepoint: {features_array.shape[2]}")  # 每个时间点的特征


Total samples: 7220
Timepoints per sample: 81
Features per timepoint: 101


In [99]:
from datetime import datetime
import tensorflow as tf

np.random.seed(76)
tf.random.set_seed(76)

# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(features_array, labels_array, test_size=0.2, random_state=42)

# 打印当前处理的时间
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"Current processing time: {current_time}")

Current processing time: 2024-11-22 11:00:06


In [113]:
print(f"Initial X_train shape: {X_train.shape}")
print(f"Initial X_test shape: {X_test.shape}")

Initial X_train shape: (11552, 81, 101)
Initial X_test shape: (1444, 81, 101)


In [114]:
# 数据增强 (增加训练集的多样性)
noise_factor = 0.1
X_train_augmented = X_train + noise_factor * np.random.normal(size=X_train.shape)

# 如果数据增强后想替代原训练集，可以直接覆盖：
X_train = np.vstack((X_train, X_train_augmented))
y_train = np.hstack((y_train, y_train))  # 扩展标签，假设数据增强不改变标签

# 数据标准化
# 获取数据的维度信息
n_samples, n_timesteps, n_features = X_train.shape

# 遍历每个特征进行标准化
scalers = []  # 用于保存每个特征的标准化器
for i in range(n_features):
    scaler = StandardScaler()  # 为每个特征创建独立标准化器
    # 对训练集的每个特征标准化
    X_train[:, :, i] = scaler.fit_transform(X_train[:, :, i])
    # 对测试集的每个特征标准化
    X_test[:, :, i] = scaler.transform(X_test[:, :, i])
    scalers.append(scaler)  # 保存该特征的标准化器

    # 打印标准化后该特征的均值和标准差
    print(f"Feature {i} - Train mean: {X_train[:, :, i].mean():.5f}, Train std: {X_train[:, :, i].std():.5f}")
    print(f"Feature {i} - Test mean: {X_test[:, :, i].mean():.5f}, Test std: {X_test[:, :, i].std():.5f}")

# 打印最终形状，确保没有改变
print(f"X_train shape after scaling: {X_train.shape}")
print(f"X_test shape after scaling: {X_test.shape}")


Feature 0 - Train mean: 0.00000, Train std: 1.00000
Feature 0 - Test mean: -0.01397, Test std: 1.01018
Feature 1 - Train mean: -0.00000, Train std: 1.00000
Feature 1 - Test mean: -0.01167, Test std: 1.02506
Feature 2 - Train mean: -0.00000, Train std: 1.00000
Feature 2 - Test mean: -0.00441, Test std: 1.01323
Feature 3 - Train mean: 0.00000, Train std: 1.00000
Feature 3 - Test mean: -0.00876, Test std: 0.99427
Feature 4 - Train mean: -0.00000, Train std: 1.00000
Feature 4 - Test mean: -0.01991, Test std: 0.98973
Feature 5 - Train mean: 0.00000, Train std: 1.00000
Feature 5 - Test mean: -0.03362, Test std: 1.04537
Feature 6 - Train mean: -0.00000, Train std: 1.00000
Feature 6 - Test mean: 0.01633, Test std: 1.01341
Feature 7 - Train mean: -0.00000, Train std: 1.00000
Feature 7 - Test mean: -0.00246, Test std: 1.00679
Feature 8 - Train mean: -0.00000, Train std: 1.00000
Feature 8 - Test mean: -0.03391, Test std: 0.97638
Feature 9 - Train mean: 0.00000, Train std: 1.00000
Feature 9 - Test

In [115]:
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 初始化标签编码器
label_encoder = LabelEncoder()

# 将训练集和测试集的标签转换为数值
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# 确保转换后的标签是正确的数据类型
y_train_encoded = y_train_encoded.astype('int32')
y_test_encoded = y_test_encoded.astype('int32')

In [116]:
mean_diff = np.abs(X_train.mean(axis=0) - X_test.mean(axis=0))
std_diff = np.abs(X_train.std(axis=0) - X_test.std(axis=0))

print("Mean differences:", mean_diff)
print("Standard deviation differences:", std_diff)


Mean differences: [[3.37241854e-04 6.87156399e-03 1.79684577e-03 ... 1.41793396e-02
  3.33282879e-02 6.08989240e-02]
 [6.64291291e-04 1.92440216e-02 1.18605785e-02 ... 2.60691529e-02
  5.43354333e-02 6.79238397e-02]
 [1.19866106e-03 1.43251492e-02 8.93471500e-03 ... 2.58806970e-02
  3.43823334e-02 6.28413570e-02]
 ...
 [6.51485050e-03 6.21707397e-03 1.44112486e-02 ... 2.82699617e-03
  7.95581256e-05 6.23286344e-02]
 [8.48273665e-03 1.47875201e-02 2.04747564e-02 ... 9.91116080e-03
  3.95445301e-03 5.93630617e-02]
 [5.03086636e-03 1.16587860e-02 2.05010026e-02 ... 6.47689070e-03
  5.76516174e-03 5.41574191e-02]]
Standard deviation differences: [[0.04518907 0.03333864 0.00735574 ... 0.03441924 0.08676005 0.02991205]
 [0.01854414 0.03550021 0.0026479  ... 0.01855029 0.14991921 0.03794212]
 [0.00494061 0.03906155 0.03295384 ... 0.01929248 0.16633659 0.04211394]
 ...
 [0.00684873 0.01066254 0.07231851 ... 0.05737158 0.01775333 0.03345705]
 [0.00805849 0.01147021 0.07786249 ... 0.05715694 0.0

In [117]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, InputLayer
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

# 假设 X_train, X_test, y_train, y_test 已经定义并准备好了
# 构建LSTM模型
model = Sequential()
model.add(InputLayer(input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.3))  # 添加Dropout层
model.add(LSTM(units=50))
model.add(Dropout(0.3))  # 添加Dropout层
model.add(Dense(1, activation='sigmoid'))  # 二分类问题

# 编译模型
model.compile(Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# 早停法
# early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, restore_best_weights=True)

# 训练模型
model.fit(X_train, y_train_encoded, epochs=35, batch_size=32, validation_data=(X_test, y_test_encoded))

# 评估模型
loss, accuracy = model.evaluate(X_test, y_test_encoded, verbose=0)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# 打印当前处理的时间
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"Current processing time: {current_time}")



Epoch 1/35
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.5239 - loss: 0.6937 - val_accuracy: 0.5235 - val_loss: 0.6952
Epoch 2/35
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.5672 - loss: 0.6779 - val_accuracy: 0.5173 - val_loss: 0.7011
Epoch 3/35
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.5990 - loss: 0.6598 - val_accuracy: 0.5097 - val_loss: 0.7191
Epoch 4/35
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.6468 - loss: 0.6276 - val_accuracy: 0.5028 - val_loss: 0.7532
Epoch 5/35
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.6882 - loss: 0.5894 - val_accuracy: 0.5062 - val_loss: 0.8041
Epoch 6/35
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7340 - loss: 0.5399 - val_accuracy: 0.5173 - val_loss: 0.8582
Epoch 7/35
[1m722/722[0m 

In [118]:
# 预测测试集
predictions = model.predict(X_test)

# 将概率转换为类别
predicted_classes = (predictions > 0.5).astype("int32")

# 打印预测结果
print(predicted_classes)

# 打印当前处理的时间
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"Current processing time: {current_time}")

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[[0]
 [1]
 [0]
 ...
 [0]
 [0]
 [1]]
Current processing time: 2024-11-22 11:45:56


In [119]:
# 计算精确度、召回率和F1分数
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

precision = precision_score(y_test_encoded, predicted_classes)
recall = recall_score(y_test_encoded, predicted_classes)
f1 = f1_score(y_test_encoded, predicted_classes)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# 计算混淆矩阵
cm = confusion_matrix(y_test_encoded, predicted_classes)
print("Confusion Matrix:")
print(cm)

# 打印当前处理的时间
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"Current processing time: {current_time}")

Precision: 0.4962852897473997
Recall: 0.4744318181818182
F1 Score: 0.4851125635439361
Confusion Matrix:
[[401 339]
 [370 334]]
Current processing time: 2024-11-22 11:45:59
