Split the dataset into 20 parts. Each part is used in turn to train a model, and every trained model is then tested on all 20 parts.

In [None]:
#! conda install -c conda-forge imbalanced-learn
# conda install seaborn

In [1]:
import torch
# Sanity check: print whether PyTorch reports CUDA as available.
print(torch.cuda.is_available())

True


In [2]:
# Select the compute device: GPU index 3 when CUDA is available, otherwise the CPU.
device = torch.device("cuda:3") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Number of input features per sample; MLP uses it as the input width of its first Linear layer.
in_features = 400

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, log_loss
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, roc_auc_score
import seaborn as sns
import time
import pickle
import csv

In [5]:
# Define the PyTorch model
class MLP(nn.Module):
    """Fully connected network: input -> 256 -> 32 -> 2 output logits.

    Args:
        input_dim: Width of the input feature vector. When omitted, the
            notebook-level ``in_features`` constant is used, which is the
            behaviour of the original zero-argument ``MLP()`` call.
    """

    def __init__(self, input_dim=None):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: read the notebook-level constant.
            input_dim = in_features
        # The layer order is unchanged, so state_dict keys remain
        # layers.0.*, layers.2.* and layers.4.*.
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 32),
            nn.ReLU(),
            nn.Linear(32, 2)
        )

    def forward(self, x):
        """Return the raw (un-normalised) two-class logits for ``x``."""
        return self.layers(x)

def run_pytorch_MLP(X_train, X_test, y_train, y_test,
                    batch_size, learning_rate, weight_decay, num_train_epochs):
    """Train a new ``MLP`` on SMOTE-balanced data, logging losses every epoch.

    The training split is oversampled with SMOTE
    (``sampling_strategy='minority'``), all four arrays are converted to
    tensors on the module-level ``device``, and a freshly constructed ``MLP``
    is optimised with Adam against a cross-entropy loss. After each epoch the
    loss on ``X_test``/``y_test`` is evaluated in a single forward pass and a
    progress line is printed.

    Args:
        X_train: Training features, as accepted by ``SMOTE.fit_resample``.
        X_test: Held-out features used for the per-epoch validation loss.
        y_train: Training labels.
        y_test: Held-out labels.
        batch_size: Mini-batch size of the training ``DataLoader``.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay.
        num_train_epochs: Number of passes over the resampled training data.

    Returns:
        ``(model, train_losses, validation_losses)``: the trained model and
        two lists holding one float per epoch (mean batch training loss and
        validation loss).
    """
    # Oversample the minority class so both classes are equally represented.
    X_train, y_train = SMOTE(sampling_strategy='minority').fit_resample(X_train, y_train)

    def _on_device(array, dtype):
        # Copy the array into a tensor of the requested dtype on the target device.
        return torch.tensor(array, dtype=dtype).to(device)

    X_train = _on_device(X_train, torch.float32)
    y_train = _on_device(y_train, torch.long)
    X_test = _on_device(X_test, torch.float32)
    y_test = _on_device(y_test, torch.long)

    # Shuffled mini-batches over the (already on-device) training tensors.
    train_loader = DataLoader(
        TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)

    # Fresh model, loss and optimiser for this run.
    model = MLP()
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    train_losses, validation_losses = [], []

    for epoch in range(num_train_epochs):
        # Training pass.
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            # Batches are slices of tensors that already live on `device`,
            # so no further transfer is needed here.
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        train_losses.append(running_loss / len(train_loader))

        # Validation pass over the whole held-out set, without gradients.
        model.eval()
        with torch.no_grad():
            validation_loss = criterion(model(X_test), y_test)
        validation_losses.append(validation_loss.item())

        print(f'Epoch {epoch+1} - Train Loss: {running_loss / len(train_loader):.3f}, Validation Loss: {validation_loss.item():.3f}')

    return model, train_losses, validation_losses

In [None]:
# Cross-shard experiment. For every shard i: split it 90/10, standardise it,
# train a model, then score that model on each of the 20 shards j and append
# one comma-separated line per (i, j) pair to training_record.txt.
# Line layout: i, training time, j, prediction time, precision, recall, F1, AUC.
with open("training_record.txt", mode='a') as file:

    for i in range(20):

        # Load shard i (features and labels) from its pickle files.
        with open(f'/data2/xpgeng/iPC815/X_{i}.pickle', 'rb') as f:
            X_train = pickle.load(f)
        with open(f'/data2/xpgeng/iPC815/y_{i}.pickle', 'rb') as f:
            y_train = pickle.load(f)
        print('data read')

        # Hold out 10% of the shard for the per-epoch validation loss.
        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=0.1, random_state=42)

        # Standardise using statistics of the training part only.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)

        # Train the model and measure the wall-clock training time.
        start_time = time.time()
        num_train_epochs = 20
        model, train_losses, validation_losses = run_pytorch_MLP(
            X_train, X_val, y_train, y_val,
            batch_size=500,
            learning_rate=1e-4,
            weight_decay=1e-3,
            num_train_epochs=num_train_epochs,
        )
        training_time = time.time() - start_time

        for j in range(20):
            # Load evaluation shard j.
            with open(f'/data2/xpgeng/iPC815/X_{j}.pickle', 'rb') as f:
                X_test = pickle.load(f)
            with open(f'/data2/xpgeng/iPC815/y_{j}.pickle', 'rb') as f:
                y_test = pickle.load(f)
            print('test read')

            # Scale with the scaler fitted on shard i, then move to the device.
            X_test = scaler.transform(X_test)
            X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
            # Labels are used exactly as loaded (presumably already a NumPy array).
            y_test_np = y_test

            # Time the forward pass plus the copy of the predictions back to the host.
            model.eval()
            start_time = time.time()
            with torch.no_grad():
                outputs = model(X_test_tensor)
                predicted = torch.max(outputs, 1)[1].cpu().numpy()
            end_time = time.time()
            print(f"Prediction time: {end_time - start_time} seconds")

            # Confusion matrix.
            conf_matrix = confusion_matrix(y_test_np, predicted)
            print("Confusion Matrix:")
            print(conf_matrix)

            # Precision, recall and F1 for the positive class.
            precision, recall, f1, _ = precision_recall_fscore_support(
                y_test_np, predicted, average='binary')
            print(f"Precision: {precision:.4f}")
            print(f"Recall: {recall:.4f}")
            print(f"F1-Score: {f1:.4f}")

            # AUC from the softmax probability of class 1.
            probabilities = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
            auc_score = roc_auc_score(y_test_np, probabilities)
            print(f"AUC Score: {auc_score:.4f}")

            # One record line per (train shard, test shard) pair.
            fields = (
                i,
                round(training_time, 3),
                j,
                round(end_time - start_time, 3),
                round(precision, 3),
                round(recall, 3),
                round(f1, 3),
                round(auc_score, 3),
            )
            record = ", ".join(f"{value}" for value in fields) + "\n"

            file.write(record)

data read
Epoch 1 - Train Loss: 0.041, Validation Loss: 0.004
Epoch 2 - Train Loss: 0.004, Validation Loss: 0.004
Epoch 3 - Train Loss: 0.004, Validation Loss: 0.003
Epoch 4 - Train Loss: 0.004, Validation Loss: 0.003
Epoch 5 - Train Loss: 0.004, Validation Loss: 0.004
Epoch 6 - Train Loss: 0.004, Validation Loss: 0.003
Epoch 7 - Train Loss: 0.004, Validation Loss: 0.003
Epoch 8 - Train Loss: 0.004, Validation Loss: 0.003
Epoch 9 - Train Loss: 0.004, Validation Loss: 0.003
Epoch 10 - Train Loss: 0.004, Validation Loss: 0.003
Epoch 11 - Train Loss: 0.004, Validation Loss: 0.003
Epoch 12 - Train Loss: 0.004, Validation Loss: 0.003
Epoch 13 - Train Loss: 0.004, Validation Loss: 0.003
Epoch 14 - Train Loss: 0.004, Validation Loss: 0.003
Epoch 15 - Train Loss: 0.004, Validation Loss: 0.003
Epoch 16 - Train Loss: 0.004, Validation Loss: 0.003
Epoch 17 - Train Loss: 0.004, Validation Loss: 0.004
Epoch 18 - Train Loss: 0.004, Validation Loss: 0.004
Epoch 19 - Train Loss: 0.004, Validation Loss

AUC Score: 1.0000
test read
Prediction time: 0.0902099609375 seconds
Confusion Matrix:
[[2493349     331]
 [   1162 2005158]]
Precision: 0.9998
Recall: 0.9994
F1-Score: 0.9996
AUC Score: 1.0000
test read
Prediction time: 0.0894780158996582 seconds
Confusion Matrix:
[[2493465     339]
 [   1262 2004934]]
Precision: 0.9998
Recall: 0.9994
F1-Score: 0.9996
AUC Score: 1.0000
test read
Prediction time: 0.08940005302429199 seconds
Confusion Matrix:
[[2493539     363]
 [   1227 2004871]]
Precision: 0.9998
Recall: 0.9994
F1-Score: 0.9996
AUC Score: 1.0000
test read
Prediction time: 0.09197473526000977 seconds
Confusion Matrix:
[[2492857     352]
 [   1261 2005530]]
Precision: 0.9998
Recall: 0.9994
F1-Score: 0.9996
AUC Score: 1.0000
test read
Prediction time: 0.08774375915527344 seconds
Confusion Matrix:
[[2493118     340]
 [   1249 2005293]]
Precision: 0.9998
Recall: 0.9994
F1-Score: 0.9996
AUC Score: 1.0000
test read
Prediction time: 0.08124303817749023 seconds
Confusion Matrix:
[[2249869     

AUC Score: 1.0000
test read
Prediction time: 0.13457179069519043 seconds
Confusion Matrix:
[[2493544      13]
 [   1010 2005433]]
Precision: 1.0000
Recall: 0.9995
F1-Score: 0.9997
AUC Score: 1.0000
test read
Prediction time: 0.12105226516723633 seconds
Confusion Matrix:
[[2492682      21]
 [   1035 2006262]]
Precision: 1.0000
Recall: 0.9995
F1-Score: 0.9997
AUC Score: 1.0000
test read
Prediction time: 0.14455556869506836 seconds
Confusion Matrix:
[[2494599      17]
 [   1014 2004370]]
Precision: 1.0000
Recall: 0.9995
F1-Score: 0.9997
AUC Score: 1.0000
test read
Prediction time: 0.14102435111999512 seconds
Confusion Matrix:
[[2493012      21]
 [   1006 2005961]]
Precision: 1.0000
Recall: 0.9995
F1-Score: 0.9997
AUC Score: 1.0000
test read
Prediction time: 0.14291620254516602 seconds
Confusion Matrix:
[[2495413      19]
 [   1021 2003547]]
Precision: 1.0000
Recall: 0.9995
F1-Score: 0.9997
AUC Score: 1.0000
test read
Prediction time: 0.14476394653320312 seconds
Confusion Matrix:
[[2493660

Train on all data

In [6]:
def load_data(filename):
    """Return the object unpickled from the file at ``filename``.

    NOTE: ``pickle.load`` can execute arbitrary code; only use this on
    files from a trusted source.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

def run_pytorch_MLP(model, X_train, X_test, y_train, y_test,
                    batch_size, learning_rate, weight_decay, num_train_epochs, device):
    """Continue training ``model`` on one SMOTE-balanced chunk of data.

    The training chunk is oversampled with SMOTE
    (``sampling_strategy='minority'``), the arrays are converted to tensors on
    ``device``, and ``model`` is trained with a newly created Adam optimiser
    against a cross-entropy loss. After each epoch the loss on
    ``X_test``/``y_test`` is computed in one forward pass and a progress line
    is printed.

    Args:
        model: The network to train; it is moved to ``device`` and updated in place.
        X_train: Training features, as accepted by ``SMOTE.fit_resample``.
        X_test: Features used for the per-epoch validation loss.
        y_train: Training labels.
        y_test: Labels matching ``X_test``.
        batch_size: Mini-batch size of the training ``DataLoader``.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay.
        num_train_epochs: Number of passes over the resampled training data.
        device: Torch device on which the tensors and the model are placed.

    Returns:
        The same ``model`` object after training.
    """
    # Balance the classes by oversampling the minority class.
    oversampler = SMOTE(sampling_strategy='minority')
    X_train, y_train = oversampler.fit_resample(X_train, y_train)

    # Features become float32 tensors, labels become int64 tensors, all on `device`.
    X_train, X_test = (
        torch.tensor(features, dtype=torch.float32).to(device)
        for features in (X_train, X_test)
    )
    y_train, y_test = (
        torch.tensor(targets, dtype=torch.long).to(device)
        for targets in (y_train, y_test)
    )

    # Shuffled mini-batches over the on-device training tensors.
    train_loader = DataLoader(
        TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)

    # The caller supplies the model; the optimiser is created anew on every call.
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    for epoch in range(num_train_epochs):
        # Training pass; per-batch losses are collected and summed afterwards.
        model.train()
        batch_losses = []
        for inputs, labels in train_loader:
            # Batches already live on `device`, so no transfer is needed.
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        running_loss = sum(batch_losses, 0.0)

        # Validation pass without gradient tracking.
        model.eval()
        with torch.no_grad():
            validation_loss = criterion(model(X_test), y_test)

        print(f'Epoch {epoch+1} - Train Loss: {running_loss / len(train_loader):.3f}, Validation Loss: {validation_loss.item():.3f}')

    return model

In [None]:
# Hyperparameters and compute device
batch_size = 500
learning_rate = 0.0001
weight_decay = 0.001
num_train_epochs = 20
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")

a = time.time()
# Load the evaluation set (assumed to stay fixed for the whole run).
# NOTE(review): this is shard 0, which is also the first training shard, so
# the reported "validation" loss is measured on data the model trains on.
# Confirm this is intended.
X_test = load_data('/data2/xpgeng/iPC815/X_0.pickle')
y_test = load_data('/data2/xpgeng/iPC815/y_0.pickle')

# One model and one scaler are shared by all shards.
model = MLP()
scaler = StandardScaler()

# Train on each data shard in turn, continuing from the previous weights.
for i in range(20):
    X_train = load_data(f'/data2/xpgeng/iPC815/X_{i}.pickle')
    y_train = load_data(f'/data2/xpgeng/iPC815/y_{i}.pickle')

    if i == 0:
        # Fit the scaler once, on the first shard, and scale the evaluation
        # set with the same statistics.
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    else:
        # Reuse the shard-0 statistics. Refitting on every shard would scale
        # each shard differently from the evaluation set and from the data
        # the model has already been trained on.
        X_train = scaler.transform(X_train)

    # Continue training the same model on this shard.
    model = run_pytorch_MLP(model, X_train, X_test, y_train, y_test, batch_size, learning_rate, weight_decay, num_train_epochs, device)

# Save the trained weights.
# NOTE(review): the fitted scaler is not saved in this cell, so inference code
# must apply the same scaling for the saved weights to be usable.
torch.save(model.state_dict(), 'mlp_model.pth')

b = time.time()

# Record the total wall-clock time (data loading + training + saving).
with open("all_time.txt", "w") as file:
    file.write(f"all time: {b-a:.2f} seconds\n")

In [6]:
# Number of input features per sample, read by MLP for its first Linear layer
# (re-declared here with the same value as in the earlier cell).
in_features = 400

In [14]:
1

1