After the first autoencoder compresses each gene's feature vector from 400 to 3 dimensions, Autoencoder2 is used to compress the gene dimension from 4303 to 400. Here we append one zero row so that the gene count changes from 4303 to 4304. The aim is to make our model … (this sentence is left unfinished in the original notes).

To avoid confusion, the names `Autoencoder` and `model` continue to refer to model 1, while `Autoencoder2` and `model2` refer to model 2.

### Data Preparation 

In [1]:
from Bio import SeqIO
import numpy as np
from collections import Counter
from sklearn.decomposition import PCA

# Read a FASTA file and return a gene -> sequence dictionary.
def read_fasta(file_path):
    """Parse *file_path* as FASTA and map every record ID to its sequence.

    Args:
        file_path: Path handed to ``SeqIO.parse`` with the ``"fasta"`` format.

    Returns:
        dict: ``record.id`` (the FASTA header ID, used as the gene name)
        mapped to ``str(record.seq)``. If an ID occurs more than once, the
        last record with that ID wins.
    """
    return {
        record.id: str(record.seq)
        for record in SeqIO.parse(file_path, "fasta")
    }

# Load the FASTA file into the gene -> sequence dictionary.
file_path = "E.coli.tag_seq.fasta"  # replace with the path to your FASTA file
gene_sequence_dict = read_fasta(file_path)

# Sanity check: show the first five entries (first 30 residues of each).
for gene in list(gene_sequence_dict)[:5]:
    sequence = gene_sequence_dict[gene]
    print(f"Gene: {gene}, Sequence: {sequence[:30]}...")

# Enumerate every ordered pair of the 20 standard amino acids (20 * 20 = 400
# 2-mers) and record the feature-vector column assigned to each pair.
standard_amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
all_2mers = [
    first + second
    for first in standard_amino_acids
    for second in standard_amino_acids
]
two_mer_index = dict(zip(all_2mers, range(len(all_2mers))))

# Build the gene -> 400-dim 2-mer frequency vector dictionary.
two_mer_dict = {}

for gene, sequence in gene_sequence_dict.items():
    # Drop every character that is not one of the 20 standard amino acids.
    # The residues on either side of a dropped character become adjacent.
    residues = ''.join(aa for aa in sequence if aa in standard_amino_acids)

    # Count each adjacent residue pair (overlapping windows of length 2).
    pair_counts = Counter(map(''.join, zip(residues, residues[1:])))
    n_pairs = sum(pair_counts.values())

    # Start from zeros so that 2-mers absent from the sequence stay at 0.
    profile = np.zeros(400)
    for pair, occurrences in pair_counts.items():
        column = two_mer_index.get(pair)
        if column is not None:
            # Store the relative frequency rather than the raw count.
            profile[column] = occurrences / n_pairs

    two_mer_dict[gene] = profile

# Sanity check: print the feature vectors of the first five genes.
for gene in list(two_mer_dict)[:5]:
    feature_vector = two_mer_dict[gene]
    print(f"Gene: {gene}")
    print(f"2-mer Feature Vector: {feature_vector}")

Gene: b0001, Sequence: MKRISTTITTTITITTGNGAG...
Gene: b0002, Sequence: MRVLKFGGTSVANAERFLRVADILESNARQ...
Gene: b0003, Sequence: MVKVYAPASSANMSVGFDVLGAAVTPVDGA...
Gene: b0004, Sequence: MKLYNLKDHNEQVSFAQAVTQGLGKNQGLF...
Gene: b0005, Sequence: MKKMQSIVLALSLVLVAPMAAQAAEITLVP...
Gene: b0001
2-mer Feature Vector: [0.   0.   0.   0.   0.   0.05 0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.05 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.05
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0

### AutoEncoder2 training 

#### Cross-Validate 

In [None]:
# cross-validation no batch in training dataloader - batch when read data - 6 epochs - test only calculate mse
import torch
import random
import pickle
import numpy as np
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import os
import traceback

# All-zero 400-dim feature row; stacked under each 4303-gene feature matrix so
# that the gene count becomes 4304.
zero_row = np.zeros(shape=(1, 400))

# Define the model
class Autoencoder(torch.nn.Module):
    """Model 1: fully connected autoencoder over the 400-dim 2-mer features.

    The encoder maps 400 -> 256 -> 128 -> 3 with ReLU activations and dropout
    (p=0.35) after each hidden layer; the decoder mirrors it (3 -> 128 -> 256
    -> 400) without dropout. Layer order inside each ``Sequential`` determines
    the ``state_dict`` keys, so it must stay as is for saved weights to load.
    """

    def __init__(self):
        super().__init__()
        encoder_layers = [
            nn.Linear(400, 256),
            nn.ReLU(),
            nn.Dropout(0.35),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.35),
            nn.Linear(128, 3),
        ]
        decoder_layers = [
            nn.Linear(3, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 400),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of *x* (encode, then decode)."""
        return self.decoder(self.encoder(x))
    
class Autoencoder2(torch.nn.Module):
    """Model 2: fully connected autoencoder over the padded gene axis.

    The encoder maps 4304 -> 3000 -> 1000 -> 400 with ReLU activations and
    dropout (p=0.2, then p=0.3) after the hidden layers; the decoder mirrors
    it (400 -> 1000 -> 3000 -> 4304) without dropout.
    """

    def __init__(self):
        super().__init__()
        encoder_layers = [
            nn.Linear(4304, 3000),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(3000, 1000),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1000, 400),
        ]
        decoder_layers = [
            nn.Linear(400, 1000),
            nn.ReLU(),
            nn.Linear(1000, 3000),
            nn.ReLU(),
            nn.Linear(3000, 4304),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of *x* (encode, then decode)."""
        return self.decoder(self.encoder(x))

# Pick the compute device. GPU index 2 is preferred, but `is_available()` only
# says that *some* GPU exists, so verify that index 2 is really present and
# fall back to the first GPU (or the CPU) instead of failing later on `.to()`.
if torch.cuda.is_available() and torch.cuda.device_count() > 2:
    device = torch.device("cuda:2")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Progress and error logs are appended to these files in the working directory.
log_file = os.path.join(os.getcwd(), "ae2_crossval_training_log_test1.txt")
error_log_file = os.path.join(os.getcwd(), "ae2_crossval_error_log_test1.txt")

# Function to load and update del_twogenes files in batches
def load_and_update_del_twogenes(file_idx, batch_size):
    """Load ``del_twogenes_<file_idx>.pkl`` and return its entries in batches.

    Args:
        file_idx: Index interpolated into the pickle file name.
        batch_size: Maximum number of keys per returned batch.

    Returns:
        list[dict]: One dict per batch, in the key order of the pickled
        mapping. Every value is replaced by ``random.sample(value, 4303)``,
        i.e. 4303 of its elements in random order.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(f'del_twogenes_{file_idx}.pkl', 'rb') as handle:
        gene_lists = pickle.load(handle)

    ordered_keys = list(gene_lists.keys())
    key_chunks = [
        ordered_keys[start:start + batch_size]
        for start in range(0, len(ordered_keys), batch_size)
    ]

    shuffled_batches = []
    for chunk in tqdm(key_chunks, desc=f"Processing file {file_idx}"):
        # random.sample returns a new list; random.shuffle would return None.
        shuffled = {}
        for key in chunk:
            shuffled[key] = random.sample(gene_lists[key], 4303)
        shuffled_batches.append(shuffled)

    return shuffled_batches

# Redirect print to a file
def log_print(message):
    """Append *message* plus a newline to ``log_file``, then print it."""
    with open(log_file, "a") as handle:
        line = message + "\n"
        handle.write(line)
    print(message)

# Log error messages to a file
def log_error(message):
    """Append *message* plus a newline to ``error_log_file`` and echo it."""
    with open(error_log_file, "a") as handle:
        line = message + "\n"
        handle.write(line)
    # Echo to stdout so it is visible that an error was recorded.
    print(f"Error logged: {message}")

# Training function
def train_autoencoder(train_data, model, criterion, optimizer, num_epochs=6):
    """Train *model* to reconstruct the items of *train_data*.

    *train_data* is iterated directly (no DataLoader), so every optimizer step
    uses a single item along its first axis. Each call appends a CSV header
    and one ``epoch,mean loss`` line per epoch to ``log_file``, and also
    reports each epoch through ``log_print``.

    Args:
        train_data: Sized iterable of tensors; each item is moved to the
            module-level ``device`` before the forward pass.
        model: Network to optimise; switched to training mode.
        criterion: Loss called as ``criterion(outputs, inputs)``.
        optimizer: Optimizer stepped once per item.
        num_epochs: Number of passes over *train_data*.
    """
    model.train()

    # A header row is written on every call, not once per log file.
    with open(log_file, "a") as log_handle:
        log_handle.write("Epoch,Train Loss MSE\n")

    for epoch_number in range(1, num_epochs + 1):
        running_loss = 0.0
        progress = tqdm(train_data, desc=f"Epoch {epoch_number}/{num_epochs}", leave=False)
        for sample in progress:
            batch = sample.to(device)

            optimizer.zero_grad()
            loss = criterion(model(batch), batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        mean_loss = running_loss / len(train_data)

        # Machine-readable CSV line first, then the human-readable message.
        with open(log_file, "a") as log_handle:
            log_handle.write(f"{epoch_number},{mean_loss}\n")

        log_print(f"Epoch {epoch_number}/{num_epochs}, Loss: {mean_loss:.8f}")

# Evaluation function (MSE computation, since we're not using labels in unsupervised learning)
def evaluate_mse(model, test_loader):
    """Return the mean squared reconstruction error of *model* on *test_loader*.

    Each batch's MSE is weighted by its number of elements, so a shorter final
    batch no longer counts as much as a full one; with equally sized batches
    the result equals the plain mean of the per-batch MSEs.

    Args:
        model: Autoencoder-style module; it is switched to eval mode and left
            in that mode.
        test_loader: Iterable yielding input tensors (no labels).

    Returns:
        float: Mean squared error over every element seen.

    Raises:
        ValueError: If *test_loader* yields no elements.
    """
    model.eval()
    criterion = nn.MSELoss()

    # Run each batch on the device that holds the model's weights instead of
    # relying on a module-level device; a parameter-free model is fed as is.
    first_param = next(model.parameters(), None)

    weighted_error = 0.0
    element_count = 0
    with torch.no_grad():
        for inputs in test_loader:
            if first_param is not None:
                inputs = inputs.to(first_param.device)
            outputs = model(inputs)

            # Compare the reconstruction with the original input.
            n_elements = inputs.numel()
            weighted_error += criterion(outputs, inputs).item() * n_elements
            element_count += n_elements

    if element_count == 0:
        raise ValueError("evaluate_mse received no data: test_loader is empty")
    return weighted_error / element_count

def _encode_and_normalize(encoder, genes):
    """Encode one gene list with *encoder* and min-max scale the result.

    Looks up each gene's 2-mer vector in the module-level ``two_mer_dict``,
    appends ``zero_row`` as a padding row, runs the stack through *encoder* on
    ``device`` and returns the transposed codes as a NumPy array (one row per
    latent dimension, one column per gene plus the padding column).

    The array is rescaled to [0, 1] with its global minimum and maximum. A
    constant array (which includes the all-zero case) is returned unchanged so
    that no division by zero occurs.
    """
    features = np.array([two_mer_dict[gene] for gene in genes])
    features = np.vstack([features, zero_row])
    inputs = torch.tensor(features).to(device).float()

    # The encoder is frozen here, so skip building an autograd graph.
    with torch.no_grad():
        encoded = encoder(inputs)
    codes = encoded.cpu().numpy().T  # transpose: genes move to the last axis

    lowest = np.min(codes)
    highest = np.max(codes)
    if highest != lowest:
        return (codes - lowest) / (highest - lowest)
    return codes


def _build_ae2_inputs(encoder, batch):
    """Turn one batch (key -> gene list) into a float32 tensor for model 2.

    The result stacks one normalised code matrix per key along the first axis.
    """
    samples = [_encode_and_normalize(encoder, genes) for genes in batch.values()]
    return torch.tensor(np.array(samples), dtype=torch.float32)


# Entry point of the cross-validation experiment. As written it runs a single
# hold-out split (train on files 2-10, test on file 1), not a full 10-fold loop.
def cross_validate():
    """Train model 2 on files 2-10 and report its reconstruction MSE on file 1.

    Model 1 (``Autoencoder``) is loaded from ``ae1_all_data_training.pth`` and
    used only as a frozen feature encoder. Model 2 (``Autoencoder2``) is
    trained from scratch, chunk by chunk, with Adam (lr=0.001) and MSE loss;
    it is not saved. Progress goes through ``log_print``. Any exception is
    recorded with its traceback through ``log_error`` and is not re-raised.
    """
    for fold in range(1):
        try:
            # Fixed split: files 2-10 for training, file 1 held out.
            train_files = [2, 3, 4, 5, 6, 7, 8, 9, 10]
            test_file = 1

            # Model 1: pretrained encoder. map_location lets a checkpoint
            # saved on another device load onto the one in use here.
            model = Autoencoder().to(device)
            model.load_state_dict(
                torch.load('ae1_all_data_training.pth', map_location=device)
            )
            model.eval()  # inference mode: disables dropout

            # Model 2 and its training setup.
            model2 = Autoencoder2().to(device)
            criterion = torch.nn.MSELoss()
            optimizer = torch.optim.Adam(model2.parameters(), lr=0.001)

            # Train on each chunk of up to 2000 keys as soon as it is encoded.
            for file_idx in train_files:
                log_print(f"Processing train file {file_idx}")
                for batch in load_and_update_del_twogenes(file_idx, batch_size=2000):
                    train_data = _build_ae2_inputs(model.encoder, batch)
                    train_autoencoder(train_data, model2, criterion, optimizer)

            # Evaluate on the held-out file, one chunk at a time.
            log_print(f"Processing test file {test_file}")
            fold_mse = []
            for batch in load_and_update_del_twogenes(test_file, batch_size=2000):
                test_data = _build_ae2_inputs(model.encoder, batch)
                # Training iterates items one by one; evaluation uses
                # mini-batches of 500.
                test_loader = DataLoader(test_data, batch_size=500)
                fold_mse.append(evaluate_mse(model2, test_loader))

            log_print(f"Fold MSE: {np.mean(fold_mse):.8f}")

        except Exception as e:
            # Best effort: record the failure with its traceback and carry on.
            error_message = f"Error in fold {fold+1}: {str(e)}\n{traceback.format_exc()}"
            log_error(error_message)

# Run the experiment defined by cross_validate(); it reports progress via
# log_print and records failures via log_error.
cross_validate()

Processing train file 2


Processing file 2:  48%|██████████▌           | 223/464 [18:27<19:23,  4.83s/it]

#### Train all data 

In [None]:
# train all data no batch in training dataloader - batch when read data - 6 epochs
import torch
import random
import pickle
import numpy as np
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import os
import traceback
import time

# All-zero 400-dim feature row; stacked under each 4303-gene feature matrix so
# that the gene count becomes 4304.
zero_row = np.zeros(shape=(1, 400))

# Define the model
class Autoencoder(torch.nn.Module):
    """Model 1: fully connected autoencoder over the 400-dim 2-mer features.

    The encoder maps 400 -> 256 -> 128 -> 3 with ReLU activations and dropout
    (p=0.35) after each hidden layer; the decoder mirrors it (3 -> 128 -> 256
    -> 400) without dropout. Layer order inside each ``Sequential`` determines
    the ``state_dict`` keys, so it must stay as is for saved weights to load.
    """

    def __init__(self):
        super().__init__()
        encoder_layers = [
            nn.Linear(400, 256),
            nn.ReLU(),
            nn.Dropout(0.35),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.35),
            nn.Linear(128, 3),
        ]
        decoder_layers = [
            nn.Linear(3, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 400),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of *x* (encode, then decode)."""
        return self.decoder(self.encoder(x))
    
class Autoencoder2(torch.nn.Module):
    """Model 2: fully connected autoencoder over the padded gene axis.

    The encoder maps 4304 -> 3000 -> 1000 -> 400 with ReLU activations and
    dropout (p=0.2, then p=0.3) after the hidden layers; the decoder mirrors
    it (400 -> 1000 -> 3000 -> 4304) without dropout.
    """

    def __init__(self):
        super().__init__()
        encoder_layers = [
            nn.Linear(4304, 3000),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(3000, 1000),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1000, 400),
        ]
        decoder_layers = [
            nn.Linear(400, 1000),
            nn.ReLU(),
            nn.Linear(1000, 3000),
            nn.ReLU(),
            nn.Linear(3000, 4304),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of *x* (encode, then decode)."""
        return self.decoder(self.encoder(x))

# Run on the first CUDA GPU when one is available; otherwise use the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# Progress and error logs are appended to these files in the working directory.
log_file, error_log_file = (
    os.path.join(os.getcwd(), log_name)
    for log_name in ("ae2_all_data_training_log.txt", "ae2_all_data_error_log.txt")
)

# Function to load and update del_twogenes files in batches
def load_and_update_del_twogenes(file_idx, batch_size):
    """Load ``del_twogenes_whole_random_<file_idx>.pkl`` and batch its entries.

    Args:
        file_idx: Index interpolated into the pickle file name.
        batch_size: Maximum number of keys per returned batch.

    Returns:
        list[dict]: One dict per batch, in the key order of the pickled
        mapping. Every value is replaced by ``random.sample(value, 4303)``,
        i.e. 4303 of its elements in random order.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(f'del_twogenes_whole_random_{file_idx}.pkl', 'rb') as handle:
        gene_lists = pickle.load(handle)

    ordered_keys = list(gene_lists.keys())
    key_chunks = [
        ordered_keys[start:start + batch_size]
        for start in range(0, len(ordered_keys), batch_size)
    ]

    shuffled_batches = []
    for chunk in tqdm(key_chunks, desc=f"Processing file {file_idx}"):
        # random.sample returns a new list; random.shuffle would return None.
        shuffled = {}
        for key in chunk:
            shuffled[key] = random.sample(gene_lists[key], 4303)
        shuffled_batches.append(shuffled)

    return shuffled_batches

# Redirect print to a file
def log_print(message):
    """Append *message* plus a newline to ``log_file``, then print it."""
    with open(log_file, "a") as handle:
        line = message + "\n"
        handle.write(line)
    print(message)

# Log error messages to a file
def log_error(message):
    """Append *message* plus a newline to ``error_log_file`` and echo it."""
    with open(error_log_file, "a") as handle:
        line = message + "\n"
        handle.write(line)
    # Echo to stdout so it is visible that an error was recorded.
    print(f"Error logged: {message}")

# Training function
def train_autoencoder(train_data, model, criterion, optimizer, num_epochs=6):
    """Train *model* to reconstruct the items of *train_data*.

    *train_data* is iterated directly (no DataLoader), so every optimizer step
    uses a single item along its first axis. Each call appends a CSV header
    and one ``epoch,mean loss`` line per epoch to ``log_file``, and also
    reports each epoch through ``log_print``.

    Args:
        train_data: Sized iterable of tensors; each item is moved to the
            module-level ``device`` before the forward pass.
        model: Network to optimise; switched to training mode.
        criterion: Loss called as ``criterion(outputs, inputs)``.
        optimizer: Optimizer stepped once per item.
        num_epochs: Number of passes over *train_data*.
    """
    model.train()

    # A header row is written on every call, not once per log file.
    with open(log_file, "a") as log_handle:
        log_handle.write("Epoch,Train Loss MSE\n")

    for epoch_number in range(1, num_epochs + 1):
        running_loss = 0.0
        progress = tqdm(train_data, desc=f"Epoch {epoch_number}/{num_epochs}", leave=False)
        for sample in progress:
            batch = sample.to(device)

            optimizer.zero_grad()
            loss = criterion(model(batch), batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        mean_loss = running_loss / len(train_data)

        # Machine-readable CSV line first, then the human-readable message.
        with open(log_file, "a") as log_handle:
            log_handle.write(f"{epoch_number},{mean_loss}\n")

        log_print(f"Epoch {epoch_number}/{num_epochs}, Loss: {mean_loss:.8f}")

# Evaluation function (MSE computation, since we're not using labels in unsupervised learning)
def evaluate_mse(model, test_loader):
    """Return the mean squared reconstruction error of *model* on *test_loader*.

    Each batch's MSE is weighted by its number of elements, so a shorter final
    batch no longer counts as much as a full one; with equally sized batches
    the result equals the plain mean of the per-batch MSEs.

    Args:
        model: Autoencoder-style module; it is switched to eval mode and left
            in that mode.
        test_loader: Iterable yielding input tensors (no labels).

    Returns:
        float: Mean squared error over every element seen.

    Raises:
        ValueError: If *test_loader* yields no elements.
    """
    model.eval()
    criterion = nn.MSELoss()

    # Run each batch on the device that holds the model's weights instead of
    # relying on a module-level device; a parameter-free model is fed as is.
    first_param = next(model.parameters(), None)

    weighted_error = 0.0
    element_count = 0
    with torch.no_grad():
        for inputs in test_loader:
            if first_param is not None:
                inputs = inputs.to(first_param.device)
            outputs = model(inputs)

            # Compare the reconstruction with the original input.
            n_elements = inputs.numel()
            weighted_error += criterion(outputs, inputs).item() * n_elements
            element_count += n_elements

    if element_count == 0:
        raise ValueError("evaluate_mse received no data: test_loader is empty")
    return weighted_error / element_count

def _encode_and_normalize(encoder, genes):
    """Encode one gene list with *encoder* and min-max scale the result.

    Looks up each gene's 2-mer vector in the module-level ``two_mer_dict``,
    appends ``zero_row`` as a padding row, runs the stack through *encoder* on
    ``device`` and returns the transposed codes as a NumPy array (one row per
    latent dimension, one column per gene plus the padding column).

    The array is rescaled to [0, 1] with its global minimum and maximum. A
    constant array (which includes the all-zero case) is returned unchanged so
    that no division by zero occurs.
    """
    features = np.array([two_mer_dict[gene] for gene in genes])
    features = np.vstack([features, zero_row])
    inputs = torch.tensor(features).to(device).float()

    # The encoder is frozen here, so skip building an autograd graph.
    with torch.no_grad():
        encoded = encoder(inputs)
    codes = encoded.cpu().numpy().T  # transpose: genes move to the last axis

    lowest = np.min(codes)
    highest = np.max(codes)
    if highest != lowest:
        return (codes - lowest) / (highest - lowest)
    return codes


def _build_ae2_inputs(encoder, batch):
    """Turn one batch (key -> gene list) into a float32 tensor for model 2.

    The result stacks one normalised code matrix per key along the first axis.
    """
    samples = [_encode_and_normalize(encoder, genes) for genes in batch.values()]
    return torch.tensor(np.array(samples), dtype=torch.float32)


def train_all():
    """Train model 2 on data files 1-10 and save its weights.

    Model 1 (``Autoencoder``) is loaded from ``ae1_all_data_training.pth`` and
    used only as a frozen feature encoder. Model 2 (``Autoencoder2``) is
    trained from scratch, chunk by chunk, with Adam (lr=0.001) and MSE loss,
    and its ``state_dict`` is written to ``ae2_all_data_training.pth`` once
    every file has been processed. Start and end timestamps go through
    ``log_print``. Any exception is recorded with its traceback through
    ``log_error`` and is not re-raised; in that case no weights are saved.
    """
    all_indices = list(range(1, 11))  # data files 1-10

    for fold in range(1):  # single pass; `fold` only labels error messages
        try:
            train_files = all_indices

            # Model 1: pretrained encoder. map_location lets a checkpoint
            # saved on another device load onto the one in use here.
            model = Autoencoder().to(device)
            model.load_state_dict(
                torch.load('ae1_all_data_training.pth', map_location=device)
            )
            model.eval()  # inference mode: disables dropout

            # Model 2 and its training setup.
            model2 = Autoencoder2().to(device)
            criterion = torch.nn.MSELoss()
            optimizer = torch.optim.Adam(model2.parameters(), lr=0.001)

            log_print(f"start time is {time.time()}")

            # Train on each chunk of up to 2000 keys as soon as it is encoded.
            for file_idx in train_files:
                log_print(f"Processing train file {file_idx}")
                for batch in load_and_update_del_twogenes(file_idx, batch_size=2000):
                    train_data = _build_ae2_inputs(model.encoder, batch)
                    train_autoencoder(train_data, model2, criterion, optimizer)

            # Persist model 2 only after all files are done.
            torch.save(model2.state_dict(), 'ae2_all_data_training.pth')
            log_print(f"end time is {time.time()}")

        except Exception as e:
            # Best effort: record the failure with its traceback and carry on.
            error_message = f"Error in fold {fold+1}: {str(e)}\n{traceback.format_exc()}"
            log_error(error_message)

# Run the full-data training defined by train_all(); it reports progress via
# log_print and records failures via log_error.
train_all()

start time is 1745039603.1843216
Processing train file 1


111 hours to train all data.