### Define Model 

In [1]:
import os 
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
import scipy

import sklearn
from sklearn.metrics import auc, roc_curve, matthews_corrcoef,accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, hamming_loss, multilabel_confusion_matrix,confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
#from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler

import glob
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import json

import warnings
warnings.filterwarnings('ignore')
import configparser
from accelerate import Accelerator, DeepSpeedPlugin
from transformers import AdamW, set_seed
#from transformers import bioinfo_compute_metrics as compute_metrics

config = configparser.ConfigParser()


'''模型'''
class Simple_Head(nn.Module):
    """Feed-forward classification head.

    Layout: ``in_proj -> BatchNorm -> ReLU -> dropout``, followed by two
    ``Linear -> ReLU -> dropout`` stages and a final linear projection to
    ``num_labels`` logits.

    ``config`` must expose ``num_labels``, ``hidden_size`` (indexable, at
    least three widths), ``dropout`` and ``feature_size``.
    """

    def __init__(self, config):
        super().__init__()

        # Keep the raw settings around as attributes.
        self.num_labels = config.num_labels
        self.hidden_size = config.hidden_size
        self.cls_dropout_prob = config.dropout
        self.feature_size = config.feature_size

        width_0 = self.hidden_size[0]
        width_1 = self.hidden_size[1]
        width_2 = self.hidden_size[2]

        # Layers are created in a fixed order so seeded initialisation is stable.
        self.in_proj = nn.Linear(self.feature_size, width_0)
        self.hidden_1 = nn.Linear(width_0, width_1)
        self.hidden_2 = nn.Linear(width_1, width_2)
        self.out_proj = nn.Linear(width_2, self.num_labels)
        # Only the first hidden layer is batch-normalised.
        self.batchnorm0 = nn.BatchNorm1d(width_0)

        self.dropout = nn.Dropout(self.cls_dropout_prob)

    def forward(self, features, **kwargs):
        """Return the logits for ``features``.

        The input is flattened to ``(batch, -1)`` before the first linear
        layer, so the flattened width has to equal ``feature_size``.
        Extra keyword arguments are accepted and ignored.
        """
        flat = features.view(features.shape[0], -1)

        hidden = self.batchnorm0(self.in_proj(flat))
        hidden = self.dropout(F.relu(hidden))
        for layer in (self.hidden_1, self.hidden_2):
            hidden = self.dropout(F.relu(layer(hidden)))

        return self.out_proj(hidden)
    
class MLP_learner(nn.Module):
    """MLP classifier that wraps ``Simple_Head`` and a cross-entropy loss.

    ``config`` must expose ``dropout`` and ``num_labels`` plus whatever
    ``Simple_Head`` reads from it.
    """

    def __init__(self, config):
        super().__init__()

        self.model_type = 'MLP_learner'
        self.dropout = config.dropout
        self.num_labels = config.num_labels
        self.classifier = Simple_Head(config)
        # The loss *class* is stored; it is instantiated on each forward call.
        self.criterion = CrossEntropyLoss

    def _init_weights(self, module):
        """Initialize the weights of ``module``.

        Linear layers get N(0, 0.02) weights and zero bias; LayerNorm layers
        get unit weight and zero bias. This helper is not invoked by
        ``__init__`` in this class.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, feature=None,
                labels=None, weights=None):
        """Run the classifier and, when labels are given, compute the loss.

        Args:
            feature: input tensor passed to the classification head.
            labels: target class indices; when ``None`` no loss is computed,
                so the model can be used for label-free inference.
            weights: accepted for interface compatibility; currently unused.

        Returns:
            dict with ``'logits'`` and ``'loss'`` (``None`` without labels).
        """
        logits = self.classifier(feature)

        loss = None
        if labels is not None:
            loss_fct = self.criterion()
            loss = loss_fct(logits, labels)

        return {'logits': logits, 'loss': loss}

def simple_accuracy(preds, labels):
    """Return the fraction of positions where ``preds`` equals ``labels``.

    Both arguments are compared element-wise, so they are expected to be
    array-like objects whose comparison result supports ``.mean()``.
    """
    matches = preds == labels
    return matches.mean()

def metrics(preds, labels, probs):
    """Compute multi-class classification metrics.

    Args:
        preds: predicted class labels.
        labels: ground-truth class labels.
        probs: per-class scores handed to ``roc_auc_score`` (for multi-class
            targets scikit-learn expects one column per class).

    Returns:
        dict with accuracy, micro/macro F1, Hamming loss, micro/macro
        precision and recall, and micro/macro ROC AUC.
    """
    acc = simple_accuracy(preds, labels)

    precision_mi = precision_score(y_true=labels, y_pred=preds, average='micro')
    recall_mi = recall_score(y_true=labels, y_pred=preds, average='micro')
    precision_ma = precision_score(y_true=labels, y_pred=preds, average='macro')
    recall_ma = recall_score(y_true=labels, y_pred=preds, average='macro')

    f1_macro = f1_score(y_true=labels, y_pred=preds, average='macro')
    f1_micro = f1_score(y_true=labels, y_pred=preds, average='micro')

    hamming = hamming_loss(y_true=labels, y_pred=preds)

    # The micro average used to be computed with average='macro', which made
    # it a duplicate of auc_macro. For multi-class targets scikit-learn only
    # implements average='micro' in one-vs-rest mode (scikit-learn >= 1.2).
    auc_micro = roc_auc_score(y_true=labels, y_score=probs, average='micro', multi_class='ovr')
    auc_macro = roc_auc_score(y_true=labels, y_score=probs, average='macro', multi_class='ovo')

    # AUPR, per-class AUC and the confusion matrix are intentionally not reported.
    return {
        "acc": acc,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "hamming_loss": hamming,
        "precision_mi": precision_mi,
        "recall_mi": recall_mi,
        "precision_ma": precision_ma,
        "recall_ma": recall_ma,
        "auc_micro": auc_micro,
        "auc_macro": auc_macro,
    }

def binar_metrics(preds, labels, probs):
    """Compute binary classification metrics.

    Args:
        preds: predicted class labels.
        labels: ground-truth class labels.
        probs: scores used for the ranking metrics (ROC AUC and
            average precision).

    Returns:
        dict with keys ``acc``, ``f1``, ``mcc``, ``auc``, ``aupr``,
        ``precision``, ``recall`` and ``cm`` (the confusion matrix).
    """
    # Threshold-based metrics use the hard predictions.
    accuracy = simple_accuracy(preds, labels)
    precision_val = precision_score(y_true=labels, y_pred=preds)
    recall_val = recall_score(y_true=labels, y_pred=preds)
    f1_val = f1_score(y_true=labels, y_pred=preds)
    mcc_val = matthews_corrcoef(y_true=labels, y_pred=preds)

    # Ranking metrics use the scores.
    auc_val = roc_auc_score(y_true=labels, y_score=probs)
    aupr_val = average_precision_score(y_true=labels, y_score=probs)

    conf_mat = confusion_matrix(y_true=labels, y_pred=preds)

    return dict(
        acc=accuracy,
        f1=f1_val,
        mcc=mcc_val,
        auc=auc_val,
        aupr=aupr_val,
        precision=precision_val,
        recall=recall_val,
        cm=conf_mat,
    )

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

def train(model, train_dataloader, optimizer, accelerator):
    """Train ``model`` for one pass over ``train_dataloader``.

    Args:
        model: callable as ``model(feature, labels)`` returning a dict
            with a ``'loss'`` tensor.
        train_dataloader: iterable of batches where ``batch[0]`` is the
            feature tensor and ``batch[1]`` the labels; must support ``len``.
        optimizer: optimizer stepped once per batch.
        accelerator: object whose ``backward(loss)`` runs the backward pass.

    Returns:
        Mean of the per-batch losses.
    """
    model.train()
    epoch_loss = 0

    for batch in train_dataloader:
        feature, labels = batch[0], batch[1]

        # Clear gradients from the previous step; without this they keep
        # accumulating and every update uses the sum of all earlier batches.
        optimizer.zero_grad()

        outputs = model(feature, labels)
        loss = outputs['loss']
        epoch_loss += loss.item()

        accelerator.backward(loss)
        optimizer.step()

    return epoch_loss/len(train_dataloader)


def evaluate(model, eval_dataloader):
    """Evaluate ``model`` on ``eval_dataloader`` and compute metrics.

    The metric set is selected by the module-level ``config.output_mode``:
    ``"boolean_cls"`` uses ``binar_metrics`` with the softmax probability of
    class 1, ``"multi_cls"`` uses ``metrics`` with the full softmax matrix.

    Args:
        model: callable as ``model(feature, labels)`` returning a dict with
            ``'logits'`` and ``'loss'``.
        eval_dataloader: iterable of batches where ``batch[0]`` is the
            feature tensor and ``batch[1]`` the labels; must support ``len``.

    Returns:
        ``(eval_loss, preds, results, reference, probs)``: mean batch loss,
        argmax predictions, metrics dict, true labels and probabilities.

    Raises:
        ValueError: if ``config.output_mode`` is not a supported mode
            (previously this surfaced as an ``UnboundLocalError`` after the
            whole evaluation pass had already run).
    """
    output_mode = config.output_mode
    if output_mode not in ("boolean_cls", "multi_cls"):
        raise ValueError(f"Unsupported output_mode: {output_mode!r}")

    model.eval()
    epoch_loss = 0
    # Collect per-batch arrays and concatenate once; growing an array with
    # np.append inside the loop copies everything on every batch.
    logit_chunks = []
    label_chunks = []

    for batch in eval_dataloader:
        feature, labels = batch[0], batch[1]
        with torch.no_grad():
            outputs = model(feature, labels)

        logit_chunks.append(outputs['logits'].detach().cpu().numpy())
        label_chunks.append(labels.detach().cpu().numpy())
        epoch_loss += outputs['loss'].item()

    eval_loss = epoch_loss/len(eval_dataloader)

    logits = np.concatenate(logit_chunks, axis=0)
    reference = np.concatenate(label_chunks, axis=0)

    softmax = torch.nn.Softmax(dim=1)
    class_probs = softmax(torch.tensor(logits, dtype=torch.float32))
    preds = np.argmax(logits, axis=1)

    if output_mode == "boolean_cls":
        # Binary metrics only need the probability of the positive class.
        probs = class_probs[:, 1].numpy()
        results = binar_metrics(preds, reference, probs)
    else:
        probs = class_probs.numpy()
        results = metrics(preds, reference, probs)

    return eval_loss, preds, results, reference, probs


def run_MLP(X_train, X_test, y_train, y_test, filename):
    """Train the MLP on a train/test split, report metrics and plot the ROC curve.

    The model is trained for ``config.num_train_epochs`` epochs with a
    class-balanced sampler; the checkpoint with the lowest evaluation loss is
    saved to ``{filename}.pt`` and reloaded for the final evaluation.

    Side effects: mutates the module-level ``config``, writes
    ``{filename}.pt``, appends a row to ``one_etoh_cross.csv``, saves and
    shows a ROC figure, and prints timing, the confusion matrix and a
    classification report.

    Args:
        X_train, X_test: feature matrices (converted with ``torch.Tensor``).
        y_train, y_test: integer class labels (converted with
            ``torch.LongTensor``); labels index ``class_weights`` directly,
            so they are presumably 0..K-1 -- verify against the caller.
        filename: tag used for the checkpoint name, CSV row and plot title.

    Returns:
        ROC AUC of the best checkpoint on the test split.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having imported ``csv`` first.
    import csv

    LABELS = list(set(y_train))
    X_train, X_test = torch.Tensor(X_train), torch.Tensor(X_test)
    y_train, y_test = torch.LongTensor(y_train), torch.LongTensor(y_test)

    # Hyper-parameters live on the module-level ``config`` object because
    # evaluate() reads ``config.output_mode`` from there.
    config.num_labels = len(LABELS)
    config.feature_size = X_train.shape[1]
    config.hidden_size = [128, 128, 32]
    config.dropout = 0.1
    config.BS = 256
    config.learning_rate = 1e-4
    config.weight_decay = 1e-2
    config.num_train_epochs = 100
    config.print_every_epoch = 10

    config.output_mode = "boolean_cls"

    # Dataset
    train_set = TensorDataset(X_train, y_train)
    val_set = TensorDataset(X_test, y_test)

    # Weighted sampling: each sample is drawn with probability inversely
    # proportional to the size of its class.
    class_count = pd.DataFrame(y_train).value_counts().sort_index().to_list()
    class_weights = 1./torch.tensor(class_count, dtype=torch.float)
    if config.feature_size > 2000:
        class_weights *= 1e4
    print("class_weights:", class_weights)

    weighted_sampler = WeightedRandomSampler(
        weights=class_weights[y_train],
        num_samples=X_train.shape[0],
        replacement=True
    )

    # Dataloader
    train_dataloader = DataLoader(train_set, batch_size=config.BS, sampler=weighted_sampler)
    eval_dataloader = DataLoader(val_set, batch_size=config.BS)

    # Model and optimizer; biases and LayerNorm weights are excluded from
    # weight decay.
    model = MLP_learner(config)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": config.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)

    accelerator = Accelerator()

    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Train, keeping the checkpoint with the lowest evaluation loss.
    # Per-epoch logging is disabled.
    best_valid_loss = float('inf')
    for epoch in range(config.num_train_epochs):
        train(model, train_dataloader, optimizer, accelerator)
        eval_loss, _, _, _, _ = evaluate(model, eval_dataloader)

        if eval_loss < best_valid_loss:
            best_valid_loss = eval_loss
            torch.save(model.state_dict(), f'{filename}.pt')

    # Final evaluation with the best checkpoint.
    model.load_state_dict(torch.load(f'{filename}.pt'))

    eval_start = time.time()
    model.eval()
    eval_loss, y_pred, results, y_reference, y_probs = evaluate(model, eval_dataloader)
    eval_end = time.time()
    print(eval_end-eval_start, 'time/s')

    auc = results['auc']
    rec = results['recall']
    prec = results['precision']
    cm = results['cm']

    with open('one_etoh_cross.csv', mode='a+', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow([filename, filename, auc, rec, prec])
    print(cm)

    # classification_report expects (y_true, y_pred); the arguments used to be
    # swapped, which exchanges precision and recall in the printed report.
    print(sklearn.metrics.classification_report(y_reference, y_pred))

    fpr, tpr, thresholds = roc_curve(y_reference, y_probs, pos_label=1)

    figure_name = 'cross-species-etoh:'+ filename +'--->'+filename
    plt.figure()
    plt.plot(fpr, tpr, label=f"AUC: {auc:.4f}")
    plt.plot([0,1], [0,1], "k--")
    plt.title(figure_name)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.legend()
    plt.savefig(figure_name, dpi=1200)
    plt.show()

    return auc

  from .autonotebook import tqdm as notebook_tqdm


### Create Dataset 

In [None]:
# build single gene name to feature dic
import pickle
import csv

# Load the two per-gene feature dictionaries.
# NOTE: pickle executes arbitrary code on load -- only use trusted files.
features_path = '/data1/xpgeng/P1/raw_data/3_model_features_10/'+ 'iML1515' + '_10_features.pkl'
with open(features_path, 'rb') as f:
    dicA = pickle.load(f)
print(len(dicA))

embedding_path = '/data1/xpgeng/P1/raw_data/4_model_features_bioembedding/'+ 'iML1515' + '_protein_embedding.pkl'
with open(embedding_path, 'rb') as f:
    dicB = pickle.load(f)
print(len(dicB))

# Keep only genes present in both dictionaries and combine their values
# with ``+`` (list concatenation if the values are lists -- TODO confirm
# the value type, since ``+`` on arrays would add element-wise instead).
dic = {key: dicA[key] + dicB[key] for key in dicA.keys() & dicB.keys()}

# Number of genes that have both kinds of features
print(len(dic))

In [None]:
# Gene ids for which features are available.
g1513 = list(dic)
print(len(g1513))

# Gene ids defined in the metabolic model.
import cobra
model = cobra.io.read_sbml_model('/data1/xpgeng/P1/model/'+ 'iML1515.xml')
gene = [g.id for g in model.genes]
print(len(gene))

# Model genes that have no feature entry.
set(gene).difference(g1513)
# {'b2092', 'b4104', 's0001'}

In [None]:
# Read the two-gene knockout results, keeping columns 1-2 of every row that
# does not contain a 'fail' cell, keyed by the first column.
etoh = {}
# newline='' is what the csv module documents for files passed to
# csv.reader / csv.writer, so embedded newlines and line endings are handled
# by the csv module itself.
with open('/data1/xpgeng/P1/raw_data/6_two_gene_ko/'+ 'iML1515' + '.xml_two_gene_ko', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the first row (presumably a header); tolerate an empty file
    for row in reader:
        if 'fail' not in row:
            etoh[row[0]] = row[1:3]

# Write the kept values, one row per entry (the key itself is not written).
with open('iML1515'+'_two_etoh_dataset.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(etoh.values())

Here we obtain all gene pairs and their features, label each pair 0 or 1, and feed them to the MLP. \
Note that I first wrote all the features to a single file of about 29 GB, so it is not uploaded here. \
The next cell uses this approach; if you want to reproduce the project, I suggest choosing another way. \
There are about 1M entries, so loading them all into memory at once will probably overload your server. \


### Read data and Run MLP

In [None]:
# The dataset file is about 29 GB, so loading it takes a long time.
data = pd.read_csv('/data1/xpgeng/8_two_gene_ko_dataset/iML1515' + '_two_etoh_dataset.csv', header=None)
p = data.iloc[:, 2068]  # target labels

# Count the positive samples (label == 1).
n = int((p == 1).sum())
print(n)

In [None]:
# in this way perform prediction and splict the data into 5 parts and perform 5-fold validation
%%time
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

#data = ... # load your data here

X = data.iloc[:, 0:2068].values # input features
y = data.iloc[:, 2068].values # target labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # split data into training and test sets


run_MLP(X_train, X_test, y_train, y_test, 'iML1515')