In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm, trange
from sentence_transformers import SentenceTransformer, util

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [5]:
# Root directory that the data-loading cell reads its CSV files from.
file_path = "/data-new/yangzihao/DDI/"
# Alternative root (disabled):
# file_path= '/kaggle/input/ddidatasets'
feature_list = ["smile", "target", "enzyme"]

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Experiment hyper-parameters, gathered in one place.
cfg = dict(
    model_name="microsoft/deberta-v3-base",
    use_amp=False,  # kept off: it caused gradient underflow -> NaN
    epo_num=3,
    lr=5e-5,
    patience=400,
    batch_size=8,
    max_len=256,
)

cuda


In [8]:
# Locations of the two input tables under the `fusion_data` folder.
# (Disabled alternative layout: the same two files directly under `file_path`.)
events_csv = f'{file_path}/fusion_data/events.csv'
drug_csv = f'{file_path}/fusion_data/drug.csv'

# In both files the first CSV column is used as the row index.
events = pd.read_csv(events_csv, index_col=0)
df_drug = pd.read_csv(drug_csv, index_col=0)

events.shape, df_drug.shape

((323539, 4), (1258, 5))

In [9]:
# Disabled variant: fold the action into the mechanism string before encoding.
# events.mechanism = events.mechanism + " " + events.action

# Number of events per mechanism.
counts = events.mechanism.value_counts()

# Order the events by mechanism and encode each mechanism string as an
# integer class id; the raw text columns are not needed afterwards.
sorted_ev = events.sort_values(by='mechanism')
sorted_ev['label'] = LabelEncoder().fit_transform(sorted_ev.mechanism)
sorted_ev = sorted_ev.drop(['action', 'mechanism'], axis=1)

# Index the drug table by drug name and discard the `id` column.
# This cell overwrites `df_drug`, so both steps are guarded: on a second run
# `id` is already gone and `name` is already the index, and the unguarded
# version would raise KeyError.
df_drug = df_drug.drop(columns=['id'], errors='ignore')
if 'name' in df_drug.columns:
    df_drug = df_drug.set_index('name')

In [21]:
from sklearn.utils import resample

# Count how many events each class has.
label_counts = sorted_ev["label"].value_counts()

# Find the class with the fewest events, and how many it has.
min_label, min_count = label_counts.idxmin(), label_counts.min()

# Draw `min_count` events (without replacement) from every class.
balanced_parts = []
for class_id in sorted_ev["label"].unique():
    class_rows = sorted_ev[sorted_ev["label"] == class_id]
    balanced_parts.append(
        resample(class_rows, n_samples=min_count, replace=False)
    )
samples_data = pd.concat(balanced_parts)

samples_data

25 51


Unnamed: 0,drugA,drugB,label
115141,Levodopa,Triflupromazine,0
263219,Levodopa,Umeclidinium,0
61854,Palbociclib,Omeprazole,0
61840,Ferroussulfateanhydrous,Omeprazole,0
132988,Palbociclib,Nizatidine,0
...,...,...,...
121082,Ergometrine,Nortriptyline,90
21435,Midodrine,Trimipramine,90
264240,Clomipramine,Droxidopa,90
76793,Imipramine,Phenylephrine,90


In [5]:
class DDIDataset(Dataset):
    """Drug pairs rendered as text prompts, together with their label.

    Each item is ``(drug_a_text, drug_b_text, label)``. A drug's text has the
    form ``"The drug <name>'s information is: <v1>,<v2>,...."`` where the
    values are the columns of that drug's row in ``drug_df``.

    Args:
        ev_df: DataFrame whose rows unpack positionally into
            ``(drug_a_name, drug_b_name, label)``.
        drug_df: DataFrame indexed by drug name; all columns of the matching
            row are joined into the text.
        max_len: Stored on the instance; this class does not use it itself.
    """

    def __init__(self, ev_df, drug_df, max_len=256):
        self.events = ev_df
        self.drugs = drug_df
        self.max_len = max_len
        # self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of drug pairs (rows of the events frame)."""
        return self.events.shape[0]

    def _drug_info(self, name):
        """Join the feature values of drug ``name`` into one comma-separated string.

        Values are converted with ``str``. Missing values (NaN/None) become an
        empty field so the remaining fields keep their positions. A plain
        ``','.join(values)`` raises TypeError on any non-string value.

        Raises:
            KeyError: if ``name`` is not in the drug table's index.
        """
        values = self.drugs.loc[name].values
        return ",".join("" if pd.isna(v) else str(v) for v in values)

    def __getitem__(self, index):
        """Return ``(drug_a_text, drug_b_text, label)`` for the row at position ``index``."""
        # Positional unpacking: the events row must hold exactly three values.
        d_a, d_b, label = self.events.iloc[index]
        dA_seq = f"The drug {d_a}'s information is: {self._drug_info(d_a)}."
        dB_seq = f"The drug {d_b}'s information is: {self._drug_info(d_b)}."

        return dA_seq, dB_seq, label

In [6]:
# model_name = 'bert-base-uncased'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=100)

In [11]:
def get_sim_matrix(
    frac=0.1,
    batch_size=1024,
    encode_batch_size=512,
    model_name="all-MiniLM-L6-v2",
    max_len=256,
    random_state=None,
):
    """Build cosine-similarity features for a random subset of the events.

    A fraction of ``sorted_ev`` is sampled and wrapped in ``DDIDataset``.
    For every full batch, the drug-A and drug-B texts are embedded with a
    SentenceTransformer and the batch's cosine-similarity matrix is stored.

    Args:
        frac: Fraction of ``sorted_ev`` rows to sample.
        batch_size: Number of drug pairs per similarity matrix. The final
            partial batch is dropped so that every matrix has the same number
            of columns and the matrices can be concatenated.
        encode_batch_size: Batch size passed to ``model.encode``.
        model_name: SentenceTransformer checkpoint to load.
        max_len: Value assigned to the model's ``max_seq_length`` and
            forwarded to ``DDIDataset``.
        random_state: Seed forwarded to ``DataFrame.sample``; ``None`` keeps
            the draw non-deterministic.

    Returns:
        ``(features, labels)``: the per-batch similarity matrices concatenated
        along dim 0 (``batch_size`` columns each), and the matching labels
        concatenated in the same order.

    Raises:
        ValueError: if the sample is smaller than one full batch, since
            nothing would be left to concatenate.
    """
    experiment_data = sorted_ev.sample(
        int(sorted_ev.shape[0] * frac), random_state=random_state
    )
    print(f"All data size: {experiment_data.shape[0]}")

    if experiment_data.shape[0] < batch_size:
        raise ValueError(
            f"sampled {experiment_data.shape[0]} rows, fewer than one full batch "
            f"of {batch_size}; increase frac or lower batch_size"
        )

    total_dataset = DDIDataset(experiment_data, df_drug, max_len=max_len)
    total_loader = DataLoader(total_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

    model = SentenceTransformer(model_name, device=device)
    model.max_seq_length = max_len

    features = []
    labels = []

    for batch in tqdm(total_loader):
        # Compute sentence embeddings for the drug-A texts and the drug-B texts.
        embeddings1 = model.encode(batch[0], batch_size=encode_batch_size, show_progress_bar=False, device=device, convert_to_tensor=True)
        embeddings2 = model.encode(batch[1], batch_size=encode_batch_size, show_progress_bar=False, device=device, convert_to_tensor=True)

        # Compute cosine-similarities between the two sets of embeddings.
        # NOTE(review): this looks like an all-pairs matrix within the batch,
        # so a sample's feature row would depend on which other samples share
        # its batch — confirm this is intended.
        sim_mat = util.cos_sim(embeddings1, embeddings2)
        features.append(sim_mat)
        labels.append(batch[2])

    features = torch.cat(features)
    labels = torch.cat(labels)

    return features, labels

features, labels = get_sim_matrix()

All data size: 32353


100%|██████████| 31/31 [00:35<00:00,  1.14s/it]


In [15]:
# Sanity check: wrap the feature tensor in a shuffled loader and show the
# shape of the first mini-batch only.
loader = DataLoader(features, batch_size=8, shuffle=True)
b = next(iter(loader), None)
if b is not None:
    print(b.shape)

torch.Size([8, 1024])


In [19]:
# Preview the first rows of the encoded events and of the drug feature table.
display(sorted_ev.head(), df_drug.head())

Unnamed: 0,drugA,drugB,label
167576,Levodopa,Paroxetine,0
263221,Levodopa,Dosulepin,0
263220,Levodopa,Trimebutine,0
263219,Levodopa,Umeclidinium,0
263218,Levodopa,Aclidinium,0


Unnamed: 0_level_0,target,enzyme,smile
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bivalirudin,P00734,P05164,1|41|79|80|108|117|140|143|173|193|197|242|269...
Desmopressin,P30518|P37288|P47901,P23219|P35354,1|53|80|115|117|140|143|173|193|197|242|253|30...
Cyclosporine,P49069|Q96LZ3|P62937|P30405,P20815|P08684|P33261|P10635,1|5|19|38|47|80|101|115|126|132|186|208|219|22...
Cyanocobalamin,Q99707|P22033|Q9UBK8|Q8IVH4|Q9Y4U1|P42898,Q96EY8|Q05599,1|35|41|45|49|75|80|84|106|140|188|192|194|197...
Ademetionine,Q14749|P17707|P31153|P35520|Q00266|P21964|Q8N1...,P05181|P19623,1|75|80|194|209|348|362|378|454|457|489|577|61...
