# Results

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import sqlite3
from tqdm import tqdm

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
)
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# --- Reproducibility -------------------------------------------------------
# A single seed is pushed into every RNG this notebook imports.
seed = 42
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
# does not change hashing in the already-running kernel.
os.environ["PYTHONHASHSEED"] = f"{seed}"
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# --- pandas display: 4 decimals, at most 20 rows ---------------------------
pd.options.display.precision = 4
pd.options.display.float_format = "{:.4f}".format
pd.options.display.max_rows = 20

# --- Compute device: GPU when available, CPU otherwise ---------------------
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# --- Data location ---------------------------------------------------------
data_path = "./data"
# data_path = "/kaggle/input/ddidatasets"

Small dataset: loaded from the SQLite file `event.db` (this cell is currently commented out).

In [None]:
# Disabled cell, kept for reference only. When enabled it reads the `drug` and
# `extraction` tables from the SQLite file `event.db` and builds a lower-cased
# `label_text` column as `mechanism + " " + action`.
# NOTE(review): it defines `df_drug` / `extraction`, while the rest of the
# notebook reads `df_drugs` / `events`, so it is not a drop-in replacement for
# the CSV loading cell.
# conn = sqlite3.connect(f"{data_path}/event.db")
# df_drug = pd.read_sql("select * from drug;", conn)
# extraction = pd.read_sql("select * from extraction;", conn)
# mechanism = extraction["mechanism"]
# action = extraction["action"]
# drugA = extraction["drugA"]
# drugB = extraction["drugB"]

# extraction["label_text"] = extraction.mechanism + " " + extraction.action

# # extraction["label_text"] = LabelEncoder().fit_transform(extraction.mechanism)
# extraction["label_text"] = extraction["label_text"].apply(str.lower)
# extraction = extraction.drop(['index'], axis=1)
# df_drug = df_drug.drop(['id', 'index'], axis=1)
# df_drug = df_drug.set_index('name')

# ## check number of classes
# # extraction['label_text'].value_counts()
# ## check number of drugs
# # df_drug.index

# display(df_drug.head(2))
# display(extraction.head(2))

Big dataset: loaded from `events.csv` and `drugs.csv`; this is the data used in the rest of the notebook.

In [None]:
# Read the event table and the drug table from `data_path`; in both files the
# first CSV column becomes the DataFrame index.
events, df_drugs = (
    pd.read_csv(f"{data_path}/{csv_name}", index_col=0)
    for csv_name in ("events.csv", "drugs.csv")
)

# Preview the first two rows of each table.
display(events.head(2), df_drugs.head(2))

## Config

In [None]:
def evaluate_metrics(pred_probs, labels):
    """Compute classification metrics from class probabilities and true labels.

    Args:
        pred_probs: either one ``(n_samples, n_classes)`` array of class
            probabilities, or a sequence of such arrays (e.g. one per batch or
            fold) that is stacked along axis 0.
        labels: either one ``(n_samples,)`` array of class ids, or a sequence
            of such arrays, aligned with ``pred_probs``. Class ids are assumed
            to be integers in ``[0, n_classes)`` because they are compared with
            ``argmax`` column indices.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``
        (precision/recall/F1 are micro-averaged), ``auc_score`` (macro-averaged
        one-vs-rest ROC AUC) and ``aupr_score`` (micro-averaged average
        precision).
    """
    # Accept both "sequence of chunks" and "already concatenated" inputs.
    # np.concatenate on a single 2-D array flattens it row by row, and on a
    # single 1-D array it raises, so only real sequences are concatenated.
    if not (isinstance(pred_probs, np.ndarray) and pred_probs.ndim == 2):
        pred_probs = np.concatenate(pred_probs, axis=0)
    if not (isinstance(labels, np.ndarray) and labels.ndim == 1):
        labels = np.concatenate(labels, axis=0)
    print(pred_probs.shape, labels.shape)

    # Predicted class = column with the highest probability.
    predicted_labels = pred_probs.argmax(axis=1)

    # Accuracy.
    accuracy = accuracy_score(labels, predicted_labels)

    # Precision, recall and F1 score (micro-averaged).
    precision = precision_score(labels, predicted_labels, average="micro")
    recall = recall_score(labels, predicted_labels, average="micro")
    f1 = f1_score(labels, predicted_labels, average="micro")

    # ROC AUC (macro, one-vs-rest).
    auc_score = roc_auc_score(
        labels, pred_probs, average="macro", multi_class="ovr"
    )

    # AUPR (micro). Older scikit-learn releases reject integer multiclass
    # targets in average_precision_score, so feed it a one-hot indicator matrix
    # with one column per probability column instead.
    one_hot = np.zeros(pred_probs.shape, dtype=int)
    one_hot[np.arange(labels.shape[0]), labels.astype(int)] = 1
    aupr_score = average_precision_score(one_hot, pred_probs, average="micro")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc_score": auc_score,
        "aupr_score": aupr_score,
    }

In [None]:
# model_path = './original/models/'
model_path = '/kaggle/input/ddi-lm-models/original/models/'

# Every sub-directory of `model_path` is treated as one saved model.
# os.listdir returns entries in arbitrary order, so the list is sorted:
# otherwise `models[0]` (the checkpoint evaluated below) could differ between
# runs or machines.
models = sorted(
    os.path.join(model_path, entry)
    for entry in os.listdir(model_path)
    if os.path.isdir(os.path.join(model_path, entry))
)
if not models:
    # Fail with a clear message instead of an IndexError on models[0].
    raise FileNotFoundError(f"no model directories found under {model_path!r}")

current_model = models[0]
tokenizer = AutoTokenizer.from_pretrained(current_model)
model = AutoModelForSequenceClassification.from_pretrained(current_model)
print(f'model: {current_model} Loaded!')

In [None]:
class DDI_Dataset(Dataset):
    """Torch dataset yielding one tokenized prompt per drug-drug event row.

    The event table is read by position: columns 2 and 3 hold the two drug
    names and the last column holds the label. Each drug name is looked up in
    the drug table's index to fetch the fields that go into the prompt.
    """

    def __init__(self, ev_df, drug_df, tokenizer, max_len=256):
        """Keep references to the tables, the tokenizer and the sequence length.

        Args:
            ev_df: event table, one interaction per row.
            drug_df: drug table indexed by drug name; its fields are
                comma-joined into the prompt, so they must all be strings.
            tokenizer: object exposing ``encode_plus``.
            max_len: length every encoded sequence is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.events = ev_df
        self.drugs = drug_df

    def __len__(self):
        """Return the number of event rows."""
        return len(self.events)

    def _joined_fields(self, drug_name):
        """Return all drug-table fields of ``drug_name`` joined by commas."""
        return ','.join(self.drugs.loc[drug_name].values)

    def __getitem__(self, index):
        """Build and tokenize the prompt for the event at position ``index``.

        Returns:
            dict with ``"ids"`` (token ids), ``"masks"`` (attention mask), both
            with the batch axis squeezed out, and ``"labels"`` (the value of
            the last event column).
        """
        pair_row = self.events.iloc[index, [2, 3, -1]]
        d_a, d_b, target = pair_row

        # Earlier input formats (plain comma-joined fields; a "chemical form"
        # sentence built from the `target` field with the tokenizer's separator
        # between the two drugs) were replaced by the prompt below.
        # The wording, including the run of spaces left by the line
        # continuations, is kept exactly as it is.
        text = f"The drug {d_a} interacts with the drug {d_b}. \
            The drug {d_a}'s information is: {self._joined_fields(d_a)}. \
            The drug {d_b}'s information is: {self._joined_fields(d_b)}."

        encode_options = {
            "add_special_tokens": True,
            "max_length": self.max_len,
            "truncation": True,
            "padding": "max_length",
            "return_attention_mask": True,
            "return_tensors": "pt",
        }
        encoded = self.tokenizer.encode_plus(text=text, **encode_options)

        # squeeze(0) drops the leading axis of the returned tensors.
        return {
            "ids": encoded["input_ids"].squeeze(0),
            "masks": encoded["attention_mask"].squeeze(0),
            "labels": target,
        }

In [None]:
def predict(model, loader):
    """Run inference over ``loader`` and collect probabilities and labels.

    Args:
        model: sequence-classification model whose forward pass accepts the
            ``input_ids`` / ``attention_mask`` keywords and returns an object
            with a ``.logits`` attribute.
        loader: sized iterable of batches, each a dict with tensor values under
            ``"ids"``, ``"masks"`` and ``"labels"``.

    Returns:
        ``(total_probs, total_labels)``: numpy arrays holding the softmax
        probabilities (one row per sample) and the labels, concatenated over
        all batches in loader order.
    """
    total_probs = []
    total_labels = []
    model.to(device)
    model.eval()
    with torch.no_grad():
        for batch in tqdm(loader, total=len(loader), desc="Test"):
            ids = batch["ids"].to(device)
            masks = batch["masks"].to(device)

            # Pass by keyword: the second positional parameter of forward() is
            # not the attention mask for every architecture (GPT-2, for one,
            # has past_key_values in that slot).
            outputs = model(input_ids=ids, attention_mask=masks)

            probs = F.softmax(outputs.logits, dim=1).cpu().numpy()
            total_probs.append(probs)
            # Labels are only collected, never fed to the model, so they are
            # not moved to the device.
            total_labels.append(batch["labels"].cpu().numpy())

    total_probs = np.concatenate(total_probs, axis=0)
    total_labels = np.concatenate(total_labels, axis=0)
    print(f'predict: {total_probs.shape}, {total_labels.shape}')
    return total_probs, total_labels

In [None]:
def cross_val(events, df_drugs, tokenizer, model, n_splits=5, batch_size=128, max_len=256):
    """Evaluate a fixed model on the test split of each stratified fold.

    No training happens here: the same ``model`` is run on every fold's test
    rows. Predictions are accumulated across folds, so the metrics recorded
    after fold k cover the test rows of folds 1..k, and the last entry covers
    every row of ``events``.

    Args:
        events: event table; must contain a ``'label'`` column, which is used
            for stratification.
        df_drugs: drug table handed to ``DDI_Dataset``.
        tokenizer: tokenizer handed to ``DDI_Dataset``.
        model: model handed to ``predict``.
        n_splits: number of stratified folds.
        batch_size: batch size of the evaluation loader.
        max_len: token length handed to ``DDI_Dataset``.

    Returns:
        list with one metrics dict (as returned by ``evaluate_metrics``) per fold.
    """
    skf = StratifiedKFold(n_splits=n_splits)
    fold_pred_scores = []
    fold_labels = []
    cv_results = []

    # Only the test indices are used; the train indices are ignored.
    for _, test_index in skf.split(np.zeros(len(events)), events['label']):
        test_dataset = DDI_Dataset(events.iloc[test_index], df_drugs, tokenizer, max_len=max_len)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        pred_scores, labels = predict(model, test_loader)

        fold_pred_scores.append(pred_scores)
        fold_labels.append(labels)
        # Concatenate into separate names: rebinding the accumulator lists to
        # arrays would make .append() fail on the next fold.
        total_pred_scores = np.concatenate(fold_pred_scores, axis=0)
        total_labels = np.concatenate(fold_labels, axis=0)

        print(f'evaluate: {total_pred_scores.shape}, {total_labels.shape}')
        # evaluate_metrics concatenates its inputs itself, so hand the arrays
        # over wrapped in one-element lists.
        results = evaluate_metrics([total_pred_scores], [total_labels])
        cv_results.append(results)
        print("results: ", results)

    return cv_results

## Inference

In [None]:
# Evaluate the loaded model on the big dataset; the returned list of per-fold
# metric dicts is the cell output.
cross_val(events=events, df_drugs=df_drugs, tokenizer=tokenizer, model=model)