In [None]:
!pip install datasets transformers[sentencepiece]==4.28.0
!pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers[sentencepiece]==4.28.0)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[sentencepiece]==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (572 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m572.6/572.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7 (from shap)
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7


# Libraries & Functions

In [None]:
import pandas as pd 
import numpy as np
from tqdm import tqdm

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
import torch
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
from tokenizers import BertWordPieceTokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizerFast, DistilBertTokenizerFast,  TrainingArguments, Trainer, AutoModelForSequenceClassification
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_predict

In [None]:
# Set the global git identity for this runtime (presumably so that models can be
# pushed to the Hugging Face Hub; the push calls in the training cell are commented out).
# NOTE(review): the e-mail address and user name below are hard-coded personal
# details — consider reading them from environment variables before sharing.
!git config --global user.email "viktor.domazetoski@hotmail.com"
!git config --global user.name "ViktorDo1"

from huggingface_hub import notebook_login
# Shows the interactive Hugging Face login widget; the token is typed in at run time.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Each pair is (display name, Hugging Face checkpoint id); the pairs are
# unzipped into the two parallel lists that the loops below iterate with zip().
model_names, checkpoint_names = map(list, zip(
    ("BERT", "bert-base-cased"),
    ("BioBERT", "dmis-lab/biobert-base-cased-v1.2"),
))

In [None]:
def prepare_data(X, y):
  """Bundle texts and labels into a `datasets.Dataset`.

  X and y are walked in lockstep (iteration stops at the shorter of the two).
  Every pair becomes one row with three columns:
    - "text":  the element taken from X
    - "label": the element taken from y
    - "idx":   the 0-based position of the pair within this call

  Returns the Dataset built from a pandas DataFrame holding those rows.
  """
  records = [
      {"text": sequence, "label": label, "idx": position}
      for position, (sequence, label) in enumerate(zip(X, y))
  ]
  return Dataset.from_pandas(pd.DataFrame(records))

In [None]:
def tokenize_function(example):
    """Tokenize the "text" field of `example` with the module-level `tokenizer`.

    The tokenizer is read from the global name `tokenizer`, so it must be
    assigned before this function is called. Inputs are truncated to at most
    512 tokens. Whatever the tokenizer returns is passed straight back.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

In [None]:
from sklearn import metrics

def calculate_scores(y_test, y_pred, average = "binary", y_score = None):
    """Compute accuracy, precision, recall, F1 and ROC AUC for a classifier.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted (hard) labels.
    average : str, default "binary"
        Forwarded to `precision_score`, `recall_score` and `f1_score`.
    y_score : array-like, optional
        Continuous scores for the positive class (label 1), e.g. probabilities
        or decision values. When given, the ROC curve is built from these
        scores. When omitted, the curve is built from `y_pred`, which keeps the
        original behaviour: with hard labels the curve has a single operating
        point, so the value is not a threshold-free AUC.

    Returns
    -------
    list
        `[accuracy, precision, recall, f1, auc]`.
    """
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average = average)
    recall = recall_score(y_test, y_pred, average = average)
    f1 = f1_score(y_test, y_pred, average = average)

    # Prefer continuous scores for the ROC curve; fall back to the hard labels
    # so existing two-argument callers get exactly the same number as before.
    roc_input = y_pred if y_score is None else y_score
    fpr, tpr, _ = metrics.roc_curve(y_test, roc_input, pos_label=1)
    auc = metrics.auc(fpr, tpr)

    return [accuracy, precision, recall, f1, auc]

# Input Data

In [None]:
# Maps a dataset name to its combined (positives + negatives) DataFrame.
raw_datasets = {}

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the prepared CSV files can be read.
# The google.colab import means this cell is expected to run inside a Colab runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder holding the prepared CSV inputs. The path is relative, so it only
# resolves under the Drive mount when the working directory is /content
# (presumably the Colab default — TODO confirm).
colab_dir = "drive/MyDrive/Text_BioMacro/"

In [None]:
!ls "drive/MyDrive/Text_BioMacro/"

prepared_lpi_negatives.csv  prepared_predicts_negatives.csv
prepared_lpi_positives.csv  prepared_predicts_positives.csv


## Living Planet Index

In [None]:
dataset_name = "LPI"
columns = ["Abstract", "Journal", "Title"]

# Negatives get label 0 and positives get label 1.
LPI_negatives = pd.read_csv(colab_dir + "prepared_lpi_negatives.csv", usecols=columns)
LPI_negatives["label"] = 0

LPI_positives = pd.read_csv(colab_dir + "prepared_lpi_positives.csv", usecols=columns)
LPI_positives["label"] = 1

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both source frames keep their own 0-based index, so the result would carry
# duplicate index labels and any label-based lookup would be ambiguous.
# Row order (positives first, then negatives) is unchanged.
raw_datasets[dataset_name] = pd.concat([LPI_positives, LPI_negatives], ignore_index=True)

# Drop the temporaries; the combined frame in raw_datasets is the only copy needed.
del LPI_positives
del LPI_negatives

In [None]:
# Display the frame stored under the current dataset_name (set to "LPI" in the cell above).
raw_datasets[dataset_name]

Unnamed: 0,Abstract,Journal,Title,label
0,even though intensive aquaculture production o...,Freshwater Biology,aquaculture non native salmonid invasions and ...,1
1,because sea otters enhydra lutris exert a wide...,Ecology,bald eagles and sea otters in the aleutian arc...,1
2,interactions between sea otters enhydra lutris...,Marine Ecology Progress Series,changes in sea urchins and kelp following a re...,1
3,bacterial abundance production and extracellul...,Marine Biology,microbial activity and carbon nitrogen and pho...,1
4,the main objective of many conservation progra...,Ecological Applications,density dependent productivity depression in p...,1
...,...,...,...,...
4995,for any enzyme catalyzed reaction to occur the...,The ISME journal,relationships between protein encoding gene ab...,0
4996,high intensity functional training hift is a p...,Military medicine,is high intensity functional training hift cro...,0
4997,the developmental plasticity of plants relies ...,Proceedings of the National Academy of Science...,differential tor activation and cell prolifera...,0
4998,ocean acidification produced by dissolution of...,Proceedings of the National Academy of Science...,global declines in oceanic nitrification rates...,0


## PREDICTS

In [None]:
dataset_name = "PREDICTS"
columns = ["Abstract", "Journal", "Title"]

# Negatives get label 0 and positives get label 1.
PREDICTS_negatives = pd.read_csv(colab_dir + "prepared_predicts_negatives.csv", usecols=columns)
PREDICTS_negatives["label"] = 0

PREDICTS_positives = pd.read_csv(colab_dir + "prepared_predicts_positives.csv", usecols=columns)
PREDICTS_positives["label"] = 1

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both source frames keep their own 0-based index, so the result would carry
# duplicate index labels and any label-based lookup would be ambiguous.
# Row order (positives first, then negatives) is unchanged.
raw_datasets[dataset_name] = pd.concat([PREDICTS_positives, PREDICTS_negatives], ignore_index=True)

# Drop the temporaries; the combined frame in raw_datasets is the only copy needed.
del PREDICTS_positives
del PREDICTS_negatives

In [None]:
# Display the frame stored under the current dataset_name (set to "PREDICTS" in the cell above).
raw_datasets[dataset_name]

Unnamed: 0,Abstract,Journal,Title,label
0,bees are believed to be dominant pollen vector...,Journal of Applied Ecology,bee diversity along a disturbance gradient in ...,1
1,the maintenance of grasslands as distinct habi...,Conservation Biology,grazing intensity and the diversity of grassho...,1
2,male euglossine bees were sampled with chemica...,Biotropica,abundance and diversity of euglossine bees in ...,1
3,niche breadth of species has been hypothesized...,The American Naturalist,ecological specialization and susceptibility t...,1
4,bumblebees hymenoptera apidae are important po...,Oikos,use of genetic markers to quantify bumblebee f...,1
...,...,...,...,...
4995,we tested the hypothesis that the appearance o...,The Science of the total environment,exo enzymatic activities and dissolved organic...,0
4996,given a constantly increasing urban population...,The Science of the total environment,numerical study of the impact of vegetation co...,0
4997,musty and earthy odors frequently characterize...,Water research,contribution of streptomyces in sediment to ea...,0
4998,we selected five typical tree species includin...,Tree physiology,utilization of lightflecks by seedlings of fiv...,0


## Preprocess Data

In [None]:
# Keyed by (dataset name, text field); values are DatasetDicts with train/validation splits.
preprocessed_datasets_dict = dict()

In [None]:
# For every (dataset, text field) pair: make a fixed 75/25 split and wrap both
# halves as Hugging Face datasets. The row positions are split alongside the
# data so indices_train / indices_test say which rows landed in which half.
for dataset_name, text_data in [(name, field) for name in raw_datasets for field in ["Title", "Abstract"]]:
  (X_train, X_test,
   y_train, y_test,
   indices_train, indices_test) = train_test_split(
      raw_datasets[dataset_name][text_data],
      raw_datasets[dataset_name]["label"],
      np.arange(len(raw_datasets[dataset_name])),
      test_size=0.25,
      random_state=42,
  )

  preprocessed_datasets_dict[dataset_name, text_data] = DatasetDict({
      "train": prepare_data(X_train, y_train),
      "validation": prepare_data(X_test, y_test),
  })

In [None]:
# Print each (dataset, text field) heading followed by its DatasetDict summary
# and one blank separator line.
for dataset_name, text_data in [(name, field) for name in raw_datasets for field in ["Title", "Abstract"]]:
  print(dataset_name, text_data)
  print(preprocessed_datasets_dict[dataset_name, text_data], end="\n\n")

LPI Title
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 4224
    })
    validation: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 1409
    })
})

LPI Abstract
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 4224
    })
    validation: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 1409
    })
})

PREDICTS Title
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 4152
    })
    validation: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 1384
    })
})

PREDICTS Abstract
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 4152
    })
    validation: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 1384
    })
})



In [None]:
# Keyed by (dataset name, text field, model name); values are tokenized DatasetDicts.
tokenized_datasets_dict = dict()

In [None]:
tokenizer_dict = {}

for model_name, model_checkpoint in zip(model_names, checkpoint_names):
  # Load each checkpoint's tokenizer once per model instead of twice inside
  # every innermost iteration. `tokenizer` has to stay a module-level name
  # because tokenize_function reads it as a global; the same instance is
  # stored in tokenizer_dict for use when building the Trainer.
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  tokenizer_dict[model_name] = tokenizer

  for dataset_name in list(raw_datasets.keys()):
    for text_data in ["Title", "Abstract"]:
      # Tokenize, drop the columns the model cannot consume, and rename the
      # target column to "labels".
      tokenized_datasets_dict[dataset_name, text_data, model_name] = (
          preprocessed_datasets_dict[dataset_name, text_data]
          .map(tokenize_function, batched=True)
          .remove_columns(["text", "idx"])
          .rename_column("label", "labels")
      )
      # set_format mutates the DatasetDict in place so items come back as torch tensors.
      tokenized_datasets_dict[dataset_name, text_data, model_name].set_format("torch")

Map:   0%|          | 0/4224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

Map:   0%|          | 0/4224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

Map:   0%|          | 0/1384 [00:00<?, ? examples/s]

Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

Map:   0%|          | 0/1384 [00:00<?, ? examples/s]

Map:   0%|          | 0/4224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

Map:   0%|          | 0/4224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

Map:   0%|          | 0/1384 [00:00<?, ? examples/s]

Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

Map:   0%|          | 0/1384 [00:00<?, ? examples/s]

In [None]:
# One padding collator per model, keyed by model name.
data_collator_dict = {}
for model_name, model_checkpoint in zip(model_names, checkpoint_names):
  # Reuse the tokenizer the datasets were encoded with, so padding matches it
  # and the checkpoint is not loaded again. Load from the checkpoint only if
  # no tokenizer has been registered for this model yet.
  if model_name in tokenizer_dict:
    tokenizer = tokenizer_dict[model_name]
  else:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  data_collator_dict[model_name] = DataCollatorWithPadding(tokenizer=tokenizer)

# Classical ML Models

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# TF-IDF + logistic-regression baseline, scored on the same 75/25 split
# (random_state=42) that is used for the transformer models.
results_list = []

for dataset_name, text_data in [(name, field) for name in raw_datasets for field in ["Title", "Abstract"]]:
  print(dataset_name, text_data)

  (X_train, X_test,
   y_train, y_test,
   indices_train, indices_test) = train_test_split(
      raw_datasets[dataset_name][text_data],
      raw_datasets[dataset_name]["label"],
      np.arange(len(raw_datasets[dataset_name])),
      test_size=0.25,
      random_state=42,
  )

  # Word uni- and bi-grams; terms present in more than 85% of documents are dropped.
  # The vocabulary is fitted on the training texts only.
  vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), max_df=0.85)
  X_train = vectorizer.fit_transform(X_train)
  X_test = vectorizer.transform(X_test)

  lr_mod = LogisticRegression(penalty='l2', class_weight='balanced')
  lr_mod.fit(X_train, y_train)
  y_predict = lr_mod.predict(X_test)

  results = calculate_scores(y_test, y_predict)
  results_list.append([dataset_name, text_data, *results, "Logistic Regression"])

df_results_lr = pd.DataFrame(
    results_list,
    columns=["Dataset", "Text", "Accuracy", "Precision", "Recall", "F1-Score", "AUC", "Model"],
)

LPI Title
LPI Abstract
PREDICTS Title
PREDICTS Abstract


In [None]:
# Display the logistic-regression scores, one row per (dataset, text field).
df_results_lr

Unnamed: 0,Dataset,Text,Accuracy,Precision,Recall,F1-Score,AUC,Model
0,LPI,Title,0.932576,0.706731,0.812155,0.755784,0.88124,Logistic Regression
1,LPI,Abstract,0.960256,0.801932,0.917127,0.85567,0.94187,Logistic Regression
2,PREDICTS,Title,0.961705,0.78022,0.916129,0.84273,0.941791,Logistic Regression
3,PREDICTS,Abstract,0.973266,0.843023,0.935484,0.88685,0.956757,Logistic Regression


# Large Language Models

## Training

In [None]:
# Keyed by (dataset name, text field, model name); values are the fine-tuned models.
model_dict = dict()

In [None]:
# Choose the compute device once: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
# Fine-tune one binary sequence classifier per (model, dataset, text field)
# combination and keep each trained model in model_dict for later evaluation.
for model_name, model_checkpoint in zip(model_names, checkpoint_names):
  for dataset_name in list(raw_datasets.keys()):
    for text_data in ["Title", "Abstract"]:
      print(model_name, dataset_name, text_data)

      # Start from the pretrained checkpoint with a fresh 2-class head.
      model_dict[dataset_name, text_data, model_name] = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

      training_args = TrainingArguments(
          output_dir = "{}-finetuned-topicmodelling-{}".format(model_name, dataset_name+text_data),
          learning_rate = 2e-5,
          per_device_train_batch_size = 16,
          per_device_eval_batch_size = 16,
          num_train_epochs = 3,
          weight_decay = 0.01,
          evaluation_strategy = "epoch",
          # push_to_hub=True,
          # Mixed precision is only supported on CUDA devices. Enabling it
          # unconditionally makes TrainingArguments raise on a CPU-only
          # runtime, which the device-selection cell explicitly allows for.
          fp16 = torch.cuda.is_available()
      )

      trainer = Trainer(
          model = model_dict[dataset_name, text_data, model_name],
          args = training_args,
          train_dataset = tokenized_datasets_dict[dataset_name, text_data, model_name]["train"],
          eval_dataset = tokenized_datasets_dict[dataset_name, text_data, model_name]["validation"],
          tokenizer = tokenizer_dict[model_name],
          data_collator = data_collator_dict[model_name]
      )

      trainer.train()

      # trainer.push_to_hub()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss
1,No log,0.2554
2,0.152600,0.145882
3,0.152600,0.173978


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss
1,No log,0.094291
2,0.119500,0.105696
3,0.119500,0.098975


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss
1,No log,0.130219
2,0.152900,0.111636
3,0.152900,0.112405


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss
1,No log,0.138786
2,0.121100,0.08583
3,0.121100,0.08643


Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

Epoch,Training Loss,Validation Loss
1,No log,0.172163
2,0.139500,0.169212
3,0.139500,0.179822


Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

Epoch,Training Loss,Validation Loss
1,No log,0.277801
2,0.112600,0.104732
3,0.112600,0.109369


Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

Epoch,Training Loss,Validation Loss
1,No log,0.203241
2,0.126500,0.090285
3,0.126500,0.099672


Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

Epoch,Training Loss,Validation Loss
1,No log,0.054712
2,0.098000,0.056865
3,0.098000,0.070533


## Evaluation

In [None]:
# Score every fine-tuned model on its validation split and gather the metrics
# into one DataFrame (one row per model / dataset / text field).
results_list = []

for model_name, model_checkpoint in zip(model_names, checkpoint_names):
  for dataset_name in list(raw_datasets):
    for text_data in ["Title", "Abstract"]:

      eval_dataloader = DataLoader(
          tokenized_datasets_dict[dataset_name, text_data, model_name]["validation"],
          batch_size=8,
          collate_fn=data_collator_dict[model_name],
      )

      # Per-batch numpy arrays; the logits are collected as well, although only
      # the arg-max predictions feed into the scores below.
      logit_list, prediction_list, labels_list = [], [], []

      # Switch to inference mode before running the forward passes.
      model_dict[dataset_name, text_data, model_name].eval()
      for batch in eval_dataloader:
          batch = {name: tensor.to(device) for name, tensor in batch.items()}
          with torch.no_grad():
              outputs = model_dict[dataset_name, text_data, model_name](**batch)

          labels, logits = batch["labels"], outputs.logits
          predictions = logits.argmax(dim=-1)

          logit_list.append(logits.detach().cpu().numpy())
          prediction_list.append(predictions.detach().cpu().numpy())
          labels_list.append(labels.detach().cpu().numpy())

      # Flatten the per-batch arrays into one 1-D array each.
      y_true = np.array([label for chunk in labels_list for label in chunk])
      y_predict = np.array([guess for chunk in prediction_list for guess in chunk])

      results = calculate_scores(y_true, y_predict)
      results_list.append([dataset_name, text_data, *results, model_name])

df_results = pd.DataFrame(
    results_list,
    columns=["Dataset", "Text", "Accuracy", "Precision", "Recall", "F1-Score", "AUC", "Model"],
)

In [None]:
# Display the validation scores of the fine-tuned models, one row per (model, dataset, text field).
df_results

Unnamed: 0,Dataset,Text,Accuracy,Precision,Recall,F1-Score,AUC,Model
0,LPI,Title,0.960256,0.874251,0.80663,0.83908,0.894764,BERT
1,LPI,Abstract,0.976579,0.906593,0.911602,0.909091,0.948879,BERT
2,PREDICTS,Title,0.968208,0.849057,0.870968,0.859873,0.92572,BERT
3,PREDICTS,Abstract,0.979046,0.875,0.948387,0.910217,0.96565,BERT
4,LPI,Title,0.958126,0.896104,0.762431,0.823881,0.874701,BioBERT
5,LPI,Abstract,0.97516,0.924419,0.878453,0.90085,0.933933,BioBERT
6,PREDICTS,Title,0.973988,0.940741,0.819355,0.875862,0.906423,BioBERT
7,PREDICTS,Abstract,0.979769,0.899371,0.922581,0.910828,0.954781,BioBERT


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
