# Libraries & Functions

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [3]:
import torch
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
from tokenizers import BertWordPieceTokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizerFast, DistilBertTokenizerFast,  TrainingArguments, Trainer, AutoModelForSequenceClassification
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_predict

In [4]:
!git config --global user.email "viktor.domazetoski@hotmail.com"
!git config --global user.name "ViktorDo1"

from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
# model_names = ["BERT", "BioBERT"]
# checkpoint_names = ["bert-base-cased", "dmis-lab/biobert-base-cased-v1.2"]

model_names = ["distilbert", "distilbert", "debertav2", "electra"]
checkpoint_names = ["ViktorDo/EcoBERT-Pretrained", "distilbert-base-uncased", "microsoft/deberta-v3-base", "google/electra-base-discriminator"]

In [8]:
def prepare_data(X, y):
  data = []
  for i, (sequence, label) in enumerate(zip(X, y)):
    data.append(
        {"text":sequence,
        "label":label,
        "idx":i
        })
  df = pd.DataFrame(data)
  return Dataset.from_pandas(df)

In [9]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, max_length = 512)

In [10]:
from sklearn import metrics

def calculate_scores(y_test, y_pred, average = "binary"):

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average = average)
    recall = recall_score(y_test, y_pred, average = average)
    f1 = f1_score(y_test, y_pred, average = average)

    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=1)
    auc = metrics.auc(fpr, tpr)

    return [accuracy, precision, recall, f1, auc]

# Input Data

In [11]:
raw_datasets = dict()

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
colab_dir = "drive/MyDrive/Text_BioMacro/"

In [14]:
!ls "drive/MyDrive/Text_BioMacro/"

prepared_lpi_negatives.csv  prepared_predicts_negatives.csv
prepared_lpi_positives.csv  prepared_predicts_positives.csv


## Living Planet Index

In [15]:
dataset_name = "LPI"
columns = ["Abstract", "Journal", "Title"]
LPI_negatives = pd.read_csv(colab_dir + "prepared_lpi_negatives.csv", usecols=columns)
LPI_negatives["label"] = 0

LPI_positives = pd.read_csv(colab_dir + "prepared_lpi_positives.csv", usecols=columns)
LPI_positives["label"] = 1


raw_datasets[dataset_name] = pd.concat([LPI_positives, LPI_negatives])

del LPI_positives
del LPI_negatives

In [16]:
raw_datasets[dataset_name]

Unnamed: 0,Abstract,Journal,Title,label
0,even though intensive aquaculture production o...,Freshwater Biology,aquaculture non native salmonid invasions and ...,1
1,because sea otters enhydra lutris exert a wide...,Ecology,bald eagles and sea otters in the aleutian arc...,1
2,interactions between sea otters enhydra lutris...,Marine Ecology Progress Series,changes in sea urchins and kelp following a re...,1
3,bacterial abundance production and extracellul...,Marine Biology,microbial activity and carbon nitrogen and pho...,1
4,the main objective of many conservation progra...,Ecological Applications,density dependent productivity depression in p...,1
...,...,...,...,...
4995,for any enzyme catalyzed reaction to occur the...,The ISME journal,relationships between protein encoding gene ab...,0
4996,high intensity functional training hift is a p...,Military medicine,is high intensity functional training hift cro...,0
4997,the developmental plasticity of plants relies ...,Proceedings of the National Academy of Science...,differential tor activation and cell prolifera...,0
4998,ocean acidification produced by dissolution of...,Proceedings of the National Academy of Science...,global declines in oceanic nitrification rates...,0


## PREDICTS

In [17]:
dataset_name = "PREDICTS"
columns = ["Abstract", "Journal", "Title"]
PREDICTS_negatives = pd.read_csv(colab_dir + "prepared_predicts_negatives.csv", usecols=columns)
PREDICTS_negatives["label"] = 0

PREDICTS_positives = pd.read_csv(colab_dir + "prepared_predicts_positives.csv", usecols=columns)
PREDICTS_positives["label"] = 1

raw_datasets[dataset_name] = pd.concat([PREDICTS_positives, PREDICTS_negatives])

del PREDICTS_positives
del PREDICTS_negatives

In [18]:
raw_datasets[dataset_name]

Unnamed: 0,Abstract,Journal,Title,label
0,bees are believed to be dominant pollen vector...,Journal of Applied Ecology,bee diversity along a disturbance gradient in ...,1
1,the maintenance of grasslands as distinct habi...,Conservation Biology,grazing intensity and the diversity of grassho...,1
2,male euglossine bees were sampled with chemica...,Biotropica,abundance and diversity of euglossine bees in ...,1
3,niche breadth of species has been hypothesized...,The American Naturalist,ecological specialization and susceptibility t...,1
4,bumblebees hymenoptera apidae are important po...,Oikos,use of genetic markers to quantify bumblebee f...,1
...,...,...,...,...
4995,we tested the hypothesis that the appearance o...,The Science of the total environment,exo enzymatic activities and dissolved organic...,0
4996,given a constantly increasing urban population...,The Science of the total environment,numerical study of the impact of vegetation co...,0
4997,musty and earthy odors frequently characterize...,Water research,contribution of streptomyces in sediment to ea...,0
4998,we selected five typical tree species includin...,Tree physiology,utilization of lightflecks by seedlings of fiv...,0


## Preprocess Data

In [19]:
preprocessed_datasets_dict = {}

In [20]:
for dataset_name in list(raw_datasets.keys()):
  for text_data in ["Title", "Abstract"]:
    X_train, X_test, \
    y_train, y_test, \
    indices_train, indices_test \
    = train_test_split(raw_datasets[dataset_name][text_data], raw_datasets[dataset_name]["label"], np.arange(len(raw_datasets[dataset_name])), test_size=0.25, random_state=42)

    preprocessed_datasets_dict[dataset_name, text_data] = DatasetDict()
    preprocessed_datasets_dict[dataset_name, text_data]["train"] = prepare_data(X_train, y_train)
    preprocessed_datasets_dict[dataset_name, text_data]["validation"] = prepare_data(X_test, y_test)

In [21]:
for dataset_name in list(raw_datasets.keys()):
  for text_data in ["Title", "Abstract"]:
    print(dataset_name, text_data)
    print(preprocessed_datasets_dict[dataset_name, text_data])
    print()

LPI Title
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 4224
    })
    validation: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 1409
    })
})

LPI Abstract
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 4224
    })
    validation: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 1409
    })
})

PREDICTS Title
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 4152
    })
    validation: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 1384
    })
})

PREDICTS Abstract
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 4152
    })
    validation: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 1384
    })
})



In [22]:
tokenized_datasets_dict = {}

In [23]:
tokenizer_dict = {}

for model_name, model_checkpoint in zip(model_names, checkpoint_names):
  for dataset_name in list(raw_datasets.keys()):
    for text_data in ["Title", "Abstract"]:
      if(model_checkpoint == "ViktorDo/EcoBERT-Pretrained"):
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        tokenizer_dict[model_name] = AutoTokenizer.from_pretrained("distilbert-base-uncased")
      else:
        tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        tokenizer_dict[model_name] = AutoTokenizer.from_pretrained(model_checkpoint)

      tokenized_datasets_dict[dataset_name, text_data, model_name] = preprocessed_datasets_dict[dataset_name, text_data].map(tokenize_function, batched=True)
      tokenized_datasets_dict[dataset_name, text_data, model_name] = tokenized_datasets_dict[dataset_name, text_data, model_name].remove_columns(["text", "idx"])
      tokenized_datasets_dict[dataset_name, text_data, model_name] = tokenized_datasets_dict[dataset_name, text_data, model_name].rename_column("label", "labels")
      tokenized_datasets_dict[dataset_name, text_data, model_name].set_format("torch")

Map:   0%|          | 0/4224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

Map:   0%|          | 0/4224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

Map:   0%|          | 0/1384 [00:00<?, ? examples/s]

Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

Map:   0%|          | 0/1384 [00:00<?, ? examples/s]

Map:   0%|          | 0/4224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

Map:   0%|          | 0/4224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

Map:   0%|          | 0/1384 [00:00<?, ? examples/s]

Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

Map:   0%|          | 0/1384 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

Map:   0%|          | 0/1384 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

Map:   0%|          | 0/1384 [00:00<?, ? examples/s]

Map:   0%|          | 0/4224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

Map:   0%|          | 0/4224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

Map:   0%|          | 0/1384 [00:00<?, ? examples/s]

Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

Map:   0%|          | 0/1384 [00:00<?, ? examples/s]

In [24]:
data_collator_dict = {}
for model_name, model_checkpoint in zip(model_names, checkpoint_names):
  if(model_checkpoint == "ViktorDo/EcoBERT-Pretrained"):
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
  else:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  data_collator_dict[model_name] = DataCollatorWithPadding(tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Classical ML Models

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

results_list = []

for dataset_name in list(raw_datasets.keys()):
  for text_data in ["Title", "Abstract"]:
    print(dataset_name, text_data)
    X_train, X_test, \
    y_train, y_test, \
    indices_train, indices_test \
            = train_test_split(raw_datasets[dataset_name][text_data], raw_datasets[dataset_name]["label"], np.arange(len(raw_datasets[dataset_name])), test_size=0.25, random_state=42)

    vectorizer = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), max_df = 0.85)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    lr_mod = LogisticRegression(penalty='l2', class_weight='balanced').fit(X_train, y_train)
    y_predict = lr_mod.predict(X_test)

    results = calculate_scores(y_test, y_predict)

    results_list.append([dataset_name, text_data] + results + ["Logistic Regression"])

df_results_lr = pd.DataFrame(results_list, columns=["Dataset", "Text", "Accuracy", "Precision", "Recall", "F1-Score", "AUC", "Model"])

LPI Title
LPI Abstract
PREDICTS Title
PREDICTS Abstract


In [None]:
df_results_lr

Unnamed: 0,Dataset,Text,Accuracy,Precision,Recall,F1-Score,AUC,Model
0,LPI,Title,0.932576,0.706731,0.812155,0.755784,0.88124,Logistic Regression
1,LPI,Abstract,0.960256,0.801932,0.917127,0.85567,0.94187,Logistic Regression
2,PREDICTS,Title,0.961705,0.78022,0.916129,0.84273,0.941791,Logistic Regression
3,PREDICTS,Abstract,0.973266,0.843023,0.935484,0.88685,0.956757,Logistic Regression


# Large Language Models - Title

## Training

In [30]:
model_dict = {}

In [31]:
if torch.cuda.is_available():  # Tell PyTorch to use the GPU.
 device = torch.device("cuda")
 print('There are %d GPU(s) available.' % torch.cuda.device_count())
 print('We will use the GPU:', torch.cuda.get_device_name(0)) # If not...
else:
 print('No GPU available, using the CPU instead.')
 device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [32]:
for model_name, model_checkpoint in zip(model_names[:], checkpoint_names[:]):
  for dataset_name in list(raw_datasets.keys())[:]:
    for text_data in ["Title"][:]:
      print(model_name, dataset_name, text_data)

      model_dict[dataset_name, text_data, model_name] = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

      training_args = TrainingArguments(
          output_dir = "{}-finetuned-topicmodelling-{}".format(model_name, dataset_name+text_data),
          learning_rate = 2e-5,
          per_device_train_batch_size = 16,
          per_device_eval_batch_size = 16,
          num_train_epochs = 3,
          weight_decay = 0.01,
          evaluation_strategy = "epoch",
          # push_to_hub=True,
          fp16 = True
      )

      trainer = Trainer(
          model = model_dict[dataset_name, text_data, model_name],
          args = training_args,
          train_dataset = tokenized_datasets_dict[dataset_name, text_data, model_name]["train"],
          eval_dataset = tokenized_datasets_dict[dataset_name, text_data, model_name]["validation"],
          tokenizer = tokenizer_dict[model_name],
          data_collator = data_collator_dict[model_name]
      )

      trainer.train()

      # trainer.push_to_hub()

distilbert LPI Title


Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at ViktorDo/EcoBERT-Pretrained were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ViktorDo/EcoBERT-Pretrained and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pr

Epoch,Training Loss,Validation Loss
1,No log,0.17502
2,0.128000,0.125894
3,0.128000,0.14029


distilbert PREDICTS Title


Some weights of the model checkpoint at ViktorDo/EcoBERT-Pretrained were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ViktorDo/EcoBERT-Pretrained and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pr

Epoch,Training Loss,Validation Loss
1,No log,0.08387
2,0.120700,0.102451
3,0.120700,0.082015


distilbert LPI Title


Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

Epoch,Training Loss,Validation Loss
1,No log,0.142272
2,0.130000,0.13983
3,0.130000,0.151728


distilbert PREDICTS Title


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

Epoch,Training Loss,Validation Loss
1,No log,0.107874
2,0.119100,0.121189
3,0.119100,0.101804


debertav2 LPI Title


Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Epoch,Training Loss,Validation Loss
1,No log,0.178375
2,0.175700,0.157655
3,0.175700,0.164302


debertav2 PREDICTS Title


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Epoch,Training Loss,Validation Loss
1,No log,0.233889
2,0.150300,0.110278
3,0.150300,0.119524


electra LPI Title


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.o

Epoch,Training Loss,Validation Loss
1,No log,0.136794
2,0.160400,0.153299
3,0.160400,0.166023


electra PREDICTS Title


Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.o

Epoch,Training Loss,Validation Loss
1,No log,0.134602
2,0.145100,0.129215
3,0.145100,0.124885


## Evaluation

In [33]:
results_list = []

for model_name, model_checkpoint in zip(model_names[:], checkpoint_names[:]):
  for dataset_name in list(raw_datasets.keys())[:]:
    for text_data in ["Title"][:]:

      eval_dataloader = DataLoader(
            tokenized_datasets_dict[dataset_name, text_data, model_name]["validation"], batch_size=8, collate_fn=data_collator_dict[model_name]
      )


      logit_list = []
      prediction_list = []
      labels_list = []
      model_dict[dataset_name, text_data, model_name].eval()
      for batch in eval_dataloader:
          batch = {k: v.to(device) for k, v in batch.items()}
          with torch.no_grad():
              outputs = model_dict[dataset_name, text_data, model_name](**batch)

          labels = batch["labels"]
          logits = outputs.logits
          predictions = torch.argmax(logits, dim=-1)
          logit_list.append(logits.cpu().detach().numpy())
          prediction_list.append(predictions.cpu().detach().numpy())
          labels_list.append(labels.cpu().detach().numpy())

      y_true = np.array([item for sublist in labels_list for item in sublist])
      y_predict = np.array([item for sublist in prediction_list for item in sublist])

      results = calculate_scores(y_true, y_predict)

      results_list.append([dataset_name, text_data] + results + [model_name])

df_results = pd.DataFrame(results_list, columns=["Dataset", "Text", "Accuracy", "Precision", "Recall", "F1-Score", "AUC", "Model"])

In [34]:
df_results

Unnamed: 0,Dataset,Text,Accuracy,Precision,Recall,F1-Score,AUC,Model
0,LPI,Title,0.958126,0.871951,0.790055,0.828986,0.886477,distilbert
1,PREDICTS,Title,0.969653,0.850932,0.883871,0.867089,0.932171,distilbert
2,LPI,Title,0.958126,0.871951,0.790055,0.828986,0.886477,distilbert
3,PREDICTS,Title,0.969653,0.850932,0.883871,0.867089,0.932171,distilbert
4,LPI,Title,0.958836,0.877301,0.790055,0.831395,0.886884,debertav2
5,PREDICTS,Title,0.965318,0.820359,0.883871,0.850932,0.92973,debertav2
6,LPI,Title,0.958836,0.859649,0.812155,0.835227,0.896305,electra
7,PREDICTS,Title,0.965318,0.849673,0.83871,0.844156,0.909998,electra


In [35]:
df_results.to_excel("TopicModeling_Encoder_Results_Title.xlsx", index = False)

# Large Language Models - Abstract

## Training

In [25]:
model_dict = {}

In [26]:
if torch.cuda.is_available():  # Tell PyTorch to use the GPU.
 device = torch.device("cuda")
 print('There are %d GPU(s) available.' % torch.cuda.device_count())
 print('We will use the GPU:', torch.cuda.get_device_name(0)) # If not...
else:
 print('No GPU available, using the CPU instead.')
 device = torch.device("cpu")

import gc
torch.cuda.empty_cache()
gc.collect()

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


218

In [27]:
for model_name, model_checkpoint in zip(model_names[:], checkpoint_names[:]):
  for dataset_name in list(raw_datasets.keys())[:]:
    for text_data in ["Abstract"][:]:
      print(model_name, dataset_name, text_data)

      model_dict[dataset_name, text_data, model_name] = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

      training_args = TrainingArguments(
          output_dir = "{}-finetuned-topicmodelling-{}".format(model_name, dataset_name+text_data),
          learning_rate = 2e-5,
          per_device_train_batch_size = 8,
          per_device_eval_batch_size = 8,
          num_train_epochs = 3,
          weight_decay = 0.01,
          evaluation_strategy = "epoch",
          # push_to_hub=True,
          fp16 = True
      )

      trainer = Trainer(
          model = model_dict[dataset_name, text_data, model_name],
          args = training_args,
          train_dataset = tokenized_datasets_dict[dataset_name, text_data, model_name]["train"],
          eval_dataset = tokenized_datasets_dict[dataset_name, text_data, model_name]["validation"],
          tokenizer = tokenizer_dict[model_name],
          data_collator = data_collator_dict[model_name]
      )

      trainer.train()

      torch.cuda.empty_cache()
      gc.collect()
      # trainer.push_to_hub()

distilbert LPI Abstract


Some weights of the model checkpoint at ViktorDo/EcoBERT-Pretrained were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ViktorDo/EcoBERT-Pretrained and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pr

Epoch,Training Loss,Validation Loss
1,0.1498,0.087065
2,0.0807,0.101794
3,0.0368,0.1025


distilbert PREDICTS Abstract


Some weights of the model checkpoint at ViktorDo/EcoBERT-Pretrained were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ViktorDo/EcoBERT-Pretrained and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pr

Epoch,Training Loss,Validation Loss
1,0.1284,0.066457
2,0.0675,0.070919
3,0.0258,0.075232


distilbert LPI Abstract


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

Epoch,Training Loss,Validation Loss
1,0.1547,0.113928
2,0.0866,0.093054
3,0.0426,0.10505


distilbert PREDICTS Abstract


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

Epoch,Training Loss,Validation Loss
1,0.1386,0.063935
2,0.0708,0.086005
3,0.0341,0.068978


debertav2 LPI Abstract


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Epoch,Training Loss,Validation Loss
1,0.187,0.149469
2,0.1092,0.124374
3,0.0683,0.103767


debertav2 PREDICTS Abstract


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Epoch,Training Loss,Validation Loss
1,0.1623,0.090255
2,0.0949,0.126035
3,0.0574,0.083314


electra LPI Abstract


Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.o

Epoch,Training Loss,Validation Loss
1,0.1687,0.141681
2,0.0914,0.10994
3,0.0447,0.116972


electra PREDICTS Abstract


Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.o

Epoch,Training Loss,Validation Loss
1,0.1515,0.110367
2,0.0679,0.09455
3,0.0425,0.09582


## Evaluation

In [28]:
results_list = []

for model_name, model_checkpoint in zip(model_names[:], checkpoint_names[:]):
  for dataset_name in list(raw_datasets.keys())[:]:
    for text_data in ["Abstract"][:]:

      eval_dataloader = DataLoader(
            tokenized_datasets_dict[dataset_name, text_data, model_name]["validation"], batch_size=8, collate_fn=data_collator_dict[model_name]
      )


      logit_list = []
      prediction_list = []
      labels_list = []
      model_dict[dataset_name, text_data, model_name].eval()
      for batch in eval_dataloader:
          batch = {k: v.to(device) for k, v in batch.items()}
          with torch.no_grad():
              outputs = model_dict[dataset_name, text_data, model_name](**batch)

          labels = batch["labels"]
          logits = outputs.logits
          predictions = torch.argmax(logits, dim=-1)
          logit_list.append(logits.cpu().detach().numpy())
          prediction_list.append(predictions.cpu().detach().numpy())
          labels_list.append(labels.cpu().detach().numpy())

      y_true = np.array([item for sublist in labels_list for item in sublist])
      y_predict = np.array([item for sublist in prediction_list for item in sublist])

      results = calculate_scores(y_true, y_predict)

      results_list.append([dataset_name, text_data] + results + [model_name])

df_results = pd.DataFrame(results_list, columns=["Dataset", "Text", "Accuracy", "Precision", "Recall", "F1-Score", "AUC", "Model"])

In [29]:
df_results

Unnamed: 0,Dataset,Text,Accuracy,Precision,Recall,F1-Score,AUC,Model
0,LPI,Abstract,0.977999,0.898936,0.933702,0.915989,0.959115,distilbert
1,PREDICTS,Abstract,0.984827,0.91875,0.948387,0.933333,0.968905,distilbert
2,LPI,Abstract,0.977999,0.898936,0.933702,0.915989,0.959115,distilbert
3,PREDICTS,Abstract,0.984827,0.91875,0.948387,0.933333,0.968905,distilbert
4,LPI,Abstract,0.977289,0.916201,0.906077,0.911111,0.946931,debertav2
5,PREDICTS,Abstract,0.979769,0.889571,0.935484,0.91195,0.960419,debertav2
6,LPI,Abstract,0.97516,0.896739,0.911602,0.90411,0.948065,electra
7,PREDICTS,Abstract,0.980491,0.895062,0.935484,0.914826,0.960826,electra


In [30]:
df_results.to_excel("TopicModeling_Encoder_Results_Abstract.xlsx", index = False)