In [1]:
import pandas as pd
import numpy as np
np.random.seed(1907)
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

from datasets import Dataset

id2label = {
    0:"Cardiac",
    1:"Brain",
    2:"Prostate"
}

label2id = {
    "Cardiac":0,
    "Brain":1,
    "Prostate":2
}

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

df = pd.read_csv("../../data/abstract_dataset.csv")

df_unique = df.drop_duplicates(subset=['title','doi'],keep=False)
df_unique = df_unique[['abstract','task']]
df_unique.columns=["text","labels"]
df_unique['labels']= df_unique['labels'].replace({'Cardiac':0, 'Brain':1, 'Prostate':2})

train,test = train_test_split(df_unique,test_size=0.2)
test,dev = train_test_split(test,test_size=0.5)


train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)
dev_dataset = Dataset.from_pandas(dev)

def preprocess_function(exemples):
    return tokenizer(exemples["text"],truncation=True,padding='max_length',return_tensors="pt",max_length=300)

tokenized_train =  train_dataset.map(preprocess_function,batched=True)
tokenized_test =  test_dataset.map(preprocess_function,batched=True)
tokenized_dev =  dev_dataset.map(preprocess_function,batched=True)

2023-03-22 09:35:16.970905: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-22 09:35:17.324039: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-22 09:35:17.398791: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-22 09:35:17.398808: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [2]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bionlp/bluebert_pubmed_mimic_uncased_L-24_H-1024_A-16", num_labels=3, id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(output_dir="test_trainer",evaluation_strategy="epoch",per_device_train_batch_size=1,per_device_eval_batch_size=1)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)


Some weights of the model checkpoint at bionlp/bluebert_pubmed_mimic_uncased_L-24_H-1024_A-16 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequ

In [3]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2685
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 8055
  Number of trainable parameters = 335144963


  0%|          | 0/8055 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


: 

: 