In [None]:
### training of single-task adapters
!pip -q install --upgrade transformers

In [None]:
!pip -q install --upgrade torch torchvision torchaudio

In [None]:
!pip install -qq adapters datasets

In [None]:
import torch
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoConfig
import torch
from adapters import AutoAdapterModel
import numpy as np
from transformers import TrainingArguments, EvalPrediction
from adapters import AdapterTrainer
from sklearn.metrics import f1_score

In [None]:
from datasets import load_dataset, DatasetDict

In [None]:
# Load the MBIB dataset.
MBIB_DATASET_ID = "mediabiasgroup/mbib-base"
dataset = load_dataset(MBIB_DATASET_ID)
# Display the number of rows of the loaded data.
dataset.num_rows

In [None]:
## Select the split of the bias type of interest and shuffle it with a fixed seed
BIAS_SPLIT = 'text_level_bias'
SHUFFLE_SEED = 42
dataset1 = dataset[BIAS_SPLIT].shuffle(seed=SHUFFLE_SEED)

In [None]:
# Replace NaN entries in dataset1['text'] with ' ' (a single space).

def _clean_text(value):
    """Return ``value`` as a string, mapping a NaN entry to a single space.

    An entry counts as NaN when its string form is exactly 'nan' (which is
    what ``str(float('nan'))`` produces). Every other value is returned through
    ``str`` untouched, so words that merely contain the letters "nan"
    (e.g. "finance", "banana") are no longer mangled.
    """
    text = str(value)
    return ' ' if text == 'nan' else text

dataset1 = dataset1.map(lambda examples: {'text': [_clean_text(x) for x in examples['text']]}, batched=True)


In [None]:
# Split dataset1 into train / validation / test (60% / 20% / 20%).
# Both cuts are taken in row order (shuffle=False).

train_and_holdout = dataset1.train_test_split(test_size=0.4, seed=42, shuffle=False)
validation_and_test = train_and_holdout['test'].train_test_split(test_size=0.5, seed=42, shuffle=False)

dataset1 = DatasetDict({
    'train': train_and_holdout['train'],
    'test': validation_and_test['test'],
    'validation': validation_and_test['train'],
})


In [None]:

# Tokenizer of the checkpoint used throughout this notebook.
TOKENIZER_CHECKPOINT = "microsoft/deberta-base-mnli"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def encode_batch(batch):
  """Tokenize the "text" entry of a batch with the module-level ``tokenizer``.

  Sequences are truncated to at most 512 tokens. No padding is applied here:
  padding inside a batched ``map`` would stretch every example to the longest
  text of its map batch and store those pad tokens in the dataset. Padding is
  left to the ``DataCollatorWithPadding`` passed to the trainer, which pads
  each mini-batch only as far as needed.

  Args:
    batch: mapping with a "text" entry holding the texts to encode.

  Returns:
    The tokenizer output for the given texts.
  """
  return tokenizer(batch["text"], truncation=True, max_length=512)

# Tokenize the data in batches, then rename the target column:
# the transformers model expects it to be called "labels".
dataset1 = dataset1.map(encode_batch, batched=True).rename_column("label", "labels")
# Output PyTorch tensors, restricted to the columns the model needs.
model_input_columns = ["input_ids", "attention_mask", "labels"]
dataset1.set_format(type="torch", columns=model_input_columns)

In [None]:
## Download the configuration and the pre-trained model
BASE_MODEL_ID = "microsoft/deberta-base-mnli"
config = AutoConfig.from_pretrained(BASE_MODEL_ID)
model = AutoAdapterModel.from_pretrained(BASE_MODEL_ID, config=config)

In [None]:
# The adapter and its classification head share one name.
ADAPTER_NAME = "text_level_bias_deberta-mnli"

# Add a new adapter with the "seq_bn" configuration.
model.add_adapter(ADAPTER_NAME, config="seq_bn")

# Add a matching two-label classification head.
model.add_classification_head(ADAPTER_NAME, num_labels=2, id2label={0: 0, 1: 1})

# Activate the adapter for training.
model.train_adapter(ADAPTER_NAME)


In [None]:
# Collator built from the tokenizer; it is handed to the trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import numpy as np
from transformers import TrainingArguments, EvalPrediction, EarlyStoppingCallback
from adapters import AdapterTrainer
from sklearn.metrics import f1_score

# Hyper-parameters and bookkeeping options for adapter training.
training_args = TrainingArguments(
    output_dir="./training_output",
    report_to="none",
    learning_rate=1.2e-4,
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and save once per epoch; load the best model when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Important: keeps the dataset labels so they are properly passed to the model.
    remove_unused_columns=False,
)

def compute_metrics(p: EvalPrediction):
  """Compute the macro-averaged F1 score of an evaluation pass.

  The predicted class of each row is the argmax over axis 1 of
  ``p.predictions``; it is scored against ``p.label_ids``.

  Returns:
    A dict with the single key 'macro_f1'.
  """
  predicted_classes = np.argmax(p.predictions, axis=1)
  return {'macro_f1': f1_score(p.label_ids, predicted_classes, average='macro')}

# Early stopping with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset1["train"],
    eval_dataset=dataset1["validation"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run training with the trainer configured above.
trainer.train()

In [None]:


# Stand-alone macro-F1 helper for a prediction output
def compute_macro_f1(pred):
  """Return ``{'macro-f1': score}`` for a prediction output.

  ``pred`` must expose ``predictions`` (the argmax over axis 1 is taken as the
  predicted class) and ``label_ids`` (the reference labels).
  """
  predicted_classes = np.argmax(pred.predictions, axis=1)
  score = f1_score(pred.label_ids, predicted_classes, average='macro')
  return {'macro-f1': score}

# Run the trainer on the test split, then score its output with compute_macro_f1.
test_split = dataset1["test"]
predictions = trainer.predict(test_split)
metrics = compute_macro_f1(predictions)

# Print the macro-F1 score
print(metrics)

In [None]:
# Interactive Hugging Face Hub login; presumably required before the adapter upload below.
from huggingface_hub import notebook_login
notebook_login()


In [None]:
## Upload the task adapter to the Hugging Face Hub
# NOTE(review): the two positional strings are presumably the Hub repository
# name and the name of the local adapter (identical here) — confirm against the
# installed `adapters` version.
model.push_adapter_to_hub(
    "text_level_bias_deberta-mnli",
    "text_level_bias_deberta-mnli",
    datasets_tag='mediabiasgroup/mbib-base'  # same dataset id that this notebook loads
)

In [None]:
##Inference on test dataset from saved pre-trained adapters
from adapters import AutoAdapterModel
from transformers import AutoTokenizer

# Fresh tokenizer and base model; the saved adapter is then loaded and set active.
INFERENCE_BASE_MODEL = "microsoft/deberta-base-mnli"
tokenizer = AutoTokenizer.from_pretrained(INFERENCE_BASE_MODEL)
model = AutoAdapterModel.from_pretrained(INFERENCE_BASE_MODEL)
adapter_name = model.load_adapter("SOUMYADEEPSAR/text_level_bias_deberta-mnli", set_active=True)


In [None]:
from tqdm.auto import tqdm
import torch

In [None]:
# Predict a class for every test example, one text at a time.
model.eval()  # make sure dropout etc. are in inference mode
device = next(model.parameters()).device  # inputs must live where the model lives
# Fetch the text column once instead of re-reading it from the dataset on
# every loop iteration.
test_texts = list(dataset1['test']['text'])

pred=[]
with torch.no_grad():  # inference only: skip gradient bookkeeping
  for text in tqdm(test_texts):
    input_data = tokenizer(text, return_tensors="pt",truncation=True,max_length=512).to(device)
    outputs = model(**input_data)
    # outputs[0] holds the class scores of the single input; pick the best class.
    predicted = torch.argmax(outputs[0], dim=-1).item()
    pred.append(predicted)


In [None]:
# The target column was renamed from "label" to "labels" during preprocessing,
# so it must be read under its new name. Values are converted to plain ints so
# that scikit-learn receives an ordinary list.
actual = [int(label) for label in dataset1['test']['labels']]

In [None]:
from sklearn.metrics import classification_report
# Classification report comparing the reference labels with the predictions.
report = classification_report(actual, pred)
print(report)

In [None]:
from sklearn.metrics import  f1_score
# Macro-averaged F1 of the predictions against the reference labels.
macro_f1 = f1_score(actual, pred, average='macro')
print(macro_f1)