In [None]:
!pip install -Uq adapters
!pip install -q datasets
!pip install -q accelerate

In [None]:
from transformers import DataCollatorWithPadding
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import KFold

In [None]:
from transformers import AutoConfig
from adapters import AutoAdapterModel

In [None]:
# Download the dataset from the Hugging Face Hub.
from datasets import load_dataset

MBIB_REPO = "mediabiasgroup/mbib-base"
dataset = load_dataset(MBIB_REPO)
# Last expression of the cell, so the notebook displays the row counts.
dataset.num_rows

In [None]:
# Take the 'text_level_bias' subset and shuffle it with a fixed seed, so the
# example order (which the un-shuffled KFold below splits by position) is
# the same on every run.
text_level_subset = dataset['text_level_bias']
dataset1 = text_level_subset.shuffle(seed=42)


In [None]:
# Rows whose 'text' is None get a single space instead, so every example
# carries a string in that column.
dataset1 = dataset1.map(
    lambda row: {'text': row['text'] if row['text'] is not None else ' '}
)

In [None]:
# # replace nan value with empty string
# dataset1['text'] = dataset1['text'].fillna(' ')

In [None]:
# scikit-learn K-fold splitter with three folds; no shuffling is requested here.
n_folds = 3
kf = KFold(n_splits=n_folds)


In [None]:
from transformers import AutoTokenizer

# Tokenizer of the same checkpoint that the model cells load further down.
# (An earlier variant used BertTokenizer.from_pretrained("bert-base-uncased").)
tokenizer_checkpoint = "cardiffnlp/twitter-roberta-base-2022-154m"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
# `tokenizer` is created in the preceding cell; it is not loaded a second time here.

def encode_batch(batch):
  """Tokenize a batch of examples with the model tokenizer.

  Args:
    batch: mapping whose "text" entry holds the list of input strings.

  Returns:
    The tokenizer output for those strings, truncated to at most 512 tokens.
    No padding is applied at this stage: the DataCollatorWithPadding passed to
    the trainer pads every training/eval batch to its own longest sequence,
    so padding here would only make the stored sequences longer than needed.
  """
  return tokenizer(batch["text"], truncation=True, max_length=512)

# Each step is guarded by the current column names so that re-running this
# cell does not fail on the already-renamed "label" column.
# Encode the input data
if "input_ids" not in dataset1.column_names:
  dataset1 = dataset1.map(encode_batch, batched=True)
# The transformers model expects the target class column to be named "labels"
if "label" in dataset1.column_names:
  dataset1 = dataset1.rename_column("label", "labels")
# Transform to pytorch tensors and only output the required columns
dataset1.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:

# Identity mapping over the two class ids; only its length is used later
# (as the number of outputs of the classification head).
id2label = {class_id: class_id for class_id in (0, 1)}


In [None]:
# Base checkpoint loaded as an adapter-capable model.
# NOTE(review): the cross-validation loop further down builds a fresh model
# for every fold, so this instance is not the one that gets trained there.
base_checkpoint = "cardiffnlp/twitter-roberta-base-2022-154m"
config = AutoConfig.from_pretrained(base_checkpoint)
model = AutoAdapterModel.from_pretrained(base_checkpoint, config=config)

In [None]:
from adapters.composition import Fuse

# Hub repositories of the pre-trained single-task adapters to combine.
# They are loaded without their prediction heads.
pretrained_adapter_repos = (
    "SOUMYADEEPSAR/political_bias",
    "SOUMYADEEPSAR/gender_bias",
    "SOUMYADEEPSAR/racial_bias",
    "SOUMYADEEPSAR/cognitive_bias1",
    "SOUMYADEEPSAR/text_level_bias1",
)
for adapter_repo in pretrained_adapter_repos:
    model.load_adapter(adapter_repo, with_head=False)

# One fusion layer spanning all five loaded adapters.
fused_adapter_names = (
    "political_bias",
    "gender_bias",
    "racial_bias",
    "cognitive_bias1",
    "text_level_bias1",
)
adapter_setup = Fuse(*fused_adapter_names)
model.add_adapter_fusion(adapter_setup)

# Classification head for the target task, one output per entry of id2label.
head_size = len(id2label)
model.add_classification_head("cognitive_bias_fusion", num_labels=head_size)

In [None]:
# Put the model into AdapterFusion training mode for the fusion setup defined
# above. The cross-validation cell repeats this call on each per-fold model.
model.train_adapter_fusion(adapter_setup)

In [None]:
# Collator handed to the trainer: it uses the tokenizer to pad the examples of
# each batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import numpy as np
from transformers import TrainingArguments, EvalPrediction,EarlyStoppingCallback
from adapters import AdapterTrainer

from sklearn.metrics import f1_score

# Hyper-parameters shared by every cross-validation fold.
training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="./training_output",
    #overwrite_output_dir=True,
    # Optimisation settings.
    learning_rate=5.5e-5,
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    #logging_steps=200,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    # NOTE(review): no metric_for_best_model is set, so "best" presumably means
    # lowest eval loss rather than highest macro_f1 — confirm this is intended.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
    # No external experiment tracker.
    report_to="none",
)

def compute_metrics(p: EvalPrediction):
  """Score one evaluation pass with macro-averaged F1.

  Args:
    p: predictions and gold labels supplied by the trainer. The predicted
      class of each example is the argmax over axis 1 of `p.predictions`.

  Returns:
    A dict with the single key 'macro_f1'.
  """
  predicted_labels = np.argmax(p.predictions, axis=1)
  return {'macro_f1': f1_score(p.label_ids, predicted_labels, average='macro')}



In [None]:
# Dry run of the split used for training below: for every fold, print the
# train / validation / test index arrays followed by their sizes.
for fold, (train_idx, test_idx) in enumerate(kf.split(dataset1)):
    # First 80% of the fold's training indices train, the remaining 20% validate.
    cut = int(len(train_idx) * 0.8)
    train_idx1, val_idx = train_idx[:cut], train_idx[cut:]
    print(f"Fold {fold+1}")
    for indices in (train_idx1, val_idx, test_idx):
        print(indices)
    for indices in (train_idx1, val_idx, test_idx):
        print(len(indices))

In [None]:
from adapters import AdapterTrainer

In [None]:
from adapters.composition import Fuse


def _build_fusion_model():
    """Build a fresh, untrained AdapterFusion classifier for one fold.

    Loads the base checkpoint, loads the five pre-trained single-task adapters
    without their heads, adds a fusion layer over them plus a classification
    head, and switches the model into fusion-training mode.

    Returns:
        The configured model, ready to be handed to AdapterTrainer.
    """
    checkpoint = "cardiffnlp/twitter-roberta-base-2022-154m"
    config = AutoConfig.from_pretrained(checkpoint)
    model = AutoAdapterModel.from_pretrained(checkpoint, config=config)

    # Load the pre-trained adapters we want to fuse
    for adapter_repo in (
        "SOUMYADEEPSAR/political_bias",
        "SOUMYADEEPSAR/gender_bias",
        "SOUMYADEEPSAR/racial_bias",
        "SOUMYADEEPSAR/cognitive_bias1",
        "SOUMYADEEPSAR/text_level_bias1",
    ):
        model.load_adapter(adapter_repo, with_head=False)

    # Add a fusion layer for all loaded adapters
    adapter_setup = Fuse("political_bias", "gender_bias", "racial_bias", "cognitive_bias1", "text_level_bias1")
    model.add_adapter_fusion(adapter_setup)

    # Add a classification head for our target task.
    # NOTE(review): the head is called "political_bias_fusion" although the data
    # is the 'text_level_bias' subset; the name is kept unchanged so saved
    # checkpoints keep the same head name — consider renaming.
    model.add_classification_head("political_bias_fusion", num_labels=len(id2label))
    # Activate the adapter-fusion layers for training and freeze all other
    # parameters of the model and the single-task adapters.
    model.train_adapter_fusion(adapter_setup)
    return model


# Test macro-F1 of each fold, in fold order.
f1_scores = []
for fold, (train_idx, test_idx) in enumerate(kf.split(dataset1)):
    print(f"Fold {fold+1}")
    # 80 percent of train data from a fold was used for training and 20 percent for validation
    split_at = int(len(train_idx) * 0.8)
    train_idx1, val_idx = train_idx[:split_at], train_idx[split_at:]

    # Split dataset into training, validation and test based on the indices from KFold
    train_split = dataset1.select(train_idx1)
    val_split = dataset1.select(val_idx)
    test_split = dataset1.select(test_idx)
    print(len(train_split))
    print(len(val_split))
    print(len(test_split))

    # Every fold starts from a newly built model so nothing learned on one
    # fold carries over to the next.
    model = _build_fusion_model()

    # Create a Trainer instance
    trainer = AdapterTrainer(
        model=model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=val_split,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the validation data of this fold
    metrics = trainer.evaluate()
    print(f"Metrics for valid fold {fold+1}: {metrics}")
    # Evaluate on the held-out test data of this fold
    outputs = trainer.predict(test_split)
    print(f"Metrics for test fold {fold+1}: {outputs.metrics}")
    f1_scores.append(outputs.metrics['test_macro_f1'])


In [None]:
# Cross-validation summary: mean of the per-fold test macro-F1 scores.

import numpy as np

# f1_scores is the list filled by the cross-validation loop, one entry per fold.
average_f1 = np.asarray(f1_scores).mean()
print(f"Average F1 score: {average_f1}")