The following resources were consulted while implementing this classification task:


1.   https://huggingface.co/transformers/v3.2.0/custom_datasets.html
2.   https://www.philschmid.de/k-fold-as-cross-validation-with-a-bert-text-classification-example
3.   https://www.crummy.com/software/BeautifulSoup/bs4/doc/




In [None]:
!pip install transformers[torch]

In [None]:
# Mount Google Drive into the Colab runtime at /content/gdrive.
# force_remount=True is passed so the call also works when the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Data scraping using Beautiful Soup

In [None]:
!pip install bs4 lxml

In [None]:
#importing BeautifulSoup
from bs4 import BeautifulSoup as bs

In [None]:
import pandas as pd

In [None]:
# Load the CREG-TUE control-group corpus and parse it as XML.
# The file is read as raw bytes so that Beautiful Soup can work out the
# encoding itself instead of depending on the platform's default text encoding.
with open("/content/creg-tue-control-group.xml", "rb") as file:
    content = file.read()
bs_content = bs(content, "xml")

In [None]:
# Notebook display: echo the parsed document as the cell output.
bs_content

In [None]:
# Parallel accumulator lists filled by the parsing cell below: one entry per
# <diagnosis> element of every <StudentAnswer>.
s_ids = []
# The parsing and DataFrame cells refer to this list as `answer`, so it has to
# exist under that name; `answers` is kept as a second name for the same list.
answer = []
answers = answer
q_ids = []
binary_values = []

In [None]:
# Flatten the XML into the parallel lists: one entry per <diagnosis> element
# found under each <StudentAnswer>.
# NOTE: this appends to lists created in an earlier cell (`s_ids`, `q_ids`,
# `answer`, `binary_values`), so re-running the cell adds the same rows again.
s_answers = bs_content.find_all('StudentAnswer')
for s_ans in s_answers:
    # Values shared by every diagnosis of this answer.
    student_id, question_id = s_ans['student_id'], s_ans['question_id']
    answer_text = s_ans.find('answerText').text

    diagnoses = s_ans.find_all('diagnosis')
    for diagnosis in diagnoses:
        # A diagnosis without a `binary` attribute is recorded as ''.
        binary_value = diagnosis.get('binary', '')

        s_ids.append(student_id)
        q_ids.append(question_id)
        answer.append(answer_text)
        binary_values.append(binary_value)

In [None]:
# Assemble the parallel lists into one table (one row per collected diagnosis).
df = pd.DataFrame(
    dict(
        Student_ID=s_ids,
        Question_ID=q_ids,
        Answer_Text=answer,
        Binary_Value=binary_values,
    )
)

In [None]:
# Notebook display: show the last five rows of the table.
df.tail(5)

In [None]:
# Map a raw label to a class id: "true" -> 1, "false" -> 0, anything else -> 2.
def to_skill(label):
    """Return the integer class for *label*.

    The label is converted to a string, stripped of surrounding whitespace and
    lower-cased before comparison, so ``" True "`` and ``True`` both map to 1.
    Every value other than "true"/"false" maps to 2.
    """
    normalized = str(label).strip().lower()
    return {"true": 1, "false": 0}.get(normalized, 2)

In [None]:
# Store the numeric class in a NEW column `Binary_value` (lower-case "v");
# the raw strings stay untouched in `Binary_Value`.
df['Binary_value'] = df['Binary_Value'].apply(to_skill)

In [None]:
# Notebook display: distinct values of the raw `Binary_Value` column
# (capital "V", i.e. the unmapped strings, not the numeric classes).
df['Binary_Value'].unique()

In [None]:
# Rename the answer text and the numeric class column to `text1` / `labels`.
column_renames = {'Answer_Text': 'text1', 'Binary_value': 'labels'}
df.rename(columns=column_renames, inplace=True)

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Cast the label column to an integer dtype, then display its distinct values.
df['labels'] = df['labels'].astype(int)
df['labels'].unique()

In [None]:
# Notebook display: show the label column.
df['labels']

In [None]:
# Select the rows whose label is 2 (neither "true" nor "false") and drop them.
is_other_label = df['labels'] == 2
indices = df.index[is_other_label]
df = df.drop(index=indices)

print(df)

In [None]:
# Notebook display: distinct label values remaining after the drop above.
df['labels'].unique()

In [None]:
# Notebook display: show the first five rows of the table.
df.head(5)

*   Method: Instance-based method
*   Dataset: CREG-TUE
*   Classifier: Binary


To run instance-based scoring with a different model, replace the model name with one of the following:


1.   FacebookAI/xlm-roberta-large
2.   FacebookAI/xlm-roberta-base
3.   deepset/gelectra-large
4.   deepset/gelectra-base
5.   deepset/gbert-large
6.   deepset/gbert-base

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import DataLoader, Dataset
import torch

In [None]:
# Copy the model input (answer text) and the target (label) out of `df`
# into a compact two-column frame that the k-fold loop indexes into.
texts = list(df['text1'].values)
labels = list(df['labels'].values)

df1 = pd.DataFrame({'texts': texts, 'labels': labels})

class TorchDataSet(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> indexable tensor, one row per example
        # labels: indexable sequence aligned with the encoding rows
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Clone each field so callers never hold a view into the shared encodings.
        item = {}
        for key, values in self.encodings.items():
            item[key] = values[idx].clone()
        item["labels"] = torch.tensor(self.labels[idx]).long()
        return item

# Per-fold F1, accuracy, precision and recall are collected in these lists.
f1s_list, accuracy_list, precision_list, recall_list = [], [], [], []

# 5-fold cross-validation of the instance-based classifier (student answer only).
# MODEL_NAME is the single place to swap in another checkpoint; the tokenizer
# and the model are both loaded from it.
MODEL_NAME = 'FacebookAI/xlm-roberta-large'

# Use the GPU when present; mixed precision is only switched on together with CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The tokenizer and collator do not change between folds, so load them once.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)


def compute_metrics(eval_pred):
    """Return accuracy, F1, precision and recall for one Trainer evaluation pass.

    The predicted class is the arg-max over the output logits.
    """
    labels = eval_pred.label_ids
    preds = np.argmax(eval_pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
    }


for jk in range(1):
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(kfold.split(df1)):
        print(f'{jk+1}x{fold+1}')

        X_train = df1['texts'].values[train_index]
        X_test = df1['texts'].values[val_index]
        y_train = df1['labels'].values[train_index]
        y_test = df1['labels'].values[val_index]

        # A fresh model per fold so no weights leak from one fold into the next.
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, num_labels=2, ignore_mismatched_sizes=True
        ).to(device)

        # The length cap is applied where the text is actually encoded.
        train_inp = tokenizer(list(X_train), padding=True, truncation=True, max_length=512, return_tensors='pt')
        test_inp = tokenizer(list(X_test), padding=True, truncation=True, max_length=512, return_tensors='pt')
        train_dataset = TorchDataSet(train_inp, y_train)
        test_dataset = TorchDataSet(test_inp, y_test)

        # NOTE(review): recent transformers releases rename `evaluation_strategy`
        # to `eval_strategy` - confirm against the installed version.
        training_args = TrainingArguments(
            output_dir="final_bert_trainer",
            evaluation_strategy="epoch",
            save_strategy='epoch',
            logging_strategy='epoch',
            learning_rate=1e-5,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            num_train_epochs=3,
            weight_decay=0.01,
            load_best_model_at_end=True,
            save_total_limit=1,
            logging_dir='./logs',
            warmup_steps=400,
            fp16=(device.type == 'cuda'),
            gradient_accumulation_steps=8
        )

        # NOTE(review): the held-out fold is also the eval set that
        # load_best_model_at_end selects the checkpoint on, so the scores
        # reported below are optimistic.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            tokenizer=tokenizer,
            data_collator=collator,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        #model_save_name = '...'
        #path = F".../{model_save_name}"
        #torch.save(model.state_dict(), path)

        # Evaluate the model on the held-out fold.
        model.eval()

        logits_list = []
        with torch.no_grad():
            for batch in DataLoader(test_dataset, batch_size=2):
                outputs = model(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                )
                logits_list.append(outputs.logits.float().cpu().numpy())

        logits = np.concatenate(logits_list, axis=0)

        # The head emits one logit per class, so the prediction is the arg-max
        # over the logits (same rule as compute_metrics). Thresholding a sigmoid
        # of each logit and taking the arg-max of the resulting 0/1 matrix
        # returns class 0 whenever both logits have the same sign.
        pred_binary = np.argmax(logits, axis=1)
        y_test_binary = y_test.astype(int)

        accuracy = accuracy_score(y_test_binary, pred_binary)
        precision = precision_score(y_test_binary, pred_binary)
        recall = recall_score(y_test_binary, pred_binary)
        f1 = f1_score(y_test_binary, pred_binary)

        print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")
        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1s_list.append(f1)

        # clearing cache
        torch.cuda.empty_cache()

# Average every metric over the folds and report the result.
mean_accuracy, mean_precision, mean_recall, mean_f1s = (
    np.mean(scores)
    for scores in (accuracy_list, precision_list, recall_list, f1s_list)
)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1 Score: {mean_f1s}")

Entailment-based scoring

*   Method: Entailment-based scoring
*   Dataset: CREG-TUE
*   Classifier: Binary


To run entailment-based scoring with a different model, replace the model name with one of the following:


1.   FacebookAI/xlm-roberta-large
2.   FacebookAI/xlm-roberta-base
3.   deepset/gelectra-large
4.   deepset/gelectra-base
5.   deepset/gbert-large
6.   deepset/gbert-base

In [None]:
# Load the CREG-TUE control-group corpus and parse it as XML.
# The file is read as raw bytes so that Beautiful Soup can work out the
# encoding itself instead of depending on the platform's default text encoding.
with open("/content/creg-tue-control-group.xml", "rb") as file:
    content = file.read()
bs_content = bs(content, "xml")

In [None]:
# Notebook display: echo the parsed document as the cell output.
bs_content

In [None]:
# Parallel accumulator lists for the entailment data set; the parsing cell
# below appends one entry to each list per collected diagnosis.
target_answer_ids, question_ids, target_answers = [], [], []
student_ids, student_answers, twoclass_labels = [], [], []

In [None]:
# Walk every <Question> and pair the first <TargetAnswer> found under it with
# each <StudentAnswer> under the same question: one row per <diagnosis>.
q = bs_content.find_all('Question')
for question in q:
    target_node = question.find('TargetAnswer')
    target_ans_id = target_node['id']
    target_ans = target_node.text

    student_answers_data = question.find_all('StudentAnswer')
    for s_ans_data in student_answers_data:
        student_id, question_id = s_ans_data['student_id'], s_ans_data['question_id']
        student_answer = s_ans_data.find('answerText').text

        diagnoses = s_ans_data.find_all('diagnosis')
        for diagnosis in diagnoses:
            # No default is given, so a missing `binary` attribute yields None.
            twoclass_value = diagnosis.get('binary')

            student_ids.append(student_id)
            question_ids.append(question_id)
            student_answers.append(student_answer)
            twoclass_labels.append(twoclass_value)
            target_answer_ids.append(target_ans_id)
            target_answers.append(target_ans)

In [None]:
# Assemble the parallel lists into one table (one row per collected diagnosis).
df = pd.DataFrame(
    dict(
        target_answer_id=target_answer_ids,
        target_answer=target_answers,
        student_id=student_ids,
        Question_ID=question_ids,
        student_answer=student_answers,
        twoclass_labels=twoclass_labels,
    )
)

In [None]:
# Notebook display: show the assembled table.
df

In [None]:
def to_skill(label):
    """Return the integer class for *label*: "true" -> 1, "false" -> 0, else 2.

    The label is converted to a string, stripped of surrounding whitespace and
    lower-cased before comparison, so ``None`` (stringified to "none") maps to 2.
    """
    normalized = str(label).strip().lower()
    return {"true": 1, "false": 0}.get(normalized, 2)

In [None]:
# Replace the raw label strings in place with their numeric class (0, 1 or 2).
df['twoclass_labels'] = df['twoclass_labels'].apply(to_skill)

In [None]:
# Notebook display: distinct numeric classes present after the mapping.
df['twoclass_labels'].unique()

In [None]:
# Rename to `text1` (student answer), `text2` (target answer) and `labels`.
column_renames = {
    'student_answer': 'text1',
    'target_answer': 'text2',
    'twoclass_labels': 'labels',
}
df.rename(columns=column_renames, inplace=True)

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Cast the label column to an integer dtype, then display its distinct values.
df['labels'] = df['labels'].astype(int)
df['labels'].unique()

In [None]:
# Remove the records labelled 2 (neither "true" nor "false").
is_other_label = df['labels'] == 2
indices = df.index[is_other_label]
df = df.drop(index=indices)

print(df)

In [None]:
# Keep only these five columns, in this order (`Question_ID` is not carried over).
column_order = ['text1', 'text2', 'labels', 'student_id', 'target_answer_id']
df = df.reindex(columns=column_order)

In [None]:
# Notebook display: show the first five rows of the table.
df.head(5)

In [None]:
# Remove every newline character from the reference answers (all occurrences,
# not only a leading one), then display the first five rows.
# NOTE(review): newlines are replaced with '' rather than ' ', so two words
# separated only by a line break end up joined - confirm this is intended.
df['text2'] = df['text2'].str.replace('\n', '')
df.head(5)

In [None]:
# Copy the student response, the reference answer and the label out of `df`
# into a compact three-column frame that the k-fold loop indexes into.
texts1 = list(df['text1'].values)
texts2 = list(df['text2'].values)
labels = list(df['labels'].values)

df1 = pd.DataFrame({'texts1': texts1, 'texts2': texts2, 'labels': labels})

class TorchDataSet(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> indexable tensor, one row per example
        # labels: indexable sequence aligned with the encoding rows
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Clone each field so callers never hold a view into the shared encodings.
        item = {}
        for key, values in self.encodings.items():
            item[key] = values[idx].clone()
        item["labels"] = torch.tensor(self.labels[idx]).long()
        return item

# Per-fold F1, accuracy, precision, recall and kappa are collected in these lists.
f1s_list, accuracy_list, precision_list = [], [], []
recall_list, kappa_list = [], []

# 5-fold cross-validation of the entailment-style classifier: each example is
# the pair (student answer, target answer).
# cohen_kappa_score is imported here because `kappa_list` is filled below and
# the notebook's import cell does not provide it.
from sklearn.metrics import cohen_kappa_score

# MODEL_NAME is the single place to swap in another checkpoint; the tokenizer
# and the model are both loaded from it.
MODEL_NAME = 'deepset/gelectra-large'

# Use the GPU when present; mixed precision is only switched on together with CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The tokenizer and collator do not change between folds, so load them once.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)


def compute_metrics(eval_pred):
    """Return accuracy, F1, precision and recall for one Trainer evaluation pass.

    The predicted class is the arg-max over the output logits.
    """
    labels = eval_pred.label_ids
    preds = np.argmax(eval_pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
    }


for jk in range(1):
    # Fixed seed so the folds are reproducible, matching the instance-based run.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(kfold.split(df1)):
        print(f'{jk+1}x{fold+1}')

        X_train = df1[['texts1', 'texts2']].values[train_index]
        X_test = df1[['texts1', 'texts2']].values[val_index]
        y_train = df1['labels'].values[train_index]
        y_test = df1['labels'].values[val_index]

        # A fresh model per fold so no weights leak from one fold into the next.
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, num_labels=2, ignore_mismatched_sizes=True
        ).to(device)

        # Encode the two texts as a sentence pair and let the tokenizer insert
        # its own separator. A literal "[SEP]" typed into the string is only a
        # separator for BERT-style vocabularies, not for the XLM-R checkpoints
        # this notebook is meant to be re-run with.
        train_inp = tokenizer(
            list(X_train[:, 0]), list(X_train[:, 1]),
            padding=True, truncation=True, max_length=512, return_tensors='pt'
        )
        test_inp = tokenizer(
            list(X_test[:, 0]), list(X_test[:, 1]),
            padding=True, truncation=True, max_length=512, return_tensors='pt'
        )
        train_dataset = TorchDataSet(train_inp, y_train)
        test_dataset = TorchDataSet(test_inp, y_test)

        # NOTE(review): recent transformers releases rename `evaluation_strategy`
        # to `eval_strategy` - confirm against the installed version.
        training_args = TrainingArguments(
            output_dir="final_bert_trainer",
            evaluation_strategy="epoch",
            save_strategy='epoch',
            logging_strategy='epoch',
            learning_rate=1e-5,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            num_train_epochs=3,
            weight_decay=0.01,
            load_best_model_at_end=True,
            save_total_limit=1,
            logging_dir='./logs',
            warmup_steps=400,
            fp16=(device.type == 'cuda'),
            gradient_accumulation_steps=8
        )

        # NOTE(review): the held-out fold is also the eval set that
        # load_best_model_at_end selects the checkpoint on, so the scores
        # reported below are optimistic.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            tokenizer=tokenizer,
            data_collator=collator,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        model.eval()

        predictions = trainer.predict(test_dataset=test_dataset)
        pred_labels = np.argmax(predictions.predictions, axis=1)
        true_labels = predictions.label_ids

        accuracy = accuracy_score(true_labels, pred_labels)
        precision = precision_score(true_labels, pred_labels)
        recall = recall_score(true_labels, pred_labels)
        f1 = f1_score(true_labels, pred_labels)
        # Computed here so that the append to kappa_list below has a value.
        kappa = cohen_kappa_score(true_labels, pred_labels)

        print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1s_list.append(f1)
        kappa_list.append(kappa)

        # Release cached GPU memory before the next fold loads a new model.
        torch.cuda.empty_cache()

# Calculating the average of every evaluation metric over all folds
mean_accuracy = np.mean(accuracy_list)
mean_precision = np.mean(precision_list)
mean_recall = np.mean(recall_list)
mean_f1s = np.mean(f1s_list)
mean_kappa = np.mean(kappa_list)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1 Score: {mean_f1s}")
print(f"Mean Cohen's Kappa: {mean_kappa}")
