The following resources were consulted while implementing this classification task:


1.   https://huggingface.co/transformers/v3.2.0/custom_datasets.html
2.   https://www.philschmid.de/k-fold-as-cross-validation-with-a-bert-text-classification-example
3.   https://www.crummy.com/software/BeautifulSoup/bs4/doc/



In [None]:
!pip install transformers[torch]

In [None]:
!pip install bs4 lxml

In [None]:
#importing BeautifulSoup
from bs4 import BeautifulSoup as bs

In [None]:
import pandas as pd

In [None]:
# Load the CREG-TUE control-group XML file and parse it with BeautifulSoup's
# XML parser.
with open("/content/creg-tue-control-group.xml", "r") as file:
    # read() already returns the whole document as a single string, so the
    # lines do not have to be collected and joined again.
    content = file.read()
bs_content = bs(content, "xml")

In [None]:
bs_content

In [None]:
# Parallel column buffers: five independent, initially empty lists that the
# extraction loop fills with one entry per row.
(
    target_answer_ids,
    target_answers,
    student_ids,
    student_answers,
    fourclass_labels,
) = ([] for _ in range(5))

In [None]:
# Walk every <Question> element of the parsed XML and emit one row per
# (student answer, diagnosis) pair; the question's target answer is repeated
# on each of those rows.
questions = bs_content.find_all('Question')
for question in questions:
    # Only the first <TargetAnswer> found inside the question is used.
    target_node = question.find('TargetAnswer')
    target_answer_id = target_node['id']
    target_answer = target_node.text

    student_answers_data = question.find_all('StudentAnswer')
    for student_answer_data in student_answers_data:
        student_id = student_answer_data['student_id']
        student_answer = student_answer_data.find('answerText').text
        diagnoses = student_answer_data.find_all('diagnosis')
        for diagnosis in diagnoses:
            # A diagnosis without a 'detailed' attribute produces an empty label.
            fourclass_value = diagnosis.get('detailed', '')

            student_ids.append(student_id)
            student_answers.append(student_answer)
            fourclass_labels.append(fourclass_value)
            target_answer_ids.append(target_answer_id)
            target_answers.append(target_answer)

In [None]:
# Create a DataFrame
df = pd.DataFrame({
    'target_answer_id': target_answer_ids,
    'target_answer': target_answers,
    'student_id': student_ids,
    'student_answer': student_answers,
    'fourclass_labels': fourclass_labels
})

In [None]:
df

In [None]:
def to_skill(label):
    """Map a detailed diagnosis label to its integer class id.

    EXTRA_CONCEPT -> 3, CORRECT -> 2, BLEND -> 1, MISSING_CONCEPT -> 0.
    Any other value maps to the sentinel 5.
    """
    # Checked in order with ==, exactly one comparison per known label.
    known_labels = (
        ("EXTRA_CONCEPT", 3),
        ("CORRECT", 2),
        ("BLEND", 1),
        ("MISSING_CONCEPT", 0),
    )
    for name, class_id in known_labels:
        if label == name:
            return class_id
    return 5

In [None]:
# Replace each diagnosis string with its integer class id (to_skill returns 5
# for any label it does not recognise).
df['fourclass_labels'] = df['fourclass_labels'].apply(to_skill)

In [None]:
df['fourclass_labels'].unique()

In [None]:
# Shorter column names for training: text1 = student answer,
# text2 = target answer, labels = integer class id.
df.rename(
    columns={
        'student_answer': 'text1',
        'target_answer': 'text2',
        'fourclass_labels': 'labels',
    },
    inplace=True,
)

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# Store the labels as integers, then show the distinct values (notebook
# output) to check which classes are present.
df['labels'] = df['labels'].astype(int)
df['labels'].unique()

In [None]:
# Drop the rows (not columns) whose label is 5, i.e. the sentinel that
# to_skill assigns to labels it does not recognise.
indices = df.index[df['labels'] == 5]
df = df.drop(index=indices)

print(df)

In [None]:
# Reorder the columns: the two texts and the label first, identifiers last.
df = df.reindex(columns=['text1','text2','labels','student_id','target_answer_id'])

In [None]:
df['labels'].unique()

In [None]:
# Remove line breaks from the target answers (text2). The newline is deleted,
# not replaced by a space, so words separated only by a line break are joined.
# NOTE(review): text1 (student answers) is not cleaned the same way - confirm
# that this is intended.
df['text2'] = df['text2'].str.replace('\n', '')
df.head(5)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import AdamW, get_linear_schedule_with_warmup

*   Method: Entailment-based scoring
*   Dataset: CREG-TUE
*   Classifier: 4-class


To run entailment-based scoring with a different model, replace the model name in the code with one of the following:


1.   FacebookAI/xlm-roberta-large
2.   FacebookAI/xlm-roberta-base
3.   deepset/gelectra-large
4.   deepset/gelectra-base
5.   deepset/gbert-large
6.   deepset/gbert-base

In [None]:
# Copy the two text columns and the labels into plain lists and rebuild a
# compact frame with a fresh 0..n-1 index for the K-fold split.
texts1 = list(df['text1'].to_numpy())
texts2 = list(df['text2'].to_numpy())
labels = list(df['labels'].to_numpy())

df1 = pd.DataFrame({'texts1': texts1, 'texts2': texts2, 'labels': labels})

class TorchDataSet(torch.utils.data.Dataset):
    """Dataset pairing pre-tokenized encodings with their labels.

    `encodings` is a mapping of field name -> indexable batch of tensors and
    `labels` is an indexable sequence of class ids of the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Clone every field so callers never share storage with the batch.
        item = {}
        for field, values in self.encodings.items():
            item[field] = values[idx].clone()
        item["labels"] = torch.tensor(self.labels[idx]).long()
        return item

# Entailment-based scoring: fine-tune a sequence-pair classifier on
# (student answer, target answer) pairs with 5-fold cross-validation.

# Checkpoint that is fine-tuned; change it here to run the same experiment
# with another model.
MODEL_NAME = 'FacebookAI/xlm-roberta-large'

# Per-fold F1 score, accuracy, precision and recall; one entry is appended
# for every fold.
f1s_list = []
accuracy_list = []
precision_list = []
recall_list = []


def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 / precision / recall for the Trainer.

    `eval_pred.predictions` holds the logits and `eval_pred.label_ids` the
    gold labels; the predicted class is the argmax over axis 1.
    """
    labels = eval_pred.label_ids
    preds = np.argmax(eval_pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average='weighted', zero_division=0),
        "precision": precision_score(labels, preds, average='weighted', zero_division=0),
        "recall": recall_score(labels, preds, average='weighted', zero_division=0),
    }


# The tokenizer is identical for every fold, so it is loaded once. Truncation,
# padding and max_length are options of the tokenizer call below, not of
# from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

for jk in range(1):
    # Seeded so the fold assignment is reproducible; the repetition index is
    # added so that additional repetitions would not reuse the same split.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42 + jk)
    # enumerate keeps the fold counter in step with the loop (it was never
    # incremented before, so every fold was reported as "1x1").
    for l, (train_index, val_index) in enumerate(kfold.split(df1)):
        print(f'{jk+1}x{l+1}')

        X_train = df1[['texts1', 'texts2']].values[train_index]
        X_test = df1[['texts1', 'texts2']].values[val_index]
        y_train = df1['labels'].values[train_index]
        y_test = df1['labels'].values[val_index]

        # A fresh model per fold so no fine-tuned weights carry over.
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4, ignore_mismatched_sizes=True).to(torch.device('cuda'))
        collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

        # Encode the two texts as a real sentence pair so the tokenizer inserts
        # the separator token of the chosen model. A literal "[SEP]" pasted
        # into the text is only a separator for BERT-style vocabularies;
        # XLM-RoBERTa uses "</s>" and would treat "[SEP]" as ordinary text.
        train_inp = tokenizer(
            [str(text) for text in X_train[:, 0]],
            [str(text) for text in X_train[:, 1]],
            padding=True, truncation=True, max_length=512, return_tensors='pt',
        )
        test_inp = tokenizer(
            [str(text) for text in X_test[:, 0]],
            [str(text) for text in X_test[:, 1]],
            padding=True, truncation=True, max_length=512, return_tensors='pt',
        )
        train_dataset = TorchDataSet(train_inp, y_train)
        test_dataset = TorchDataSet(test_inp, y_test)

        # NOTE(review): newer transformers releases appear to rename
        # `evaluation_strategy` to `eval_strategy` - confirm against the
        # installed version.
        # NOTE(review): load_best_model_at_end picks the checkpoint on the same
        # fold that is reported as test data below, so the reported scores are
        # likely optimistic - consider a separate validation split.
        training_args = TrainingArguments(
            output_dir="final_bert_trainer",
            evaluation_strategy="epoch",
            save_strategy='epoch',
            logging_strategy='epoch',
            learning_rate=1e-5,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            num_train_epochs=3,
            weight_decay=0.01,
            load_best_model_at_end=True,
            save_total_limit=1,
            logging_dir='./logs',
            warmup_steps=400,
            fp16=True,
            gradient_accumulation_steps=8
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            tokenizer=tokenizer,
            data_collator=collator,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        # Score the held-out fold.
        predictions = trainer.predict(test_dataset=test_dataset)
        pred_labels = np.argmax(predictions.predictions, axis=1)
        true_labels = predictions.label_ids
        accuracy = accuracy_score(true_labels, pred_labels)
        # zero_division=0 matches compute_metrics: a class that is never
        # predicted contributes 0 without raising a warning.
        precision = precision_score(true_labels, pred_labels, average='weighted', zero_division=0)
        recall = recall_score(true_labels, pred_labels, average='weighted', zero_division=0)
        f1 = f1_score(true_labels, pred_labels, average='weighted', zero_division=0)

        print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1s_list.append(f1)

# Average every evaluation metric over all folds.
mean_accuracy = np.mean(accuracy_list)
mean_precision = np.mean(precision_list)
mean_recall = np.mean(recall_list)
mean_f1s = np.mean(f1s_list)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1 Score: {mean_f1s}")

*   Method: Instance-based scoring
*   Dataset: CREG-TUE
*   Classifier: Binary


To run instance-based scoring with a different model, replace the model name in the code with one of the following:


1.   FacebookAI/xlm-roberta-large
2.   FacebookAI/xlm-roberta-base
3.   deepset/gelectra-large
4.   deepset/gelectra-base
5.   deepset/gbert-large
6.   deepset/gbert-base

In [None]:
# Instance-based input: only the student answers (text1) and their labels,
# rebuilt into a compact frame with a fresh 0..n-1 index for the K-fold split.
texts = list(df['text1'].to_numpy())
labels = list(df['labels'].to_numpy())

df1 = pd.DataFrame({'texts': texts, 'labels': labels})

class TorchDataSet(torch.utils.data.Dataset):
    """Dataset pairing pre-tokenized encodings with their labels.

    `encodings` is a mapping of field name -> indexable batch of tensors and
    `labels` is an indexable sequence of class ids of the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Clone every field so callers never share storage with the batch.
        item = {}
        for field, values in self.encodings.items():
            item[field] = values[idx].clone()
        item["labels"] = torch.tensor(self.labels[idx]).long()
        return item

# Instance-based scoring: fine-tune a classifier on the student answers alone
# with 5-fold cross-validation.
# NOTE(review): the section heading says "Classifier: Binary", but the model is
# built with num_labels=4 and trained on the 4-class labels - confirm which
# one is intended.

# Checkpoint that is fine-tuned; change it here to run the same experiment
# with another model.
MODEL_NAME = 'FacebookAI/xlm-roberta-large'

# Per-fold scores; one entry is appended for every fold. kappa_list has to be
# created here as well, otherwise the append inside the loop raises NameError.
f1s_list = []
accuracy_list = []
precision_list = []
recall_list = []
kappa_list = []


def compute_metrics(eval_pred):
    """Return accuracy, weighted F1 / precision / recall and quadratic kappa.

    `eval_pred.predictions` holds the logits and `eval_pred.label_ids` the
    gold labels; the predicted class is the argmax over axis 1.
    """
    labels = eval_pred.label_ids
    preds = np.argmax(eval_pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average='weighted', zero_division=0),
        "precision": precision_score(labels, preds, average='weighted', zero_division=0),
        "recall": recall_score(labels, preds, average='weighted', zero_division=0),
        # Computed from this evaluation's predictions; the previous code read a
        # global `kappa` that was undefined during the first evaluation and
        # stale afterwards.
        "quadratic_kappa": cohen_kappa_score(labels, preds, weights='quadratic'),
    }


# The tokenizer is identical for every fold, so it is loaded once. Truncation,
# padding and max_length are options of the tokenizer call below, not of
# from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

for jk in range(1):
    # Seeded so the fold assignment is reproducible; the repetition index is
    # added so that additional repetitions would not reuse the same split.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42 + jk)
    for l, (train_index, val_index) in enumerate(kfold.split(df1)):
        print(f'{jk+1}x{l+1}')

        X_train = df1['texts'].values[train_index]
        X_test = df1['texts'].values[val_index]
        y_train = df1['labels'].values[train_index]
        y_test = df1['labels'].values[val_index]

        # A fresh model per fold so no fine-tuned weights carry over.
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4, ignore_mismatched_sizes=True).to(torch.device('cuda'))
        collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

        train_inp = tokenizer(list(X_train), padding=True, truncation=True, max_length=512, return_tensors='pt')
        test_inp = tokenizer(list(X_test), padding=True, truncation=True, max_length=512, return_tensors='pt')
        train = TorchDataSet(train_inp, y_train)
        test = TorchDataSet(test_inp, y_test)

        # NOTE(review): newer transformers releases appear to rename
        # `evaluation_strategy` to `eval_strategy` - confirm against the
        # installed version.
        # NOTE(review): load_best_model_at_end picks the checkpoint on the same
        # fold that is reported as test data below, so the reported scores are
        # likely optimistic - consider a separate validation split.
        training_args = TrainingArguments(
            output_dir="final_bert_trainer",
            evaluation_strategy="epoch",
            save_strategy='epoch',
            logging_strategy='epoch',
            learning_rate=1e-5,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            num_train_epochs=3,
            weight_decay=0.01,
            load_best_model_at_end=True,
            save_total_limit=1,
            logging_dir='./logs',
            warmup_steps=400,
            fp16=True,
            gradient_accumulation_steps=8
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train,
            eval_dataset=test,
            tokenizer=tokenizer,
            data_collator=collator,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        # Score the held-out fold.
        predictions = trainer.predict(test_dataset=test)

        pred_labels = np.argmax(predictions.predictions, axis=1)
        true_labels = predictions.label_ids

        # Same metric settings as compute_metrics.
        accuracy = accuracy_score(true_labels, pred_labels)
        precision = precision_score(true_labels, pred_labels, average='weighted', zero_division=0)
        recall = recall_score(true_labels, pred_labels, average='weighted', zero_division=0)
        f1 = f1_score(true_labels, pred_labels, average='weighted', zero_division=0)
        kappa = cohen_kappa_score(true_labels, pred_labels, weights='quadratic')

        print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}, Quadratic Kappa: {kappa}")

        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1s_list.append(f1)
        kappa_list.append(kappa)

# Average every evaluation metric over all folds.
mean_accuracy = np.mean(accuracy_list)
mean_precision = np.mean(precision_list)
mean_recall = np.mean(recall_list)
mean_f1s = np.mean(f1s_list)
mean_kappa = np.mean(kappa_list)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1 Score: {mean_f1s}")
print(f"Mean Quadratic Kappa: {mean_kappa}")