The following resources were consulted when implementing this classification task:


1.   https://huggingface.co/transformers/v3.2.0/custom_datasets.html
2.   https://www.philschmid.de/k-fold-as-cross-validation-with-a-bert-text-classification-example

In [None]:
!pip install transformers[torch]

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# Read the skill annotations (with question ids) from the Excel sheet and
# preview the first five rows.
df = pd.read_excel('/content/Skill_with_question_id.xlsx')
df.head()

In [None]:
# Map the placeholder entry "-" to the string "2". Assign the result back to
# the column: calling replace(..., inplace=True) on `df[col]` works on a
# temporary object and is silently ignored under pandas copy-on-write.
df['Planning_Investigations'] = df['Planning_Investigations'].replace('-', '2')
# Drop the rows whose label is missing.
df = df[df['Planning_Investigations'].notnull()]
# Keep only the rows whose label is a numeric string (this filters rows; the
# values themselves are not converted here).
# NOTE(review): `.str.isnumeric()` assumes every remaining label is a string - confirm.
df = df[df['Planning_Investigations'].str.isnumeric()]
df.head(5)

In [None]:
# Normalise a raw label to the binary skill value.
def to_skill(label):
    """Return 1 or 0 when `label` converts to that integer, otherwise None."""
    value = int(label)
    return value if value in (0, 1) else None

# Convert every raw label with to_skill; labels other than 0/1 come back as
# missing values.
df['Planning_Investigations'] = df['Planning_Investigations'].apply(to_skill)
# Remove every row that contains a missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# Give the columns used for modelling generic names:
# student answer -> text1, reference solution -> text2, skill label -> labels.
df = df.rename(
    columns={
        'Answer': 'text1',
        'Solution': 'text2',
        'Planning_Investigations': 'labels',
    }
)
df.head(5)

In [None]:
# Keep only the columns listed here, in this order, for easier use.
df = df.reindex(
    columns=[
        'text1',
        'text2',
        'labels',
        'Student',
        'question_id',
        'Constructing_Explanations',
        'Analyzing_Data',
    ]
)
df.head(5)

*   Method: Instance-based
*   Skill: Planning_Investigations
*   Dataset: AFLEK
*   Models with prediction saved


To run instance-based scoring with a different model, replace the model name with one of the following:


1.   FacebookAI/xlm-roberta-large
2.   FacebookAI/xlm-roberta-base
3.   deepset/gelectra-large
4.   deepset/gelectra-base
5.   deepset/gbert-large
6.   deepset/gbert-base

In [None]:
# Collect the columns needed for instance-based scoring (answer text only).
texts = list(df['text1'].values)
labels = list(df['labels'].values)
students = list(df['Student'].values)
# The reference solutions live in 'text2': the 'Solution' column was renamed
# above, so df['Solution'] would raise a KeyError here.
sols = list(df['text2'].values)
q_ids = list(df['question_id'].values)

# Frame that the cross-validation loop splits; the metadata columns are only
# carried along so predictions can be traced back to student and question.
df1 = pd.DataFrame.from_dict({'texts': texts, 'labels': labels, 'students': students, 'solutions': sols, 'question_ids': q_ids})

class TorchDataSet(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: mapping from field name (e.g. ``input_ids``) to an indexable
            collection with one entry per example (tensors or nested lists).
        labels: indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoding fields of example ``idx`` plus a long ``labels`` tensor."""
        item = {}
        for key, values in self.encodings.items():
            value = values[idx]
            if isinstance(value, torch.Tensor):
                # Copy existing tensors with clone(); wrapping a tensor in
                # torch.tensor() triggers a copy-construct UserWarning.
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        item["labels"] = torch.tensor(self.labels[idx]).long()
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)
# Per-fold accuracy, precision, recall and F1 score.
f1s_list = []
acc_list = []
precision_list = []
recall_list = []

# Out-of-fold predictions and ground truths, together with the metadata needed
# to trace each prediction back to its student, question and texts.
all_predictions = []
all_ground_truths = []
all_students = []
all_solutions = []
all_answers = []
all_question_ids = []

# Checkpoint to fine-tune; change it here to run one of the other listed models.
model_name = 'deepset/gbert-base'

# Mixed precision needs a GPU, so fall back to CPU / fp32 when CUDA is absent.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# The tokenizer and the padding collator do not depend on the fold, so they
# are created once. Truncation settings are passed where the texts are
# tokenized; as arguments of from_pretrained they have no effect on encoding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)


def compute_metrics(eval_pred):
    """Return accuracy, F1, precision and recall for one Trainer evaluation pass."""
    true_labels = eval_pred.label_ids
    preds = np.argmax(eval_pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(true_labels, preds),
        "f1": f1_score(true_labels, preds),
        "precision": precision_score(true_labels, preds),
        "recall": recall_score(true_labels, preds)
    }


for jk in range(1):
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for l, (train_index, val_index) in enumerate(kfold.split(df1)):
        print(f'{jk+1}x{l+1}')

        X_train = df1['texts'].values[train_index]
        X_test = df1['texts'].values[val_index]
        y_train = df1['labels'].values[train_index]
        y_test = df1['labels'].values[val_index]
        student_ids_test = df1['students'].values[val_index]
        solutions_test = df1['solutions'].values[val_index]
        answers_test = df1['texts'].values[val_index]
        question_ids_test = df1['question_ids'].values[val_index]

        # A fresh model per fold so no weights leak from one fold to the next.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True).to(device)

        train_inp = tokenizer(list(X_train), padding=True, truncation=True, max_length=512, return_tensors='pt')
        test_inp = tokenizer(list(X_test), padding=True, truncation=True, max_length=512, return_tensors='pt')
        train_dataset = TorchDataSet(train_inp, y_train)
        test_dataset = TorchDataSet(test_inp, y_test)

        # NOTE(review): the held-out fold is also the eval_dataset, so
        # load_best_model_at_end selects the checkpoint on the same data that
        # is scored below; the reported numbers may be optimistic.
        # NOTE(review): recent transformers releases rename
        # `evaluation_strategy` to `eval_strategy` - confirm against the
        # installed version.
        training_args = TrainingArguments(
            output_dir="final_bert_trainer",
            evaluation_strategy="epoch",
            save_strategy='epoch',
            logging_strategy='epoch',
            learning_rate=1e-5,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            num_train_epochs=3,
            weight_decay=0.01,
            load_best_model_at_end=True,
            save_total_limit=1,
            logging_dir='./logs',
            warmup_steps=400,
            fp16=use_cuda,
            gradient_accumulation_steps=8
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            tokenizer=tokenizer,
            data_collator=collator,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        #model_save_name = '..'
        #path = F"../{model_save_name}"
        #torch.save(model.state_dict(), path)

        model.eval()

        # Collect the logits for the held-out fold. Labels are not passed to
        # the model (only the logits are needed), and every encoding field the
        # tokenizer produced is forwarded, exactly as during training.
        logits_list = []
        with torch.no_grad():
            for batch in DataLoader(test_dataset, batch_size=2):
                model_inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
                outputs = model(**model_inputs)
                logits_list.append(outputs.logits.float().cpu().numpy())

        logits = np.concatenate(logits_list, axis=0)

        # Predicted class = index of the larger logit, the same rule
        # compute_metrics uses. Thresholding a per-logit sigmoid at 0.5 and
        # then taking argmax returns class 0 whenever both logits have the
        # same sign, regardless of which one is larger.
        pred_binary = np.argmax(logits, axis=1)
        y_test_binary = (y_test >= 0.5).astype(int)

        # Save the predictions along with the other details.
        all_predictions.extend(pred_binary)
        all_ground_truths.extend(y_test_binary)
        all_students.extend(student_ids_test)
        all_solutions.extend(solutions_test)
        all_answers.extend(answers_test)
        all_question_ids.extend(question_ids_test)

        accuracy = accuracy_score(y_test_binary, pred_binary)
        precision = precision_score(y_test_binary, pred_binary)
        recall = recall_score(y_test_binary, pred_binary)
        f1 = f1_score(y_test_binary, pred_binary)

        print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")
        acc_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1s_list.append(f1)

        # Drop the references to this fold's model before clearing the cache;
        # otherwise its GPU memory cannot be released.
        del trainer, model
        torch.cuda.empty_cache()

# Average every evaluation metric over all folds.
mean_acc = np.mean(acc_list)
mean_precision = np.mean(precision_list)
mean_recall = np.mean(recall_list)
mean_f1s = np.mean(f1s_list)


print(f"Mean Accuracy: {mean_acc}")
print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1 Score: {mean_f1s}")

# Gather the out-of-fold predictions in one frame; uncomment the lines below
# to write them to an output file.
output_df = pd.DataFrame({
    'student_id': all_students,
    'question_id': all_question_ids,
    'Solution': all_solutions,
    'Answer': all_answers,
    'Actual Label': all_ground_truths,
    'Predicted Label': all_predictions
})
#output_df.to_excel('...xlsx', index=False)

#print("Predictions and ground truths have been saved to 'predictions_ground_truths.xlsx'.")

*   Method: Entailment-based
*   Skill: Planning Investigations
*   Dataset: AFLEK
*   Models with prediction saved


To run entailment-based scoring with a different model, replace the model name with one of the following:


1.   FacebookAI/xlm-roberta-large
2.   FacebookAI/xlm-roberta-base
3.   deepset/gelectra-large
4.   deepset/gelectra-base
5.   deepset/gbert-large
6.   deepset/gbert-base

In [None]:
# Collect the columns needed for entailment-based scoring: the student answer
# (text1) paired with the reference solution (text2).
texts1 = list(df['text1'].values)
texts2 = list(df['text2'].values)
labels = list(df['labels'].values)
students = list(df['Student'].values)
sols = list(df['text2'].values)
q_ids = list(df['question_id'].values)

# Frame that the cross-validation loop splits.
df1 = pd.DataFrame(
    {
        'texts1': texts1,
        'texts2': texts2,
        'labels': labels,
        'students': students,
        'solutions': sols,
        'question_ids': q_ids,
    }
)

class TorchDataSet(torch.utils.data.Dataset):
    """Map-style dataset that serves tokenizer encodings together with labels.

    `encodings` maps each field name to a tensor with one row per example;
    `labels` holds one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return cloned encoding rows for example `idx` plus its label as a long tensor."""
        sample = {}
        for name, tensor in self.encodings.items():
            sample[name] = tensor[idx].clone()
        sample["labels"] = torch.tensor(self.labels[idx]).long()
        return sample

# Per-fold accuracy, precision, recall and F1 score.
f1s_list = []
accuracy_list = []
precision_list = []
recall_list = []

# Out-of-fold predictions and ground truths, together with the metadata needed
# to trace each prediction back to its student, question and texts. The names
# are lower-case because the loop below extends `full_predictions` and
# `full_ground_truths`; capitalised definitions would raise a NameError there.
full_predictions = []
full_ground_truths = []
all_students = []
all_solutions = []
all_answers = []
all_question_ids = []

# Checkpoint to fine-tune; change it here to run one of the other listed models.
model_name = 'FacebookAI/xlm-roberta-large'

# Mixed precision needs a GPU, so fall back to CPU / fp32 when CUDA is absent.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# The tokenizer and the padding collator do not depend on the fold, so they
# are created once. Truncation settings are passed where the texts are
# tokenized; as arguments of from_pretrained they have no effect on encoding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)


def compute_metrics(eval_pred):
    """Return accuracy, F1, precision and recall for one Trainer evaluation pass."""
    true_labels = eval_pred.label_ids
    preds = np.argmax(eval_pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(true_labels, preds),
        "f1": f1_score(true_labels, preds),
        "precision": precision_score(true_labels, preds),
        "recall": recall_score(true_labels, preds)
    }


for jk in range(1):
    # Fixed seed so the folds are reproducible and identical to the ones used
    # for instance-based scoring.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for l, (train_index, val_index) in enumerate(kfold.split(df1)):
        print(f'{jk+1}x{l+1}')

        y_train = df1['labels'].values[train_index]
        y_test = df1['labels'].values[val_index]
        student_ids_test = df1['students'].values[val_index]
        solutions_test = df1['solutions'].values[val_index]
        answers_test = df1['texts1'].values[val_index]
        question_ids_test = df1['question_ids'].values[val_index]

        # Answer / solution pairs. They are handed to the tokenizer as two
        # sequences so that it inserts the separator tokens of the chosen
        # model; a literal "[SEP]" string is not a separator for every model
        # in the list (e.g. XLM-RoBERTa).
        train_answers = [str(text) for text in df1['texts1'].values[train_index]]
        train_solutions = [str(text) for text in df1['texts2'].values[train_index]]
        test_answers = [str(text) for text in df1['texts1'].values[val_index]]
        test_solutions = [str(text) for text in df1['texts2'].values[val_index]]

        # A fresh model per fold so no weights leak from one fold to the next.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True).to(device)

        train_inp = tokenizer(train_answers, train_solutions, padding=True, truncation=True, max_length=512, return_tensors='pt')
        test_inp = tokenizer(test_answers, test_solutions, padding=True, truncation=True, max_length=512, return_tensors='pt')
        train_dataset = TorchDataSet(train_inp, y_train)
        test_dataset = TorchDataSet(test_inp, y_test)

        # NOTE(review): the held-out fold is also the eval_dataset, so
        # load_best_model_at_end selects the checkpoint on the same data that
        # is scored below; the reported numbers may be optimistic.
        # NOTE(review): recent transformers releases rename
        # `evaluation_strategy` to `eval_strategy` - confirm against the
        # installed version.
        training_args = TrainingArguments(
            output_dir="final_bert_trainer",
            evaluation_strategy="epoch",
            save_strategy='epoch',
            logging_strategy='epoch',
            learning_rate=1e-5,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            num_train_epochs=3,
            weight_decay=0.01,
            load_best_model_at_end=True,
            save_total_limit=1,
            logging_dir='./logs',
            warmup_steps=400,
            fp16=use_cuda,
            gradient_accumulation_steps=8
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            tokenizer=tokenizer,
            data_collator=collator,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        #model_save_name = '...'
        #path = F".../{model_save_name}"
        #torch.save(model.state_dict(), path)

        model.eval()

        # Collect the logits for the held-out fold. Labels are not passed to
        # the model (only the logits are needed), and every encoding field the
        # tokenizer produced (including segment ids, when present) is
        # forwarded, exactly as during training.
        logits_list = []
        with torch.no_grad():
            for batch in DataLoader(test_dataset, batch_size=2):
                model_inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
                outputs = model(**model_inputs)
                logits_list.append(outputs.logits.float().cpu().numpy())
        logits = np.concatenate(logits_list, axis=0)

        # Predicted class = index of the larger logit, the same rule
        # compute_metrics uses. Thresholding a per-logit sigmoid at 0.5 and
        # then taking argmax returns class 0 whenever both logits have the
        # same sign, regardless of which one is larger.
        pred_binary = np.argmax(logits, axis=1)
        y_test_binary = (y_test >= 0.5).astype(int)

        # Save the predictions along with the other details.
        full_predictions.extend(pred_binary)
        full_ground_truths.extend(y_test_binary)
        all_students.extend(student_ids_test)
        all_solutions.extend(solutions_test)
        all_answers.extend(answers_test)
        all_question_ids.extend(question_ids_test)

        accuracy = accuracy_score(y_test_binary, pred_binary)
        precision = precision_score(y_test_binary, pred_binary)
        recall = recall_score(y_test_binary, pred_binary)
        f1 = f1_score(y_test_binary, pred_binary)

        print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")
        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1s_list.append(f1)

        # Drop the references to this fold's model before clearing the cache;
        # otherwise its GPU memory cannot be released.
        del trainer, model
        torch.cuda.empty_cache()

# Average every evaluation metric over all folds.
mean_accuracy = np.mean(accuracy_list)
mean_precision = np.mean(precision_list)
mean_recall = np.mean(recall_list)
mean_f1s = np.mean(f1s_list)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1 Score: {mean_f1s}")

# Gather the out-of-fold predictions in one frame; uncomment the lines below
# to write them to an output file.
output_df = pd.DataFrame({
    'student_id': all_students,
    'question_id': all_question_ids,
    'Solution': all_solutions,
    'Answer': all_answers,
    'Actual Label': full_ground_truths,
    'Predicted Label': full_predictions
})
#output_df.to_excel('...', index=False)

# The confirmation message stays disabled while the export above is commented
# out, so the notebook never reports a file that was not written.
#print("Predictions and ground truths have been saved to 'predictions_ground_truths.xlsx'.")