# Finely-Tuned GPT-2 Model - Automatic Grading (Dataset 2)

Omar Ebrahim - 202000443

1. Importing the libraries

In [1]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model
from torch.utils.data import DataLoader, Dataset
import numpy as np

from peft import LoraConfig, TaskType, get_peft_model

from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr 
from sklearn.metrics import mean_absolute_error

  from .autonotebook import tqdm as notebook_tqdm


2. Importing the dataset and defining the GPT2Tokenizer used for tokenization and the original GPTModel used for embedding

In [2]:
df = pd.read_csv("C:/Users/Omar/OneDrive/Desktop/my_dataset.csv")

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

3. Creating a CustomDataset which replaces the sample and student answers with their tokenized versions, and adds from the original dataset the original question and scores.

In [3]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        question = str(self.data.loc[index, 'Question'])
        correct_code = str(self.data.loc[index, 'Correct_Code'])
        code_with_error = str(self.data.loc[index, 'Code_with_Error'])
        total_marks = self.data.loc[index, 'Total_Marks']
        
        inputs = self.tokenizer(correct_code,
                                max_length=self.max_length, 
                                padding='max_length', 
                                truncation=True, 
                                return_tensors='pt')
        
        inputs2 = self.tokenizer(code_with_error,
                                 max_length=self.max_length, 
                                 padding='max_length', 
                                 truncation=True, 
                                 return_tensors='pt')
        
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'input_ids2': inputs2['input_ids'].flatten(),
            'total_marks': torch.tensor(total_marks, dtype=torch.float)
        }

dataset = CustomDataset(df, tokenizer)

4. Creating the LoRA configuration with their default values, and then adding it to the BERTModel for the purpose of reducing trainable parameters

In [4]:
lora_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,  # Task type for which the model will undergo fine-tuning
    r=1,  # Dimensions of A and B
    lora_alpha=1,  # Scaling factor determining the relative significance of weights in A and B
    lora_dropout=0.1  # Dropout probability for LoRA
)

In [5]:
model = get_peft_model(model, lora_config)
model.eval()



PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): lora.Linear(
              (base_layer): Conv1D()
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.1, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=1, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=1, out_features=2304, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (lora_embedding_B): ParameterDict()
            )
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace

5. End of Sequence condition added for the tokenized results such that they are able to be used for embedding

In [6]:
tokenizer.pad_token = tokenizer.eos_token

6. Generating the word embeddings for sample answers and the student answers. 

In [7]:
GPT2embeddings1 = []
GPT2embeddings2 = []

with torch.no_grad():
    for example in DataLoader(dataset, batch_size=1):
        input_ids = example['input_ids']
        outputs = model(input_ids=input_ids)
        embeddings = outputs.last_hidden_state
        GPT2embeddings1.append(embeddings.squeeze().detach().numpy())

    for example in DataLoader(dataset, batch_size=1):
        input_ids2 = example['input_ids2']
        outputs = model(input_ids=input_ids2)
        embeddings = outputs.last_hidden_state
        GPT2embeddings2.append(embeddings.squeeze().detach().numpy())

GPT2embeddings1 = np.array(GPT2embeddings1)
GPT2embeddings2 = np.array(GPT2embeddings2)

7. Evaluation:
   1. Generating the new scores by doing cosine similarity between embedded student answers and embedded model answers (rescaled between 1-10)
   2. After getting both scores, we compare them using using MAE and Pearson correlation
   3. MAPE was used additionally to see the % difference between both grades per row

In [8]:
MAE = []
MAPE = []
Pearson = []
predicted_scores = []

GPT2embeddings1 = np.load('GPT2embeddings1.npy', allow_pickle=True)
GPT2embeddings2 = np.load('BMwordEmbeddings2.npy', allow_pickle=True)

print("Original Score | Predicted Score | MAPE (%)")
print("--------------------------------------------")
for i, row in df.iterrows():
    GPT2embeddings1_i = GPT2embeddings1[i].reshape(1, -1)
    GPT2embeddings2_i = GPT2embeddings2[i].reshape(1, -1)

    similarity = cosine_similarity(GPT2embeddings1_i, GPT2embeddings2_i)[0][0]
    scaled_similarity = (similarity + 1) * 5
    predicted_score = max(min(scaled_similarity, 10), 0)
    original_score = row['Total_Marks']

    mape = np.abs((predicted_score - original_score) / original_score) * 100

    print(f"{original_score:.2f}           | {predicted_score:.2f}            | {mape:.2f}")

    MAE.append(np.abs(predicted_score - original_score))
    MAPE.append(mape)
    Pearson.append((original_score, predicted_score))
    predicted_scores.append(predicted_score) 

MAE = np.mean(MAE)
MAPE = np.mean(MAPE)
pearson_corr = np.array(Pearson)
corr_coefficient, _ = pearsonr(pearson_corr[:,0], pearson_corr[:,1])

print(f"\nMean Absolute Error: {MAE:.2f}")
print(f"Pearson Correlation: {corr_coefficient:.2f}")

Original Score | Predicted Score | MAPE (%)
--------------------------------------------
10.00           | 5.06            | 49.42
8.00           | 5.05            | 36.84
8.00           | 5.07            | 36.69
8.00           | 5.06            | 36.69
8.00           | 5.07            | 36.56
10.00           | 5.05            | 49.47
6.00           | 5.04            | 15.93
10.00           | 5.05            | 49.48
10.00           | 5.06            | 49.43
10.00           | 5.06            | 49.40
8.00           | 5.06            | 36.74
8.00           | 5.07            | 36.67
8.00           | 5.07            | 36.63
10.00           | 5.06            | 49.40
0.00           | 5.06            | inf
10.00           | 5.06            | 49.37
7.00           | 5.06            | 27.70
8.00           | 5.07            | 36.64
0.00           | 5.07            | inf
0.00           | 5.05            | inf
8.00           | 5.06            | 36.72
10.00           | 5.07            | 49.27
6.00   

  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicte