# Finely-Tuned GPT-2 Model - Automatic Grading (Dataset 1)

Omar Ebrahim - 202000443

1. Importing the libraries

In [1]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model
from torch.utils.data import DataLoader, Dataset
import numpy as np
import time

from peft import LoraConfig, TaskType, get_peft_model

from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr 
from sklearn.metrics import mean_absolute_error

  from .autonotebook import tqdm as notebook_tqdm


2. Importing the dataset and defining the GPT2Tokenizer used for tokenization and the original GPTModel used for embedding

In [2]:
df = pd.read_csv("C:/Users/Omar/OneDrive/Desktop/totalmerged.csv")

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

3. Creating a CustomDataset which replaces the sample and student answers with their tokenized versions, and adds from the original dataset the original question and scores.

In [3]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        question = str(self.data.loc[index, 'Question'])
        correct_code = str(self.data.loc[index, 'Correct_Code'])
        code_with_error = str(self.data.loc[index, 'Code_with_Error'])
        total_marks = self.data.loc[index, 'Total_Marks']
        
        inputs = self.tokenizer(correct_code,
                                max_length=self.max_length, 
                                padding='max_length', 
                                truncation=True, 
                                return_tensors='pt')
        
        inputs2 = self.tokenizer(code_with_error,
                                 max_length=self.max_length, 
                                 padding='max_length', 
                                 truncation=True, 
                                 return_tensors='pt')
        
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'input_ids2': inputs2['input_ids'].flatten(),
            'total_marks': torch.tensor(total_marks, dtype=torch.float)
        }

dataset = CustomDataset(df, tokenizer)

4. Creating the LoRA configuration with their default values, and then adding it to the BERTModel for the purpose of reducing trainable parameters

In [4]:
lora_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,  # Task type for which the model will undergo fine-tuning
    r=1,  # Dimensions of A and B
    lora_alpha=1,  # Scaling factor determining the relative significance of weights in A and B
    lora_dropout=0.1  # Dropout probability for LoRA
)


In [5]:
model = get_peft_model(model, lora_config)
model.eval()



PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): lora.Linear(
              (base_layer): Conv1D()
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.1, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=1, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=1, out_features=2304, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (lora_embedding_B): ParameterDict()
            )
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace

5. End of Sequence condition added for the tokenized results such that they are able to be used for embedding

In [6]:
tokenizer.pad_token = tokenizer.eos_token

6. Generating the word embeddings for sample answers and the student answers. The training time in seconds is recorded after embeddings are generated.

In [7]:
start_time = time.time()
GPT2embeddings1 = []
GPT2embeddings2 = []

with torch.no_grad():
    for example in DataLoader(dataset, batch_size=1):
        input_ids = example['input_ids']
        outputs = model(input_ids=input_ids)
        embeddings = outputs.last_hidden_state
        GPT2embeddings1.append(embeddings.squeeze().detach().numpy())

    for example in DataLoader(dataset, batch_size=1):
        input_ids2 = example['input_ids2']
        outputs = model(input_ids=input_ids2)
        embeddings = outputs.last_hidden_state
        GPT2embeddings2.append(embeddings.squeeze().detach().numpy())

GPT2embeddings1 = np.array(GPT2embeddings1)
GPT2embeddings2 = np.array(GPT2embeddings2)
end_time = time.time()
runTime = end_time - start_time
print(f"Time: {runTime} seconds")

Time: 574.8804430961609 seconds


7. Evaluation:
   1. Generating the new scores by doing cosine similarity between embedded student answers and embedded model answers (rescaled between 1-10)
   2. After getting both scores, we compare them using using MAE and Pearson correlation
   3. MAPE was used additionally to see the % difference between both grades per row

In [8]:
MAE = []
MAPE = []
Pearson = []
predicted_scores = []

np.save('GPT2embeddings1.npy', GPT2embeddings1)
np.save('GPT2embeddings2.npy', GPT2embeddings2)

GPT2embeddings1 = np.load('GPT2embeddings1.npy', allow_pickle=True)
GPT2embeddings2 = np.load('GPT2embeddings2.npy', allow_pickle=True)

print("Original Score | Predicted Score | MAPE (%)")
print("--------------------------------------------")
for i in range(len(GPT2embeddings1)):
    GPT2embeddings1_i = GPT2embeddings1[i].reshape(1, -1)
    GPT2embeddings2_i = GPT2embeddings2[i].reshape(1, -1)

    similarity = cosine_similarity(GPT2embeddings1_i, GPT2embeddings2_i)[0][0]
    scaled_similarity = (similarity + 1) * 5
    predicted_score = max(min(scaled_similarity, 10), 0)
    original_score = df['Total_Marks'][i]

    mape = np.abs((predicted_score - original_score) / original_score) * 100

    print(f"{original_score:.2f}           | {predicted_score:.2f}            | {mape:.2f}")

    MAE.append(np.abs(predicted_score - original_score))
    MAPE.append(mape)
    Pearson.append((original_score, predicted_score))
    predicted_scores.append(predicted_score)  # Append predicted score to array

MAE = np.mean(MAE)
MAPE = np.mean(MAPE)
pearson_corr = np.array(Pearson)
corr_coefficient, _ = pearsonr(pearson_corr[:,0], pearson_corr[:,1])

print(f"\nMean Absolute Error: {MAE:.2f}")
print(f"Pearson Correlation: {corr_coefficient:.2f}")


Original Score | Predicted Score | MAPE (%)
--------------------------------------------
7.00           | 9.90            | 41.39
8.00           | 7.42            | 7.29
5.00           | 9.65            | 92.99
7.00           | 7.98            | 14.04
5.00           | 7.27            | 45.35
8.00           | 6.62            | 17.28
4.00           | 7.52            | 87.88
7.00           | 8.14            | 16.26
5.00           | 7.01            | 40.24
6.00           | 7.98            | 33.05
3.00           | 7.06            | 135.33
8.00           | 7.53            | 5.83
5.00           | 8.02            | 60.32
5.00           | 7.98            | 59.66
6.00           | 7.41            | 23.45
6.00           | 6.42            | 6.99
3.00           | 6.42            | 113.98
7.00           | 7.12            | 1.77
4.00           | 6.69            | 67.20
5.00           | 7.05            | 41.06
6.00           | 7.90            | 31.61
5.00           | 7.85            | 57.04
1.00       

  mape = np.abs((predicted_score - original_score) / original_score) * 100


6.00           | 9.03            | 50.55
7.00           | 7.27            | 3.86
10.00           | 10.00            | 0.00
8.00           | 8.33            | 4.11
7.00           | 9.95            | 42.09
5.00           | 9.96            | 99.14
6.00           | 10.00            | 66.64
4.00           | 10.00            | 150.00
7.00           | 10.00            | 42.86
6.00           | 10.00            | 66.66
6.00           | 8.46            | 40.99
8.00           | 9.97            | 24.63
7.00           | 9.97            | 42.46
5.00           | 10.00            | 100.00
4.00           | 9.32            | 132.90
6.00           | 10.00            | 66.67
7.00           | 10.00            | 42.86
8.00           | 9.52            | 18.99
8.00           | 10.00            | 25.00
8.00           | 9.97            | 24.65
7.00           | 9.74            | 39.17
6.00           | 10.00            | 66.67
6.00           | 8.38            | 39.75
8.00           | 9.89            | 23.56
9.00 

  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100


8.00           | 7.65            | 4.36
10.00           | 6.95            | 30.51
7.00           | 10.00            | 42.86
6.00           | 7.85            | 30.90
3.00           | 10.00            | 233.33
1.00           | 7.33            | 632.84
6.00           | 10.00            | 66.67
6.00           | 8.15            | 35.86
2.00           | 8.09            | 304.73
4.00           | 7.54            | 88.40
4.00           | 8.84            | 120.91
6.00           | 10.00            | 66.67
7.00           | 9.99            | 42.72
9.00           | 10.00            | 11.11
8.00           | 10.00            | 24.99
6.00           | 8.09            | 34.79
4.00           | 8.09            | 102.19
5.00           | 7.37            | 47.40
5.00           | 9.99            | 99.85
6.00           | 10.00            | 66.67
5.00           | 9.64            | 92.78
4.00           | 7.03            | 75.80
6.00           | 10.00            | 66.67
6.00           | 7.64            | 27.30
7.0

  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100


4.00           | 7.57            | 89.23
4.00           | 7.79            | 94.75
5.00           | 8.16            | 63.23
3.00           | 7.84            | 161.26
4.00           | 7.99            | 99.82
4.00           | 7.80            | 95.03
9.00           | 7.97            | 11.47
3.00           | 7.75            | 158.31
8.00           | 7.50            | 6.21
7.00           | 7.71            | 10.21
5.00           | 7.64            | 52.84
10.00           | 10.00            | 0.00
6.00           | 7.57            | 26.12
4.00           | 7.32            | 82.97
5.00           | 7.81            | 56.12
5.00           | 7.94            | 58.88
7.00           | 7.43            | 6.08
10.00           | 10.00            | 0.00
5.00           | 7.96            | 59.14
5.00           | 7.95            | 59.02
10.00           | 10.00            | 0.00
10.00           | 10.00            | 0.00
6.00           | 7.99            | 33.20
5.00           | 7.77            | 55.46
4.00        

8. Applying the same concept of semantic similarity, but instead replacing it with Euclidean Distance.
Given euclidean distance is unbounded, I first collected the Euclidean results after performing similarity, defined the maximum and minimum values, then rescaled the results between 0-10

In [9]:
from sklearn.metrics.pairwise import euclidean_distances
GPT2embeddings1 = np.load('GPT2embeddings1.npy', allow_pickle=True)
GPT2embeddings2 = np.load('GPT2embeddings2.npy', allow_pickle=True)

distances = []

for i in range(len(GPT2embeddings1)):
    GPT2embeddings1_i = GPT2embeddings1[i].reshape(1, -1)
    GPT2embeddings2_i = GPT2embeddings2[i].reshape(1, -1)
    distance = euclidean_distances(GPT2embeddings1_i, GPT2embeddings2_i)[0][0]
    distances.append(distance)

distances = np.array(distances)

min_dist = distances.min()
max_dist = distances.max()

normalized_distances = (distances - min_dist) / (max_dist - min_dist)
scaled_distances = normalized_distances * 10

MAE = []
MAPE = []
Pearson = []

print("Original Score | Predicted Score | MAPE (%)")
print("--------------------------------------------")

for i in range(len(GPT2embeddings1)):
    original_score = df['Total_Marks'][i]
    predicted_score = scaled_distances[i]

    mape = np.abs((predicted_score - original_score) / original_score) * 100

    print(f"{original_score:.2f}           | {predicted_score:.2f}            | {mape:.2f}")

    MAE.append(np.abs(predicted_score - original_score))
    MAPE.append(mape)
    Pearson.append((original_score, predicted_score))

MAE = np.mean(MAE)
MAPE = np.mean(MAPE)
pearson_corr = np.array(Pearson)
corr_coefficient, _ = pearsonr(pearson_corr[:,0], pearson_corr[:,1])

print(f"\nMean Absolute Error: {MAE:.2f}")
print(f"Pearson Correlation: {corr_coefficient:.2f}")

Original Score | Predicted Score | MAPE (%)
--------------------------------------------
7.00           | 1.46            | 79.12
8.00           | 7.02            | 12.29
5.00           | 2.77            | 44.55
7.00           | 6.93            | 0.98
5.00           | 7.78            | 55.64
8.00           | 8.72            | 9.04
4.00           | 7.16            | 79.12
7.00           | 6.54            | 6.51
5.00           | 8.22            | 64.48
6.00           | 6.93            | 15.52
3.00           | 8.18            | 172.75
8.00           | 7.55            | 5.66
5.00           | 6.85            | 37.02
5.00           | 6.93            | 38.62
6.00           | 7.75            | 29.09
6.00           | 9.09            | 51.51
3.00           | 9.09            | 203.01
7.00           | 8.18            | 16.86
4.00           | 8.72            | 117.94
5.00           | 8.28            | 65.57
6.00           | 6.90            | 15.02
5.00           | 6.55            | 31.01
1.00      

  mape = np.abs((predicted_score - original_score) / original_score) * 100


9. Applying the same concept of semantic similarity, but instead replacing it with Manhattan Distance.
Given euclidean distance is unbounded, I first collected the Manhattan results after performing similarity, defined the maximum and minimum values, then rescaled the results between 0-10

In [10]:
from sklearn.metrics.pairwise import manhattan_distances
GPT2embeddings1 = np.load('GPT2embeddings1.npy', allow_pickle=True)
GPT2embeddings2 = np.load('GPT2embeddings2.npy', allow_pickle=True)

distances = []

for i in range(len(GPT2embeddings1)):
    GPT2embeddings1_i = GPT2embeddings1[i].reshape(1, -1)
    GPT2embeddings2_i = GPT2embeddings2[i].reshape(1, -1)
    distance = manhattan_distances(GPT2embeddings1_i, GPT2embeddings2_i)[0][0]
    distances.append(distance)

distances = np.array(distances)

min_dist = distances.min()
max_dist = distances.max()
normalized_distances = (distances - min_dist) / (max_dist - min_dist)
scaled_distances = normalized_distances * 10

MAE = []
MAPE = []
Pearson = []

GPT2embeddings1 = np.load('GPT2embeddings1.npy', allow_pickle=True)
GPT2embeddings2 = np.load('GPT2embeddings2.npy', allow_pickle=True)

print("Original Score | Predicted Score | MAPE (%)")
print("--------------------------------------------")

for i in range(len(GPT2embeddings1)):
    original_score = df['Total_Marks'][i]
    predicted_score = scaled_distances[i]

    mape = np.abs((predicted_score - original_score) / original_score) * 100

    print(f"{original_score:.2f}           | {predicted_score:.2f}            | {mape:.2f}")

    MAE.append(np.abs(predicted_score - original_score))
    MAPE.append(mape)
    Pearson.append((original_score, predicted_score))

MAE = np.mean(MAE)
MAPE = np.mean(MAPE)
pearson_corr = np.array(Pearson)
corr_coefficient, _ = pearsonr(pearson_corr[:,0], pearson_corr[:,1])

print(f"\nMean Absolute Error: {MAE:.2f}")
print(f"Pearson Correlation: {corr_coefficient:.2f}")

Original Score | Predicted Score | MAPE (%)
--------------------------------------------
7.00           | 0.51            | 92.73
8.00           | 6.64            | 16.98
5.00           | 0.71            | 85.85
7.00           | 6.18            | 11.72
5.00           | 7.58            | 51.59
8.00           | 8.42            | 5.20
4.00           | 6.95            | 73.67
7.00           | 6.08            | 13.14
5.00           | 8.20            | 64.01
6.00           | 6.18            | 2.99
3.00           | 8.54            | 184.71
8.00           | 7.79            | 2.62
5.00           | 6.11            | 22.22
5.00           | 6.18            | 23.59
6.00           | 8.16            | 35.94
6.00           | 9.21            | 53.57
3.00           | 9.21            | 207.13
7.00           | 8.67            | 23.92
4.00           | 9.01            | 125.20
5.00           | 8.73            | 74.67
6.00           | 7.91            | 31.90
5.00           | 7.20            | 44.07
1.00     

  mape = np.abs((predicted_score - original_score) / original_score) * 100
