# BERT Model - Automatic Grading (Dataset 1)

Omar Ebrahim - 202000443

1. Import libraries

In [9]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
import numpy as np
import time

from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr 
from sklearn.metrics import mean_absolute_error

2. Import the dataset & call the BERT tokenizer for the experiment

In [2]:
df = pd.read_csv("C:/Users/Omar/OneDrive/Desktop/totalmerged.csv")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

3. Create CustomDataset, which will be just like the first dataset, but the sample answers and student answers with their tokenized versions instead

In [3]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        question = str(self.data.loc[index, 'Question'])
        correct_code = str(self.data.loc[index, 'Correct_Code'])
        code_with_error = str(self.data.loc[index, 'Code_with_Error'])
        total_marks = self.data.loc[index, 'Total_Marks']
        
        inputs = self.tokenizer(correct_code,
                                add_special_tokens=True, 
                                max_length=self.max_length, 
                                padding='max_length', 
                                truncation=True, 
                                return_tensors='pt')
        
        inputs2 = self.tokenizer(code_with_error,
                                 add_special_tokens=True, 
                                 max_length=self.max_length, 
                                 padding='max_length', 
                                 truncation=True, 
                                 return_tensors='pt')
        
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'input_ids2': inputs2['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'attention_mask2': inputs2['attention_mask'].flatten(),
            'total_marks': torch.tensor(total_marks, dtype=torch.float)
        }

dataset = CustomDataset(df, tokenizer)

5. Calling the BERT Embedder prior to generating embeddings

In [4]:
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

6. Generating the embeddings from the tokenized sample and student answers, time function added to measure the length of this process

In [5]:
start_time = time.time()
BMwordEmbeddings1 = []
BMwordEmbeddings2 = []

with torch.no_grad():
    for example in DataLoader(dataset, batch_size=1):
        input_ids = example['input_ids']
        attention_mask = example['attention_mask']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state
        BMwordEmbeddings1.append(embeddings.squeeze().detach().numpy())

BMwordEmbeddings1 = np.array(BMwordEmbeddings1)
print(BMwordEmbeddings1)

with torch.no_grad():
    for example in DataLoader(dataset, batch_size=1):
        input_ids2 = example['input_ids2']
        attention_mask2 = example['attention_mask2']

        outputs = model(input_ids=input_ids2, attention_mask=attention_mask2)
        embeddings = outputs.last_hidden_state
        BMwordEmbeddings2.append(embeddings.squeeze().detach().numpy())

BMwordEmbeddings2 = np.array(BMwordEmbeddings2)
print(BMwordEmbeddings2)

np.save('BMwordEmbeddings1.npy', BMwordEmbeddings1)
np.save('BMwordEmbeddings2.npy', BMwordEmbeddings2)
end_time = time.time()
runTime = end_time - start_time
print(f"Time: {runTime} seconds")

[[[-0.5867809  -0.36800504 -0.24722183 ...  0.09294306 -0.6350142
    1.1450686 ]
  [ 0.00500179 -0.10744199 -0.3958574  ...  0.19922711 -0.42523757
    0.90047485]
  [-0.08389752  0.5038037  -0.01508595 ...  0.2977718  -0.33158606
    0.5548596 ]
  ...
  [ 0.4967159   0.01928156 -0.29722682 ...  0.19378288 -0.50516903
   -0.24182697]
  [-0.23897968 -0.30549297  0.18418804 ... -0.11764864 -0.63079214
    0.32405114]
  [-0.2762609  -0.49520755  0.14970571 ... -0.1531956  -0.5620768
    0.16504492]]

 [[-0.5867809  -0.36800504 -0.24722183 ...  0.09294306 -0.6350142
    1.1450686 ]
  [ 0.00500179 -0.10744199 -0.3958574  ...  0.19922711 -0.42523757
    0.90047485]
  [-0.08389752  0.5038037  -0.01508595 ...  0.2977718  -0.33158606
    0.5548596 ]
  ...
  [ 0.4967159   0.01928156 -0.29722682 ...  0.19378288 -0.50516903
   -0.24182697]
  [-0.23897968 -0.30549297  0.18418804 ... -0.11764864 -0.63079214
    0.32405114]
  [-0.2762609  -0.49520755  0.14970571 ... -0.1531956  -0.5620768
    0.1650

7. 
- Applying cosine similarity between sample and student answers (Rescaled between 0-10)
- MAE used to measure the overall margin of error between the original scores and the new scores
- Pearson's correlation used to measure the overall correlation between the original scores and new scores
- MAPE was used to see the % error per each observation 

In [6]:
MAE = []
MAPE = []
Pearson = []
predicted_scores = []

BMwordEmbeddings1 = np.load('BMwordEmbeddings1.npy', allow_pickle=True)
BMwordEmbeddings2 = np.load('BMwordEmbeddings2.npy', allow_pickle=True)

print("Original Score | Predicted Score | MAPE (%)")
print("--------------------------------------------")
for i in range(len(BMwordEmbeddings1)):
    BMWordEmbeddings1_i = BMwordEmbeddings1[i].reshape(1, -1)
    BMWordEmbeddings2_i = BMwordEmbeddings2[i].reshape(1, -1)

    similarity = cosine_similarity(BMWordEmbeddings1_i, BMWordEmbeddings2_i)[0][0]

    scaled_similarity = similarity * 10
    
    predicted_score = max(min(scaled_similarity, 10), 0)
    original_score = df['Total_Marks'][i]

    mape = np.abs((predicted_score - original_score) / original_score) * 100

    print(f"{original_score:.2f}           | {predicted_score:.2f}            | {mape:.2f}")

    MAE.append(np.abs(predicted_score - original_score))
    MAPE.append(mape)
    Pearson.append((original_score, predicted_score))
    predicted_scores.append(predicted_score) 

MAE = np.mean(MAE)
MAPE = np.mean(MAPE)
pearson_corr = np.array(Pearson)
corr_coefficient, _ = pearsonr(pearson_corr[:,0], pearson_corr[:,1])

print(f"\nMean Absolute Error: {MAE:.2f}")
print(f"Pearson Correlation: {corr_coefficient:.2f}")

Original Score | Predicted Score | MAPE (%)
--------------------------------------------
7.00           | 5.70            | 18.50
8.00           | 5.86            | 26.76
5.00           | 7.82            | 56.45
7.00           | 8.27            | 18.11
5.00           | 6.34            | 26.84
8.00           | 5.99            | 25.14
4.00           | 4.45            | 11.30
7.00           | 5.41            | 22.71
5.00           | 5.01            | 0.19
6.00           | 8.23            | 37.15
3.00           | 4.77            | 59.11
8.00           | 4.49            | 43.89
5.00           | 7.66            | 53.30
5.00           | 9.52            | 90.41
6.00           | 4.54            | 24.30
6.00           | 4.75            | 20.85
3.00           | 4.70            | 56.80
7.00           | 4.67            | 33.24
4.00           | 4.53            | 13.27
5.00           | 4.56            | 8.86
6.00           | 4.62            | 23.08
5.00           | 4.56            | 8.75
1.00        

  mape = np.abs((predicted_score - original_score) / original_score) * 100


6.00           | 5.30            | 11.67
9.00           | 9.97            | 10.83
6.00           | 9.97            | 66.24
4.00           | 4.90            | 22.50
9.00           | 10.00            | 11.11
5.00           | 10.00            | 100.00
10.00           | 10.00            | 0.00
7.00           | 3.71            | 47.04
6.00           | 4.91            | 18.20
8.00           | 3.36            | 58.01
9.00           | 10.00            | 11.11
10.00           | 10.00            | 0.00
4.00           | 5.30            | 32.49
10.00           | 5.60            | 43.99
10.00           | 4.43            | 55.66
8.00           | 4.06            | 49.28
6.00           | 10.00            | 66.67
4.00           | 3.36            | 16.03
2.00           | 10.00            | 400.00
7.00           | 10.00            | 42.86
10.00           | 10.00            | 0.00
10.00           | 10.00            | 0.00
6.00           | 10.00            | 66.67
8.00           | 10.00            | 25.00


  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100


2.00           | 6.59            | 229.50
3.00           | 6.58            | 119.18
10.00           | 9.61            | 3.86
9.00           | 7.19            | 20.14
2.00           | 6.22            | 210.92
2.00           | 6.14            | 207.12
3.00           | 6.23            | 107.72
9.00           | 6.32            | 29.80
10.00           | 5.63            | 43.74
0.00           | 4.82            | inf
5.00           | 6.29            | 25.89
8.00           | 6.36            | 20.55
8.00           | 8.21            | 2.57
9.00           | 9.98            | 10.86
6.00           | 7.24            | 20.67
4.00           | 6.31            | 57.79
2.00           | 6.28            | 214.15
5.00           | 6.70            | 33.97
7.00           | 7.24            | 3.43
8.00           | 6.64            | 17.06
10.00           | 5.58            | 44.17
7.00           | 10.00            | 42.86
6.00           | 3.73            | 37.89
3.00           | 8.88            | 196.11
1.00      

  mape = np.abs((predicted_score - original_score) / original_score) * 100


4.00           | 9.97            | 149.32
2.00           | 4.36            | 117.89
3.00           | 4.13            | 37.78
2.00           | 4.16            | 108.16
4.00           | 4.16            | 4.08
4.00           | 3.95            | 1.30
3.00           | 4.16            | 38.77
2.00           | 4.24            | 111.92
1.00           | 4.10            | 309.87
1.00           | 4.35            | 335.03
4.00           | 9.72            | 142.88
6.00           | 5.45            | 9.19
5.00           | 6.37            | 27.49
4.00           | 6.61            | 65.37
8.00           | 8.19            | 2.36
6.00           | 8.50            | 41.63
6.00           | 7.58            | 26.33
4.00           | 7.06            | 76.52
3.00           | 5.92            | 97.19
2.00           | 8.21            | 310.37
3.00           | 7.58            | 152.71
5.00           | 8.96            | 79.17
3.00           | 6.02            | 100.76
7.00           | 7.56            | 8.00
4.00       

  mape = np.abs((predicted_score - original_score) / original_score) * 100
  mape = np.abs((predicted_score - original_score) / original_score) * 100


5.00           | 4.61            | 7.75
5.00           | 4.61            | 7.75
7.00           | 4.61            | 34.11
7.00           | 4.61            | 34.11
7.00           | 4.61            | 34.11
6.00           | 4.22            | 29.73
6.00           | 4.61            | 23.21
6.00           | 4.61            | 23.21
6.00           | 4.61            | 23.21
6.00           | 4.61            | 23.21
7.00           | 4.61            | 34.18
7.00           | 4.54            | 35.19
7.00           | 4.54            | 35.19
8.00           | 4.54            | 43.29
7.00           | 4.54            | 35.19
10.00           | 3.89            | 61.10
10.00           | 4.38            | 56.25
10.00           | 3.85            | 61.51
10.00           | 4.02            | 59.79
10.00           | 10.00            | 0.00
8.00           | 8.41            | 5.09
8.00           | 9.97            | 24.63
7.00           | 5.53            | 20.94
8.00           | 9.97            | 24.63
8.00          

8. Same process as above, but instead using euclidean distance to measure between the old scores and new scores.

(Given this distance metric is unbounded, I first collected all the original Euclidean results between both scores, defined its minimum and maximum value, and then rescaled them between 0-10)

In [7]:
from sklearn.metrics.pairwise import euclidean_distances

BMwordEmbeddings1 = np.load('BMwordEmbeddings1.npy', allow_pickle=True)
BMwordEmbeddings2 = np.load('BMwordEmbeddings2.npy', allow_pickle=True)

distances = []

for i in range(len(BMwordEmbeddings1)):
    BMwordEmbeddings1_i = BMwordEmbeddings1[i].reshape(1, -1)
    BMwordEmbeddings2_i = BMwordEmbeddings2[i].reshape(1, -1)
    distance = euclidean_distances(BMwordEmbeddings1_i, BMwordEmbeddings2_i)[0][0]
    distances.append(distance)

distances = np.array(distances)

min_dist = distances.min()
max_dist = distances.max()

normalized_distances = (distances - min_dist) / (max_dist - min_dist)
scaled_distances = normalized_distances * 10

MAE = []
MAPE = []
Pearson = []

print("Original Score | Predicted Score | MAPE (%)")
print("--------------------------------------------")

for i in range(len(BMwordEmbeddings1)):
    original_score = df['Total_Marks'][i]
    predicted_score = scaled_distances[i]

    mape = np.abs((predicted_score - original_score) / original_score) * 100

    print(f"{original_score:.2f}           | {predicted_score:.2f}            | {mape:.2f}")

    MAE.append(np.abs(predicted_score - original_score))
    MAPE.append(mape)
    Pearson.append((original_score, predicted_score))

MAE = np.mean(MAE)
MAPE = np.mean(MAPE)
pearson_corr = np.array(Pearson)
corr_coefficient, _ = pearsonr(pearson_corr[:,0], pearson_corr[:,1])

print(f"\nMean Absolute Error: {MAE:.2f}")
print(f"Pearson Correlation: {corr_coefficient:.2f}")

Original Score | Predicted Score | MAPE (%)
--------------------------------------------
7.00           | 7.80            | 11.43
8.00           | 7.61            | 4.90
5.00           | 5.56            | 11.20
7.00           | 4.95            | 29.28
5.00           | 7.19            | 43.77
8.00           | 7.53            | 5.89
4.00           | 8.88            | 121.90
7.00           | 8.04            | 14.80
5.00           | 8.37            | 67.47
6.00           | 5.00            | 16.59
3.00           | 8.63            | 187.74
8.00           | 8.85            | 10.64
5.00           | 5.73            | 14.54
5.00           | 2.61            | 47.82
6.00           | 8.79            | 46.53
6.00           | 8.64            | 44.03
3.00           | 8.68            | 189.23
7.00           | 8.71            | 24.41
4.00           | 8.80            | 120.04
5.00           | 8.80            | 75.98
6.00           | 8.70            | 45.08
5.00           | 8.78            | 75.69
1.00   

  mape = np.abs((predicted_score - original_score) / original_score) * 100


9. Same process as above, but instead using euclidean distance to measure between the old scores and new scores.

(Given this distance metric is unbounded, I first collected all the original Manhattan results between both scores, defined its minimum and maximum value, and then rescaled them between 0-10)

In [8]:
from sklearn.metrics.pairwise import manhattan_distances

BMwordEmbeddings1 = np.load('BMwordEmbeddings1.npy', allow_pickle=True)
BMwordEmbeddings2 = np.load('BMwordEmbeddings2.npy', allow_pickle=True)

distances = []

for i in range(len(BMwordEmbeddings1)):
    BMwordEmbeddings1_i = BMwordEmbeddings1[i].reshape(1, -1)
    BMwordEmbeddings2_i = BMwordEmbeddings2[i].reshape(1, -1)
    distance = manhattan_distances(BMwordEmbeddings1_i, BMwordEmbeddings2_i)[0][0]
    distances.append(distance)

distances = np.array(distances)

min_dist = distances.min()
max_dist = distances.max()

normalized_distances = (distances - min_dist) / (max_dist - min_dist)
scaled_distances = normalized_distances * 10

MAE = []
MAPE = []
Pearson = []

print("Original Score | Predicted Score | MAPE (%)")
print("--------------------------------------------")

for i in range(len(BMwordEmbeddings1)):
    original_score = df['Total_Marks'][i]
    predicted_score = scaled_distances[i]

    mape = np.abs((predicted_score - original_score) / original_score) * 100

    print(f"{original_score:.2f}           | {predicted_score:.2f}            | {mape:.2f}")

    MAE.append(np.abs(predicted_score - original_score))
    MAPE.append(mape)
    Pearson.append((original_score, predicted_score))

MAE = np.mean(MAE)
MAPE = np.mean(MAPE)
pearson_corr = np.array(Pearson)
corr_coefficient, _ = pearsonr(pearson_corr[:,0], pearson_corr[:,1])

print(f"\nMean Absolute Error: {MAE:.2f}")
print(f"Pearson Correlation: {corr_coefficient:.2f}")

Original Score | Predicted Score | MAPE (%)
--------------------------------------------
7.00           | 7.03            | 0.42
8.00           | 6.68            | 16.44
5.00           | 3.81            | 23.72
7.00           | 3.37            | 51.84
5.00           | 5.84            | 16.89
8.00           | 6.54            | 18.22
4.00           | 8.48            | 111.95
7.00           | 7.33            | 4.73
5.00           | 7.67            | 53.45
6.00           | 3.45            | 42.43
3.00           | 8.18            | 172.53
8.00           | 8.46            | 5.76
5.00           | 4.19            | 16.21
5.00           | 1.30            | 74.01
6.00           | 8.43            | 40.45
6.00           | 8.20            | 36.66
3.00           | 8.24            | 174.67
7.00           | 8.27            | 18.14
4.00           | 8.37            | 109.29
5.00           | 8.39            | 67.70
6.00           | 8.31            | 38.51
5.00           | 8.41            | 68.11
1.00    

  mape = np.abs((predicted_score - original_score) / original_score) * 100
