In [6]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import re


In [1]:
from data.BERT_ASAG_tokenization.BERT_tokenization import BERT_tokenization


In [2]:
# tokenize for BERT
BERT_tokenization()


Found beetle with type csv in (data/BERT_ASAG_tokenization/BERT_tokens/data) and converted it to df
df found: False, name: BERT_tokens
No need to run standardized_splits for beetle
Running BERT_tokens for beetle
processing starting for beetle


100%|██████████| 6618/6618 [00:04<00:00, 1441.57it/s]


saving new BERT_tokens phase for: beetle
Found beetle with type csv in (data/BERT_ASAG_tokenization/gensim_embedding/data) and converted it to df
df found: False, name: BERT_tokens_spelling_corrected
No need to run standardized_splits for beetle
Running BERT_tokens_spelling_corrected for beetle
processing starting for beetle


  3%|▎         | 168/6618 [00:03<02:04, 51.65it/s]


KeyboardInterrupt: 

In [1]:
from performance_tracking.classes.Dataset_Torch import Dataset_Torch

In [11]:
dataset = Dataset_Torch(
    dir = "data/BERT_ASAG_tokenization/data/bert-base-cased/data/spelling_corrected/BERT_tokens/data",
    file_name = "concatenated_datasets",
    seed = 42,
    batch_size=128,
    sample_size=2000,
    sampling_group="dataset_name"
)


Found concatenated_datasets with type csv in (data/BERT_ASAG_tokenization/data/bert-base-uncased/data/spelling_corrected/BERT_tokens/data) and converted it to df
Found concatenated_datasets with type csv in (data/BERT_ASAG_tokenization/data/bert-base-uncased/data/spelling_corrected/BERT_tokens/data) and converted it to df


In [12]:
dataset.split_datasets()
dataset.init_dataloaders()


In [14]:
len(dataset["train"])

1400

In [5]:
from grading_models.BERT.classes.Py_Torch import Py_Torch
from performance_tracking.classes.Measurement_Settings import Measurement_Settings
from performance_tracking.constants import *


In [6]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

model_name = "distilbert-base-cased"

dataset_grading = Py_Torch(
    # parent
    model=BertForSequenceClassification.from_pretrained(model_name, num_labels=1),
    dataset=dataset,
    measurement_settings=Measurement_Settings(
        dataset_name=dataset["name"],
        embedding_seperated=False,
        sentence_embedding_method=None,
        feature_engenearing_method=None,

        embedding_model_name=model_name,
        grading_model=model_name,
        
        seed_data_split=42,

        description = "seplling_corrected_sample_size_2000",

        # inform user settings
        print_regression=True,
        print_classification=True,
        
        # save settings
        settings_performance_tracking=NO_SAVING,
        save_performance=False
    ),

    # child
    y_column="assigned_points",

    y_normalized=False, # idd not normalized because measurments are doen on non normalized values!

    lr = 2e-5,
    saved_model_dir = f"grading_models/BERT/saved_models/{model_name}/{dataset['name']}",
    epochs_to_run = 10,

)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Found performance_tracking_new with type csv in (performance_tracking/tracking) and converted it to df
Found performance_tracking_new with type csv in (performance_tracking/tracking) and converted it to df
Found performance_tracking_new with type csv in (performance_tracking/tracking) and converted it to df




In [7]:
dataset_grading.model_init()


Saved model directory does not exist.
No previously saved model found. Using initial model.


In [8]:
dataset_grading.train()


Current run epoch: 0
Current training epoch: 1
Saved model directory does not exist.
Epoch 1/10 - Train loss: 0.0019337175066256808
TRAINING:
Train Mean Squared Error after 1 epochs: 6.070162199392088
Accuracy: 19.80%
Current run epoch: 1
Current training epoch: 2
Epoch 2/10 - Train loss: 0.0005726484683682378
TRAINING:
Train Mean Squared Error after 2 epochs: 1.7236542923899272
Accuracy: 31.43%
Current run epoch: 2
Current training epoch: 3


KeyboardInterrupt: 

In [12]:
from torch.utils.data import Dataset, DataLoader

class ASAGDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        normalized_points = float(row["assigned_points"]) / float(row["max_points"])

        encoded = row["tokenized_for_BERT"]
        input_ids = encoded["input_ids"].squeeze()
        attention_mask = encoded["attention_mask"].squeeze()
        token_type_ids = encoded.get("token_type_ids", None)
        if token_type_ids is not None:
            token_type_ids = token_type_ids.squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "normalized_points": torch.tensor(normalized_points, dtype=torch.float32),
            "assigned_points": torch.tensor(row["assigned_points"], dtype=torch.float32),
            "max_points": torch.tensor(row["max_points"], dtype=torch.float32),
        }


In [15]:
# defining training, test and validation sets
train_dataset = ASAGDataset(data["train"])
test_dataset = ASAGDataset(data["test"])
validation_dataset = ASAGDataset(data["validation"])

batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)


In [7]:
device = "mps" if getattr(torch,'has_mps',False) \
    else "gpu" if torch.cuda.is_available() else "cpu"


In [None]:
epochs_to_save_on = [5, 10, 15, 20]


In [17]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

model_name = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.to(device)

num_epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5)

total_steps = len(data["train"]) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    train_predictions = []
    train_ground_truth = []
    for batch in data["train"]:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        percentage_of_correctness = batch["normalized_points"].to(device)

        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=percentage_of_correctness.unsqueeze(1))
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

        # For performance measurement
        train_logits = outputs.logits.squeeze().detach().cpu()
        train_predictions.extend(train_logits.tolist())
        train_ground_truth.extend(percentage_of_correctness.cpu().tolist())

    print(f"Epoch {epoch + 1}/{num_epochs} - Train loss: {train_loss / len(data['train'])}")
    
    # Calculate the mean squared error for training data
    train_mse = mean_squared_error(train_ground_truth, train_predictions)
    print(f"Train Mean Squared Error after {epoch + 1} epochs: {train_mse}")

    # Evaluate on test set every 3rd epoch
    if (epoch + 1) % 3 == 0:
        model.eval()
        predictions = []
        ground_truth = []

        with torch.no_grad():
            for batch in data["test"]:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                percentage_of_correctness = batch["normalized_points"].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits.squeeze().detach().cpu()

                predictions.extend(logits.tolist())
                ground_truth.extend(percentage_of_correctness.cpu().tolist())

        # Calculate the mean squared error
        mse = mean_squared_error(ground_truth, predictions)
        print(f"Test Mean Squared Error after {epoch + 1} epochs: {mse}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/3 - Train loss: 0.18850726711338964
Train Mean Squared Error after 1 epochs: 0.18857714803469583
Epoch 2/3 - Train loss: 0.13323351761390423
Train Mean Squared Error after 2 epochs: 0.13324085137201153
Epoch 3/3 - Train loss: 0.11131282759637669
Train Mean Squared Error after 3 epochs: 0.11121012062269414
Test Mean Squared Error after 3 epochs: 0.11687361907488548


In [18]:
# Evaluation loop
model.eval()
predictions = []
ground_truth = []
num_correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in data["validation"]:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        percentage_of_correctness = batch["normalized_points"].to(device)
        max_points = batch["max_points"].to(device)
        assigned_points = batch["assigned_points"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits.squeeze().detach()

        # Multiply the predicted percentage of correctness by max_points and round to the nearest integer
        rounded_predictions = torch.round(logits * max_points)

        # Compare the rounded_predictions to the assigned_points and count the number of correct predictions
        num_correct_predictions += torch.sum(rounded_predictions == assigned_points).item()
        total_predictions += assigned_points.size(0)

        predictions.extend(logits.tolist())
        ground_truth.extend(percentage_of_correctness.cpu().tolist())

# Calculate the mean squared error
mse = mean_squared_error(ground_truth, predictions)
print(f"Mean Squared Error: {mse}")

# Calculate the accuracy
accuracy = num_correct_predictions / total_predictions
print(f"Accuracy: {accuracy * 100:.2f}%")


Mean Squared Error: 0.11320229657346007
Accuracy: 84.74%


In [14]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def encode_sentence_pair(student_answer, reference_answer, max_length=512):
    return tokenizer.encode_plus(
        student_answer,
        text_pair=reference_answer,
        max_length=max_length,
        pad_to_max_length=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )


In [22]:
class ASAGDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        student_answer = row["student_answer"]
        reference_answer = row["reference_answer"]
        percentage_of_correctness = float(row["assigned_points"]) / float(row["max_points"])

        encoded = encode_sentence_pair(student_answer, reference_answer)
        input_ids = encoded["input_ids"].squeeze()
        attention_mask = encoded["attention_mask"].squeeze()
        token_type_ids = encoded.get("token_type_ids", None)
        if token_type_ids is not None:
            token_type_ids = token_type_ids.squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "percentage_of_correctness": torch.tensor(percentage_of_correctness, dtype=torch.float32),
            "assigned_points": torch.tensor(row["assigned_points"], dtype=torch.float32),
            "max_points": torch.tensor(row["max_points"], dtype=torch.float32),
        }


In [23]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = ASAGDataset(train_df)
val_dataset = ASAGDataset(val_df)

batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [18]:
device = "mps" if getattr(torch,'has_mps',False) \
    else "gpu" if torch.cuda.is_available() else "cpu"

model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.to(device)

num_epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5)

total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        percentage_of_correctness = batch["percentage_of_correctness"].to(device)

        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=percentage_of_correctness.unsqueeze(1))
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs} - Train loss: {train_loss / len(train_dataloader)}")

# Evaluation loop
model.eval()
predictions = []
ground_truth = []

with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        percentage_of_correctness = batch["percentage_of_correctness"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits.squeeze().detach().cpu()

        predictions.extend(logits.tolist())
        ground_truth.extend(percentage_of_correctness.cpu().tolist())

# Calculate the mean squared error
mse = mean_squared_error(ground_truth, predictions)
print(f"Mean Squared Error: {mse}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/3 - Train loss: 0.1838835891550041
Epoch 2/3 - Train loss: 0.12693635819429178
Epoch 3/3 - Train loss: 0.10105955904051436
Mean Squared Error: 0.11197731473622943


In [24]:
# Evaluation loop
model.eval()
predictions = []
ground_truth = []
num_correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        percentage_of_correctness = batch["percentage_of_correctness"].to(device)
        max_points = batch["max_points"].to(device)
        assigned_points = batch["assigned_points"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits.squeeze().detach()

        # Multiply the predicted percentage of correctness by max_points and round to the nearest integer
        rounded_predictions = torch.round(logits * max_points)

        # Compare the rounded_predictions to the assigned_points and count the number of correct predictions
        num_correct_predictions += torch.sum(rounded_predictions == assigned_points).item()
        total_predictions += assigned_points.size(0)

        predictions.extend(logits.tolist())
        ground_truth.extend(percentage_of_correctness.cpu().tolist())

# Calculate the mean squared error
mse = mean_squared_error(ground_truth, predictions)
print(f"Mean Squared Error: {mse}")

# Calculate the accuracy
accuracy = num_correct_predictions / total_predictions
print(f"Accuracy: {accuracy * 100:.2f}%")




Mean Squared Error: 0.11197731473622943
Accuracy: 84.52%


In [4]:
from transformers import BertTokenizer

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Some text
text = "Replace me by any text you'd like."

# Tokenize input for BERT (adding special tokens and creating attention masks)
inputs = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')

print(inputs)


{'input_ids': tensor([[ 101, 5672, 2033, 2011, 2151, 3793, 2017, 1005, 1040, 2066, 1012,  102,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,