In [2]:
from transformers import DistilBertConfig,BertConfig, BertForSequenceClassification, AutoTokenizer, AdamW, DistilBertForSequenceClassification
import torch
import json
import random

data = []
epochs = 30
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load dataset
with open('/content/train.jsonl', 'r') as f:
    for line in f:
        data.append(json.loads(line))

# Preprocess data
sentences = []
labels = []
for item in data:
    for option, answer in zip([item['option1'], item['option2']], [item['answer'], item['answer']]):
        sentence = item['sentence'].replace('_', '[MASK]')
        sentence = sentence.replace('[MASK]', option)
        sentences.append(sentence)
        labels.append(int(answer) - 1)

# Split data into training and dev sets
split_ratio = 0.8  # 80% training, 20% dev
split_index = int(len(sentences) * split_ratio)

train_sentences = sentences[:split_index]
train_labels = torch.tensor(labels[:split_index]).to(device)

dev_sentences = sentences[split_index:]
dev_labels = torch.tensor(labels[split_index:]).to(device)

#tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# drop out rate for bert
dropout_rate = 0.3
config = DistilBertConfig.from_pretrained("distilbert-base-uncased", num_labels=2, dropout=dropout_rate)
model = DistilBertForSequenceClassification(config)
model = model.to(device)

#model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
#model = model.to(device)

#weight_decay = 0.01
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-5)
#optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5) #weight_decay = weight_decay)

# Early stopping parameters
early_stopping_patience = 3
best_train_loss = float('inf')
early_stopping_counter = 0

batch_size = 64
print("------------------------------------")
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss = 0.0
    # Shuffle training data each epoch
    combined_data = list(zip(train_sentences, train_labels))
    random.shuffle(combined_data)
    train_sentences, train_labels = zip(*combined_data)
    train_labels = torch.tensor(train_labels).to(device)

    # Training phase
    model.train()
    for i in range(0, len(train_sentences), batch_size):
        batch_train_sentences = train_sentences[i:i + batch_size]
        batch_train_labels = train_labels[i:i + batch_size]
        batch_train_labels = batch_train_labels.to(device)

        optimizer.zero_grad()
        #batch_train_inputs = tokenizer(batch_train_sentences, return_tensors="pt", padding=True, truncation=True)
        #batch_train_inputs = {key: value.to(device) for key, value in batch_train_inputs.items()}
        batch_train_inputs = tokenizer(batch_train_sentences, return_tensors="pt", padding=True, truncation=True, return_token_type_ids=False)
        batch_train_inputs = {key: value.to(device) for key, value in batch_train_inputs.items()}
        outputs = model(**batch_train_inputs, labels=batch_train_labels)
        loss = outputs.loss
        train_loss += loss
        loss.backward()
        optimizer.step()
    avg_train_loss = train_loss / (len(train_labels)/batch_size)
    print(f"Training Loss: {avg_train_loss}")

    # Validation phase
    model.eval()
    with torch.no_grad():
        dev_loss = 0.0
        for i in range(0, len(dev_labels), batch_size):
            batch_dev_sentences = dev_sentences[i:i + batch_size]
            batch_dev_labels = dev_labels[i:i + batch_size]

            batch_dev_labels = batch_dev_labels.to(device)

            batch_dev_inputs = tokenizer(batch_dev_sentences, return_tensors="pt", padding=True, truncation=True, return_token_type_ids=False)
            batch_dev_inputs = {key: value.to(device) for key, value in batch_dev_inputs.items()}
            #batch_dev_inputs = tokenizer(batch_dev_sentences, return_tensors="pt", padding=True, truncation=True)
            #batch_dev_inputs = {key: value.to(device) for key, value in batch_dev_inputs.items()}

            outputs = model(**batch_dev_inputs, labels=batch_dev_labels)
            dev_loss += outputs.loss.item()

        avg_dev_loss = dev_loss / (len(dev_labels) / batch_size)
        print(f"Validation Loss: {avg_dev_loss}")
        print("------------------------------------")

    # Early stopping logic
    if avg_dev_loss < best_train_loss:
        best_train_loss = avg_dev_loss
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= early_stopping_patience:
        print(f"Early stopping! No improvement in validation loss for {early_stopping_patience} epochs.")
        break  # Stop training

# Save trained model
model.save_pretrained('/content/model')

print("Training complete.")






The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

------------------------------------
Epoch 1/30
Training Loss: 0.6970278024673462
Validation Loss: 0.695818010651239
------------------------------------
Epoch 2/30
Training Loss: 0.6950380802154541
Validation Loss: 0.6960761582497323
------------------------------------
Epoch 3/30
Training Loss: 0.6939724683761597
Validation Loss: 0.6968395750121315
------------------------------------
Epoch 4/30
Training Loss: 0.6924352049827576
Validation Loss: 0.6873150681505109
------------------------------------
Epoch 5/30
Training Loss: 0.684661328792572
Validation Loss: 0.6919533750798442
------------------------------------
Epoch 6/30
Training Loss: 0.6753706932067871
Validation Loss: 0.7068037441461393
------------------------------------
Epoch 7/30
Training Loss: 0.6655757427215576
Validation Loss: 0.7075784340943441
------------------------------------
Early stopping! No improvement in validation loss for 3 epochs.
Training complete.


In [None]:
from transformers import BertForSequenceClassification, AdamW, AutoTokenizer
import torch
import json
from sklearn.metrics import accuracy_score

# Load test dataset
test_data = []
with open('/content/test.jsonl', 'r') as f:
    for line in f:
        test_data.append(json.loads(line))

# Preprocess test data
test_sentences = []
test_labels = []
for item in test_data:
    for option, answer in zip([item['option1'], item['option2']], [item['answer'], item['answer']]):
        sentence = item['sentence'].replace('_', '[MASK]')
        sentence = sentence.replace('[MASK]', option)
        test_sentences.append(sentence)
        test_labels.append(int(answer) - 1)

# Load trained model
loaded_model = DistilBertForSequenceClassification.from_pretrained('/content/model')
loaded_model.to(device)

# Tokenize  test data
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
test_inputs = tokenizer(test_sentences, return_tensors="pt", padding=True, truncation=True)
test_inputs = {key: value.to(device) for key, value in test_inputs.items()}

# Perform inference on test data
loaded_model.eval()
with torch.no_grad():
    outputs = loaded_model(**test_inputs)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()

# Convert test labels to NumPy array
true_labels = torch.tensor(test_labels).numpy()

# Print sentences, correct answers, and predicted labels
for i in range(len(test_data)):
    print(f"Example {i + 1}:")
    print("Sentence:", test_sentences[i])
    print("Correct Answer:", true_labels[i] + 1)  # Add 1 to convert back to 1-indexing
    print("Predicted Label:", predicted_labels[i] + 1)  # Add 1 to convert back to 1-indexing
    print("--------------")

# Evaluate model performance
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy on the test set: {accuracy}")

The model is overfitting. I can see it
is learning the training data too well while on the validation set that is unseen data it doesnt perform too well.
The training stopped after 5 epochs and when i tested on the test set it results in a Accuracy of 0.58, this means the model predicts correctly 58% of the time.

Tried to add dropout from 0.1 to 0.5 as well as weight decay for BERT. Also changed and tried different learning rates and batch sizes but keeps overfitting.

I tried a less complex model, DistilBert but it keeps overfitting. It lasts more epochs before overfitting compared to normal BERT but it still overfits.
Here i tried to add dropout as well as weight decay but without results.
