In [None]:
from transformers import BertForSequenceClassification, AutoTokenizer
import torch
import json
import random
from torch.optim.lr_scheduler import StepLR

# Run configuration: number of passes over the training data and the device
# (GPU when one is visible, CPU otherwise) used for the model and tensors.
epochs = 30
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read the training file: one JSON record per line.
with open('/content/train.jsonl', 'r') as f:
    data = [json.loads(line) for line in f]

# Expand every record into two sentences, one per candidate option, by
# substituting the option for the '_' placeholder.
# NOTE(review): both variants of a record receive the same label
# (answer - 1), so the label does not say whether *this* filled-in sentence
# is the correct one -- confirm this is the intended target.
sentences = []
labels = []
for item in data:
    zero_based_answer = int(item['answer']) - 1
    for option in (item['option1'], item['option2']):
        filled = item['sentence'].replace('_', '[MASK]').replace('[MASK]', option)
        sentences.append(filled)
        labels.append(zero_based_answer)

# Seed the Python and torch RNGs so that the shuffle below (and any random
# weight initialisation performed afterwards) is repeatable after a kernel
# restart.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Split data into training and dev sets
split_ratio = 0.8  # 80% training, 20% dev
split_index = int(len(sentences) * split_ratio)
# Each source record contributes two consecutive sentences; keep the boundary
# even so both variants of a record end up on the same side of the split.
split_index -= split_index % 2

# Shuffle training data (sentences and labels are zipped so they stay aligned)
combined_data = list(zip(sentences[:split_index], labels[:split_index]))
random.shuffle(combined_data)
if combined_data:
    train_sentences, train_labels = zip(*combined_data)
else:
    # zip(*[]) yields nothing to unpack; fall back to empty sequences.
    train_sentences, train_labels = (), ()

# Labels to GPU (explicit integer dtype so empty inputs do not default to float)
train_labels = torch.tensor(train_labels, dtype=torch.long).to(device)
dev_labels = torch.tensor(labels[split_index:], dtype=torch.long).to(device)

# The tokenizer and the classifier both start from the same pretrained checkpoint.
checkpoint_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Two-label sequence classifier, placed on the selected device right away.
model = BertForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=2,
).to(device)

# SGD with momentum (rather than Adam) over all model parameters.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=1e-3,
    momentum=0.9,
)

# Learning-rate decay: with step_size=1, every scheduler.step() call
# multiplies the learning rate by gamma.
scheduler = StepLR(optimizer, gamma=0.1, step_size=1)

# Early-stopping bookkeeping: epochs to tolerate without improvement, the best
# loss seen so far, and the count of consecutive non-improving epochs.
early_stopping_counter = 0
best_train_loss = float('inf')
early_stopping_patience = 3

# Fine-tune the model
batch_size = 128

# Directory that receives the checkpoint with the lowest validation loss.
model_dir = '/content/model'
checkpoint_saved = False

# Slice the dev sentences once instead of copying the list for every batch.
dev_sentences = sentences[split_index:]

# Ceiling division, so an evenly divisible dataset is not reported with one
# batch too many.
num_train_batches = (len(train_sentences) + batch_size - 1) // batch_size

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # Training phase
    model.train()
    for i in range(0, len(train_sentences), batch_size):
        batch_train_sentences = list(train_sentences[i:i + batch_size])
        # train_labels is already a tensor on `device`; slicing is sufficient
        # (re-wrapping it in torch.tensor() copies it and emits a UserWarning).
        batch_train_labels = train_labels[i:i + batch_size]

        optimizer.zero_grad()
        batch_train_inputs = tokenizer(batch_train_sentences, return_tensors="pt", padding=True, truncation=True)
        batch_train_inputs = {key: value.to(device) for key, value in batch_train_inputs.items()}
        outputs = model(**batch_train_inputs, labels=batch_train_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"  Batch {i//batch_size + 1}/{num_train_batches}, Loss: {loss.item()}")

    # Decay the learning rate once per epoch. Stepping the scheduler after
    # every batch shrinks the learning rate on each batch, so it reaches
    # effectively zero after only a handful of updates.
    scheduler.step()

    # Validation phase
    model.eval()
    with torch.no_grad():
        dev_loss = 0.0
        for i in range(0, len(dev_sentences), batch_size):
            batch_dev_sentences = dev_sentences[i:i + batch_size]
            batch_dev_labels = dev_labels[i:i + batch_size]

            batch_dev_inputs = tokenizer(batch_dev_sentences, return_tensors="pt", padding=True, truncation=True)
            batch_dev_inputs = {key: value.to(device) for key, value in batch_dev_inputs.items()}

            outputs = model(**batch_dev_inputs, labels=batch_dev_labels)
            # The returned loss is averaged over the batch; weight it by the
            # batch size so a short final batch does not skew the mean.
            dev_loss += outputs.loss.item() * len(batch_dev_sentences)

        avg_dev_loss = dev_loss / len(dev_sentences)
        print(f"Validation Loss: {avg_dev_loss}")

    # Early stopping logic.
    # Despite its name, best_train_loss holds the best *validation* loss so far.
    if avg_dev_loss < best_train_loss:
        best_train_loss = avg_dev_loss
        early_stopping_counter = 0
        # Persist the best weights now; by the time early stopping triggers,
        # the in-memory model has trained for further, non-improving epochs.
        model.save_pretrained(model_dir)
        checkpoint_saved = True
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= early_stopping_patience:
        print(f"Early stopping! No improvement in validation loss for {early_stopping_patience} epochs.")
        break  # Stop training

# Save trained model if no epoch produced an improving checkpoint
# (e.g. the loop never ran or the validation loss was never finite).
if not checkpoint_saved:
    model.save_pretrained(model_dir)

print("Training complete.")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/30


  batch_train_labels = torch.tensor(batch_train_labels).to(device)


  Batch 1/505, Loss: 0.680671751499176
  Batch 2/505, Loss: 0.700760543346405
  Batch 3/505, Loss: 0.6969625353813171
  Batch 4/505, Loss: 0.7098594903945923
  Batch 5/505, Loss: 0.6961447596549988
  Batch 6/505, Loss: 0.6909744739532471
  Batch 7/505, Loss: 0.6875796914100647
  Batch 8/505, Loss: 0.6948564648628235
  Batch 9/505, Loss: 0.7012844085693359
  Batch 10/505, Loss: 0.6993989944458008
  Batch 11/505, Loss: 0.6800473928451538
  Batch 12/505, Loss: 0.6802864074707031
  Batch 13/505, Loss: 0.702400267124176
  Batch 14/505, Loss: 0.714778482913971
  Batch 15/505, Loss: 0.6992886662483215
  Batch 16/505, Loss: 0.6909516453742981
  Batch 17/505, Loss: 0.6927001476287842
  Batch 18/505, Loss: 0.7161590456962585
  Batch 19/505, Loss: 0.7014274001121521
  Batch 20/505, Loss: 0.7040616869926453
  Batch 21/505, Loss: 0.6964558362960815
  Batch 22/505, Loss: 0.6934621334075928
  Batch 23/505, Loss: 0.6848512291908264
  Batch 24/505, Loss: 0.6950128674507141
  Batch 25/505, Loss: 0.69566

In [None]:
from transformers import BertForSequenceClassification, AdamW, AutoTokenizer
import torch
import json
from sklearn.metrics import accuracy_score

# Read the test file: one JSON record per line.
with open('/content/test.jsonl', 'r') as f:
    test_data = [json.loads(line) for line in f]

# Expand every record into two sentences, one per candidate option, by
# substituting the option for the '_' placeholder.
# NOTE(review): both variants of a record receive the same label
# (answer - 1), so the label does not say whether *this* filled-in sentence
# is the correct one -- confirm this is the intended target.
test_sentences = []
test_labels = []
for item in test_data:
    zero_based_answer = int(item['answer']) - 1
    for option in (item['option1'], item['option2']):
        filled = item['sentence'].replace('_', '[MASK]').replace('[MASK]', option)
        test_sentences.append(filled)
        test_labels.append(zero_based_answer)

# Resolve the device inside this cell so it also runs on a fresh kernel,
# without relying on a variable left behind by the training cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load trained model
loaded_model = BertForSequenceClassification.from_pretrained('/content/model')
loaded_model.to(device)

# Tokenizer matching the checkpoint the model was fine-tuned from
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Perform inference on test data in mini-batches: tokenizing and forwarding
# the whole test set as a single batch can exhaust device memory.
eval_batch_size = 128
logit_chunks = []
loaded_model.eval()
with torch.no_grad():
    for start in range(0, len(test_sentences), eval_batch_size):
        batch_sentences = test_sentences[start:start + eval_batch_size]
        batch_inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True)
        batch_inputs = {key: value.to(device) for key, value in batch_inputs.items()}
        # Move each batch of logits to the CPU so results do not pile up on the device.
        logit_chunks.append(loaded_model(**batch_inputs).logits.cpu())

# Reassemble per-batch results into the same names the single-batch version exposed.
if logit_chunks:
    logits = torch.cat(logit_chunks)
else:
    # No test sentences: keep a well-formed (0, num_labels) tensor.
    logits = torch.empty((0, loaded_model.config.num_labels))
probabilities = torch.nn.functional.softmax(logits, dim=1)
predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()

# Convert test labels to NumPy array
true_labels = torch.tensor(test_labels).numpy()

# Print sentences, correct answers, and predicted labels.
# Iterate over test_sentences rather than test_data: every record was expanded
# into two sentences, so looping over len(test_data) would show only the first
# half of the evaluated sentences.
for i, sentence in enumerate(test_sentences):
    print(f"Example {i + 1}:")
    print("Sentence:", sentence)
    print("Correct Answer:", true_labels[i] + 1)  # Add 1 to convert back to 1-indexing
    print("Predicted Label:", predicted_labels[i] + 1)  # Add 1 to convert back to 1-indexing
    print("--------------")

# Evaluate model performance over all evaluated sentences
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy on the test set: {accuracy}")