In [None]:
from transformers import BertForSequenceClassification, AdamW, AutoTokenizer
import torch
import json

# Training configuration.
epochs = 14
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the training set: one JSON object per line (JSON Lines).
# The encoding is pinned to UTF-8 so decoding does not depend on the
# machine's locale, and whitespace-only lines are skipped because
# json.loads() raises on them.
with open('/content/train.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Build the training examples.
# Every record contributes two sentences: the '_' placeholder in its
# 'sentence' field is filled once with 'option1' and once with 'option2'.
# Both sentences receive the same label, the record's 'answer' shifted to
# 0-based (int(answer) - 1).
# NOTE(review): because the two sentences of a pair share one label, the
# label does not say which filled-in sentence is the correct one -- confirm
# this is the intended target before trusting the reported metrics.
sentences = []
labels = []
for record in data:
    candidates = (record['option1'], record['option2'])
    raw_answer = record['answer']
    # The placeholder is first normalised to '[MASK]', then substituted.
    masked = record['sentence'].replace('_', '[MASK]')
    for candidate in candidates:
        sentences.append(masked.replace('[MASK]', candidate))
        labels.append(int(raw_answer) - 1)

# Train/validation split: the first 80% of the examples (in file order) are
# used for training, the remainder is held out.
split_ratio = 0.8
split_index = int(len(sentences) * split_ratio)

train_sentences = sentences[:split_index]
train_labels = labels[:split_index]
# The held-out slices are created here but are not read by the training
# loop that follows.
val_sentences = sentences[split_index:]
val_labels = labels[split_index:]

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Rebind `labels` from a Python list to a tensor on the selected device.
# The training loop builds its per-batch label tensors from `train_labels`,
# not from this tensor.
labels = torch.tensor(labels).to(device)

# Pre-trained BERT with a two-class classification head, moved to the device.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
).to(device)

# AdamW over all model parameters.
# NOTE(review): this is the AdamW class imported from transformers, which
# recent transformers releases deprecate in favour of torch.optim.AdamW --
# confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=1e-5)

# Fine-tune the model
model.train()  # put the model in training mode
batch_size = 128

# Number of batches the inner loop actually runs (ceiling division).
# A plain `len // batch_size + 1` over-counts by one whenever the number of
# training sentences is an exact multiple of batch_size.
num_batches = (len(train_sentences) + batch_size - 1) // batch_size

# NOTE(review): batches are taken as consecutive slices, so every epoch
# visits the examples in the same order (no shuffling).
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    for i in range(0, len(train_sentences), batch_size):
        batch_sentences = train_sentences[i:i + batch_size]
        batch_labels = torch.tensor(train_labels[i:i + batch_size]).to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Tokenize per batch so padding only extends to the longest
        # sentence in this batch, then move every tensor to the device.
        batch_inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True)
        batch_inputs = {key: value.to(device) for key, value in batch_inputs.items()}

        # Passing `labels` makes the model return the loss with its outputs.
        outputs = model(**batch_inputs, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"  Batch {i // batch_size + 1}/{num_batches}, Loss: {loss.item()}")

print("Training complete.")

# Save trained model
model.save_pretrained('/content/model')


In [None]:
from transformers import AutoTokenizer
import torch
import json
from sklearn.metrics import accuracy_score

# Load the test set: one JSON object per line (JSON Lines).
# The encoding is pinned to UTF-8 so decoding does not depend on the
# machine's locale, and whitespace-only lines are skipped because
# json.loads() raises on them.
with open('/content/test.jsonl', 'r', encoding='utf-8') as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# Build the test examples.
# Every record contributes two sentences: the '_' placeholder in its
# 'sentence' field is filled once with 'option1' and once with 'option2'.
# Both sentences receive the same label, the record's 'answer' shifted to
# 0-based (int(answer) - 1).
# NOTE(review): because the two sentences of a pair share one label, the
# label does not say which filled-in sentence is the correct one -- confirm
# this is the intended target before trusting the reported accuracy.
test_sentences = []
test_labels = []
for record in test_data:
    candidates = (record['option1'], record['option2'])
    raw_answer = record['answer']
    # The placeholder is first normalised to '[MASK]', then substituted.
    masked = record['sentence'].replace('_', '[MASK]')
    for candidate in candidates:
        test_sentences.append(masked.replace('[MASK]', candidate))
        test_labels.append(int(raw_answer) - 1)

# Load trained model
# The imports at the top of this cell provide neither the model class nor
# `device`, so without the next two statements the cell only runs in a
# kernel where the training cell has already been executed. Bringing them
# in here lets the evaluation run on its own against the saved checkpoint.
from transformers import BertForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

loaded_model = BertForSequenceClassification.from_pretrained('/content/model')
loaded_model.to(device)

# Tokenize test data
# All sentences are tokenized together, so every row is padded to the
# longest sentence in the test set; the tensors are then moved to the device.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
test_inputs = tokenizer(test_sentences, return_tensors="pt", padding=True, truncation=True)
test_inputs = {key: value.to(device) for key, value in test_inputs.items()}

# Perform inference on test data
# The forward pass runs over fixed-size row slices of the already-tokenized
# tensors instead of the whole test set at once, so peak memory is bounded
# by eval_batch_size rather than by the number of test sentences.
eval_batch_size = 128

loaded_model.eval()
logit_chunks = []
with torch.no_grad():
    for start in range(0, len(test_sentences), eval_batch_size):
        # Each tensor in test_inputs has one row per test sentence, so
        # slicing along the first dimension selects the same sentences in
        # every tensor.
        batch = {key: value[start:start + eval_batch_size] for key, value in test_inputs.items()}
        outputs = loaded_model(**batch)
        logit_chunks.append(outputs.logits)

    # Stitch the per-slice logits back together in sentence order.
    logits = torch.cat(logit_chunks, dim=0)
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()

# Convert test labels to NumPy array
true_labels = torch.tensor(test_labels).numpy()

# Print sentences, correct answers, and predicted labels
# Iterate over every evaluated sentence. Two sentences are built per test
# record, so looping over range(len(test_data)) would stop after the first
# half of the examples.
for example_number, (example_sentence, gold, predicted) in enumerate(
    zip(test_sentences, true_labels, predicted_labels), start=1
):
    print(f"Example {example_number}:")
    print("Sentence:", example_sentence)
    print("Correct Answer:", gold + 1)  # Add 1 to convert back to 1-indexing
    print("Predicted Label:", predicted + 1)  # Add 1 to convert back to 1-indexing
    print("--------------")

# Evaluate model performance
# Fraction of sentences whose predicted label equals the stored label.
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy on the test set: {accuracy}")