In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Step 1: Data Preparation
# Example dataset of user reviews and sentiment labels
reviews = [
    ("This movie is fantastic!", "positive"),
    ("The acting was terrible.", "negative"),
    ("Great plot and characters.", "positive"),
    ("Disappointing ending.", "negative")
]

# Split data into text and labels
texts = [review[0] for review, _ in reviews]
labels = [review[1] for _, review in reviews]

# Step 2: Tokenization with BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Step 3: Split dataset into train and test sets
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    tokenized_texts['input_ids'], labels, test_size=0.2, random_state=42
)

# Convert labels to PyTorch tensors
train_labels = torch.tensor([1 if label == 'positive' else 0 for label in train_labels])
test_labels = torch.tensor([1 if label == 'positive' else 0 for label in test_labels])

# Step 4: Load pretrained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Step 5: Fine-tune BERT model on sentiment classification task
model.train()
for epoch in range(3):  # Example: Train for 3 epochs
    optimizer.zero_grad()
    outputs = model(train_inputs, labels=train_labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

# Step 6: Evaluation
model.eval()
with torch.no_grad():
    outputs = model(test_inputs)
    predictions = torch.argmax(outputs.logits, dim=1)
    accuracy = accuracy_score(test_labels.tolist(), predictions.tolist())
    conf_matrix = classification_report(test_labels.tolist(), predictions.tolist())

# Print evaluation metrics
print("Accuracy:", accuracy)

print("Confusion Matrix:\n", conf_matrix)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 1.0
Confusion Matrix:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1

