# In [2]:  (Jupyter notebook cell marker)
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the IMDB reviews CSV; the code below reads its "review" and
# "sentiment" columns.
df = pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv')  #Path to IMDB dataset
# Shuffle with a fixed seed so the subset selected below (and therefore the
# reported accuracy) is reproducible, consistent with the seeded
# train/test split further down.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Keep only a small subset of the shuffled reviews
num_reviews = 80
reviews = df["review"][:num_reviews].tolist()
labels = df["sentiment"][:num_reviews].tolist()

# Map sentiment strings to the integer class ids used for training.
# A sentiment value outside this mapping raises KeyError.
label_map = {"negative": 0, "positive": 1}
labels = [label_map[label] for label in labels]

# Hold out 20% of the subset for evaluation
train_reviews, test_reviews, train_labels, test_labels = train_test_split(
    reviews, labels, test_size=0.2, random_state=42
)

# Define dataset class
class IMDBDataset(Dataset):
    """Dataset pairing review texts with labels, tokenized on access.

    Each item is a dict with the keys ``'input_ids'``, ``'attention_mask'``
    and ``'labels'``. The first two are the tokenizer outputs flattened to
    one dimension; ``'labels'`` is the item's label as a ``torch.long``
    tensor.
    """

    def __init__(self, reviews, labels, tokenizer, max_length):
        """Store the texts, their labels, the tokenizer and the length cap.

        Args:
            reviews: Indexable collection of review texts.
            labels: Indexable collection of labels, aligned with ``reviews``.
            tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
                padding='max_length', truncation=True, return_tensors='pt')``.
            max_length: Value forwarded to the tokenizer as ``max_length``.
        """
        self.reviews, self.labels = reviews, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Tokenize review ``idx`` and return it together with its label."""
        text = self.reviews[idx]
        target = self.labels[idx]
        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # Flatten each tokenizer output to a 1-D tensor, as the item contract
        # in the class docstring states.
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Initialize BERT tokenizer and model (2 output classes)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define training parameters
batch_size = 8
epochs = 3
learning_rate = 2e-5

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare data loaders
train_dataset = IMDBDataset(train_reviews, train_labels, tokenizer, max_length=128)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define optimizer and a linear decay schedule with no warmup;
# the scheduler is stepped once per batch, hence batches * epochs steps.
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Move model to device
model.to(device)

# Training loop
model.train()
for epoch in range(epochs):
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the module-level `labels` list is not clobbered
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

# Evaluation
test_dataset = IMDBDataset(test_reviews, test_labels, tokenizer, max_length=128)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].cpu().numpy()

        outputs = model(input_ids, attention_mask=attention_mask)
        # Softmax is monotonic, so the argmax of the raw logits already
        # identifies the predicted class.
        predicted_labels = np.argmax(outputs.logits.cpu().numpy(), axis=1)

        predictions.extend(predicted_labels)
        true_labels.extend(batch_labels)

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print("Accuracy:", accuracy)

# Deployment
# Save the trained model weights
model_save_path = "bert_sentiment_model.pt"
torch.save(model.state_dict(), model_save_path)

# Load the saved model. A freshly constructed model lives on the CPU, so the
# weights are mapped onto `device` and the model is moved there explicitly;
# otherwise inference on a CUDA machine fails with a device mismatch.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.load_state_dict(torch.load(model_save_path, map_location=device))
model.to(device)
model.eval()

# Define a function for inference
def predict_sentiment(text):
    """Classify a single text with the module-level ``model``.

    Args:
        text: The string to classify.

    Returns:
        The index of the highest-scoring class as a Python int. With the
        label mapping used for training in this file, 0 is negative and
        1 is positive.
    """
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # Follow the model's own device instead of the global `device`: the model
    # may have been reloaded on the CPU, and mismatched devices raise a
    # RuntimeError in the forward pass.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Softmax is monotonic, so argmax over the logits gives the same class.
    return torch.argmax(logits, dim=1).item()

# Run the classifier on a handful of hand-written example sentences
new_sentences = [
    "The movie was really good.",
    "The acting was terrible.",
    "I loved every minute of it.",
    "It was a waste of time.",
    "The plot was confusing.",
    "The cinematography was stunning.",
]
# Assumed classes for the sentences above: 0 = negative, 1 = positive.
# They are paired with the sentences but not compared with the predictions.
new_labels = [1, 0, 1, 0, 0, 1]

for sent, label in zip(new_sentences, new_labels):
    if predict_sentiment(sent) == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"
    print(f"Sentence: {sent} | Predicted Sentiment: {sentiment}\n")





# --- Output captured from the original notebook run ---
# Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
# You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
#
#
# Accuracy: 0.6875
#
#
# Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
# You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
#
#
# Sentence: The movie was really good. | Predicted Sentiment: Positive
#
# Sentence: The acting was terrible. | Predicted Sentiment: Negative
#
# Sentence: I loved every minute of it. | Predicted Sentiment: Negative
#
# Sentence: It was a waste of time. | Predicted Sentiment: Negative
#
# Sentence: The plot was confusing. | Predicted Sentiment: Negative
#
# Sentence: The cinematography was stunning. | Predicted Sentiment: Negative

