# LAB-6-B

### Author

- [Navaneeth Sivakumar - 21BAI1302](https://github.com/Sivakumar-Navaneeth)

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

### Load CSV data

In [None]:
# Read the IMDB reviews CSV into a DataFrame.
df = pd.read_csv(
    'Dataset/IMDB Dataset.csv',
)

### Split the data into training and test sets

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# repeatable, so metrics from different runs are computed on the same held-out
# rows and can be compared.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

### Load the pre-trained BERT tokenizer

In [None]:
# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

### Tokenize the text data

In [None]:
# Encode both splits with identical settings: sequences are truncated to at
# most 128 tokens, padded, and returned as PyTorch tensors.
train_encodings = tokenizer(
    train_texts.tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
test_encodings = tokenizer(
    test_texts.tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

### Create a simple dataset class

In [None]:
class SentimentDataset(Dataset):
    """Pair pre-tokenized encodings with integer sentiment labels.

    Each item is a ``(features, label)`` tuple: ``features`` maps every key of
    ``encodings`` to that key's ``idx``-th entry, and ``label`` is the matching
    entry of the label tensor.

    Args:
        encodings: Mapping-like object exposing ``.items()`` whose values can
            be indexed by sample position.
        labels: Sentiment strings, one per sample, given as a pandas Series or
            any sequence accepted by ``pd.Series``. Only the keys of
            ``LABEL_TO_ID`` are valid.

    Raises:
        ValueError: If ``labels`` contains a value that is not a key of
            ``LABEL_TO_ID``.
    """

    # Sentiment string -> class index (0 for negative, 1 for positive).
    LABEL_TO_ID = {'negative': 0, 'positive': 1}

    def __init__(self, encodings, labels):
        self.encodings = encodings

        label_series = pd.Series(labels)
        label_ids = label_series.map(self.LABEL_TO_ID)

        # Series.map yields NaN for values missing from the mapping. Left
        # unchecked, that silently produces a float label tensor, so reject
        # unknown labels here with a clear message instead.
        unmapped = label_ids.isna()
        if unmapped.any():
            unknown = sorted({repr(value) for value in label_series[unmapped].unique()})
            raise ValueError(
                "Unknown sentiment label(s): " + ", ".join(unknown)
                + "; expected one of " + ", ".join(map(repr, self.LABEL_TO_ID))
            )

        # Explicit int64 so the labels are always usable as class indices.
        self.labels = torch.tensor(label_ids.to_numpy(dtype='int64'), dtype=torch.long)

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the sample at position ``idx``."""
        return {key: val[idx] for key, val in self.encodings.items()}, self.labels[idx]



### Create datasets

In [None]:
# Wrap each split's encodings and labels in a SentimentDataset.
train_dataset = SentimentDataset(
    encodings=train_encodings,
    labels=train_labels,
)
test_dataset = SentimentDataset(
    encodings=test_encodings,
    labels=test_labels,
)

### Load BERT model for sequence classification

In [None]:
# Load the 'bert-base-uncased' checkpoint into a sequence-classification model
# configured for two output labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

### Create data loaders

In [None]:
# Batches of 8 samples: training data is shuffled, test data keeps dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=8,
    shuffle=False,
)

### Use the AdamW optimizer

In [None]:
# Use PyTorch's own AdamW: the AdamW class exported by transformers is
# deprecated in favour of torch.optim.AdamW. weight_decay=0.0 matches the
# default of the transformers implementation (torch's default is 0.01), so the
# regularisation strength stays as it was.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

### Train the model

In [None]:
from tqdm import tqdm

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model.to(device)
model.train()  # put the model in training mode

for epoch in range(2):  # Train for 2 epochs
    running_loss = 0.0
    num_batches = 0
    for batch, labels in tqdm(train_loader, desc="Training"):
        # Move the batch to the same device as the model.
        inputs = {key: val.to(device) for key, val in batch.items()}
        labels = labels.to(device)
        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
    # Report the mean loss over the epoch rather than the last mini-batch's
    # loss, which is a noisy single-batch figure; an empty loader reports nan
    # instead of raising NameError on an undefined `loss`.
    epoch_loss = running_loss / num_batches if num_batches else float('nan')
    print(f"Epoch {epoch} Loss: {epoch_loss}")

### Evaluate the model

In [None]:
# Score the fine-tuned model on the held-out split.
model.eval()
correct = 0
total = 0

# No gradients are needed for scoring, so the whole pass runs under no_grad.
with torch.no_grad():
    for batch, labels in test_loader:
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        labels = labels.to(device)

        outputs = model(**inputs)
        # The predicted class is the index of the largest logit.
        predictions = outputs.logits.argmax(dim=-1)
        correct += int((predictions == labels).sum())
        total += labels.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.2f}")