In [19]:
# Import necessary libraries and modules
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch.nn as nn

In [11]:
class SentimentDataset(Dataset):
    # The constructor method which initializes some properties of the SentimentDataset class
    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
    # Method to get the length of the dataset
    def __len__(self):
        return len(self.reviews)
    # Method to retrieve a single item from the dataset at the specified index 'item'
    def __getitem__(self, item):
        review = str(self.reviews[item])[:self.max_len]
        target = self.targets[item]
        # Tokenize the review text and return tensors
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        # Return a dictionary containing the review, the input_ids tensor, the attention_mask tensor, and the target label
        return {
            'review': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

# Function to create a DataLoader from the DataFrame
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = SentimentDataset(
        reviews=df.review.to_numpy(),
        targets=df.target.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
    )

In [12]:
# Load IMDB Dataset and split it into train and test datasets
df = pd.read_csv("IMDB Dataset.csv")
df['target'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
train_df, test_df = train_test_split(df, test_size=0.2)

In [None]:
# Define the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 180
BATCH_SIZE = 16
# Create the DataLoaders
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model = model.to(device)

EPOCHS = 1

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader) * EPOCHS
loss_fn = torch.nn.CrossEntropyLoss().to(device)

In [20]:
# Training loop
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    for batch in tqdm(train_data_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)
        loss = loss_fn(logits, targets)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

Epoch 1/1
----------


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [38:45:54<00:00, 55.82s/it]


In [21]:
# Function to calculate the accuracy of the model
def calculate_accuracy(preds, targets):
    correct_predictions = (preds == targets).float() 
    acc = correct_predictions.sum() / len(correct_predictions)
    return acc

In [22]:
# Function to evaluate the model
def evaluate_model(model, data_loader, device, loss_fn):
    model = model.eval()
    
    losses = []
    accuracies = []
    
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            
            outputs = model(
              input_ids=input_ids,
              attention_mask=attention_mask
            )
            _, preds = torch.max(outputs.logits, dim=1)
            
            loss = loss_fn(outputs.logits, targets)
            losses.append(loss.item())
            
            accuracy = calculate_accuracy(preds, targets)
            accuracies.append(accuracy)
    
    return np.mean(losses), np.mean(accuracies)

In [23]:
# Evaluate the model on the test data
test_loss, test_acc = evaluate_model(
    model,
    test_data_loader,
    device,
    loss_fn
)
print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')


Test Loss: 0.3948, Test Acc: 0.8113
