In [2]:
!pip install transformers datasets --quiet


In [4]:
from datasets import load_dataset
from transformers import BertTokenizer
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

# Set seed for reproducibility
torch.manual_seed(42)

# Load dataset
dataset = load_dataset("imdb")

# Use smaller dataset for faster training/testing (optional)
def shrink_dataset(data, size=1000):
    return data.select(range(size))

dataset["train"] = shrink_dataset(dataset["train"], size=1000)
dataset["test"] = shrink_dataset(dataset["test"], size=200)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Custom dataset wrapper
class IMDbDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def __len__(self):
        return len(self.labels)

# Tokenize and prepare DataLoaders
def prepare_data(tokenizer, dataset_split, max_len=512, batch_size=8):
    texts = dataset_split["text"]
    labels = dataset_split["label"]
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len, return_tensors='pt')
    imdb_dataset = IMDbDataset(encodings, labels)
    dataloader = DataLoader(imdb_dataset, batch_size=batch_size, shuffle=True)
    return dataloader, imdb_dataset

train_loader, train_dataset = prepare_data(tokenizer, dataset["train"])
val_loader, val_dataset = prepare_data(tokenizer, dataset["test"])

# Check dataset sizes
print(f"Train Dataset Size: {len(train_dataset)}")
print(f"Validation Dataset Size: {len(val_dataset)}")

# Peek at one sample
sample = train_dataset[0]
print("\nSample Keys:", sample.keys())
print("Input IDs:", sample['input_ids'][:10])  # Just first 10 tokens
print("Label:", sample['labels'])



Train Dataset Size: 1000
Validation Dataset Size: 200

Sample Keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Input IDs: tensor([  101,  1045, 12524,  1045,  2572,  8025,  1011,  3756,  2013,  2026])
Label: tensor(0)


In [5]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import time

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load pre-trained BERT tokenizer and model for binary classification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

# Confirm model loaded and check classifier layer
print("\nModel Loaded Successfully.")
print(model.classifier)


Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Loaded Successfully.
Linear(in_features=768, out_features=2, bias=True)


In [8]:
from torch.utils.data import DataLoader
from transformers import BertTokenizer
import torch

# Ensure you have your tokenizer loaded (use a specific model)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize the text data
def tokenize_data(dataset, tokenizer, max_length=512):
    input_ids = []
    attention_masks = []
    labels = []

    for item in dataset:
        # Debugging: print dataset entry to ensure proper structure
        print(f"Tokenizing: {item}")
        
        # Tokenize each text in the dataset
        encoding = tokenizer.encode_plus(
            item['text'],                      # The text from the dataset
            truncation=True,                    # Truncate longer sequences
            padding='max_length',               # Pad sequences to max_length
            max_length=max_length,             # Limit the length to max_length
            add_special_tokens=True,           # Add special tokens [CLS] and [SEP]
            return_attention_mask=True,        # Return attention mask
            return_tensors='pt'                # Return as PyTorch tensors
        )
        
        # Ensure encoding returns tensors
        input_ids.append(encoding['input_ids'].squeeze(0))        # Remove batch dimension
        attention_masks.append(encoding['attention_mask'].squeeze(0))
        labels.append(item['label'])                               # Append label
    
    # Convert to tensors
    input_ids = torch.stack(input_ids)
    attention_masks = torch.stack(attention_masks)
    labels = torch.tensor(labels)
    
    return input_ids, attention_masks, labels

# Example dataset structure for debugging
train_dataset = [
    {"text": "I love machine learning", "label": 1},
    {"text": "This is a great tutorial", "label": 0},
    {"text": "BERT is an amazing model", "label": 1}
]

# Apply tokenization to the dataset
input_ids, attention_masks, labels = tokenize_data(train_dataset, tokenizer)

# Create DataLoader for batching
train_data = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)
train_dataloader = DataLoader(train_data, batch_size=2, shuffle=True)

# Sample output check
print(f"Sample Input IDs: {input_ids[0]}")
print(f"Sample Attention Mask: {attention_masks[0]}")
print(f"Sample Label: {labels[0]}")


Tokenizing: {'text': 'I love machine learning', 'label': 1}
Tokenizing: {'text': 'This is a great tutorial', 'label': 0}
Tokenizing: {'text': 'BERT is an amazing model', 'label': 1}
Sample Input IDs: tensor([ 101, 1045, 2293, 3698, 4083,  102,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,


TRAINING CODE (with datset loader)

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


# Load CSV
df = pd.read_csv(r"C:\Users\srevatshen\imdb-emotion-classifier\data\imdb_reviews.csv")

df = df[['review', 'sentiment']].dropna()

# Encode labels: positive -> 1, negative -> 0
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review'].tolist(), df['sentiment'].tolist(), test_size=0.2, random_state=42)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            'input_ids': encodings['input_ids'].squeeze(),
            'attention_mask': encodings['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Dataloaders
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    acc = correct / total * 100
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f} - Accuracy: {acc:.2f}%")

# Save model + tokenizer
model.save_pretrained("saved_model/")
tokenizer.save_pretrained("saved_model/")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|█████████████████████████████████████████████████████████████████████| 200/200 [16:29<00:00,  4.95s/it]


Epoch 1 - Loss: 0.6770 - Accuracy: 55.75%


Epoch 2/3: 100%|█████████████████████████████████████████████████████████████████████| 200/200 [16:35<00:00,  4.98s/it]


Epoch 2 - Loss: 0.3567 - Accuracy: 84.31%


Epoch 3/3: 100%|█████████████████████████████████████████████████████████████████████| 200/200 [16:00<00:00,  4.80s/it]


Epoch 3 - Loss: 0.1388 - Accuracy: 95.25%


('saved_model/tokenizer_config.json',
 'saved_model/special_tokens_map.json',
 'saved_model/vocab.txt',
 'saved_model/added_tokens.json')

In [None]:
MODEL EVALUATION

In [None]:
from sklearn.metrics import classification_report, accuracy_score
import torch
from tqdm import tqdm

# Evaluation function
def evaluate(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Accuracy
    acc = accuracy_score(all_labels, all_preds)
    print(f"\n Validation Accuracy: {acc * 100:.2f}%")

    # Classification report
    print("\n Classification Report:")
    print(classification_report(all_labels, all_preds, target_names=['Negative', 'Positive']))

# Load model if needed
# model = BertForSequenceClassification.from_pretrained("saved_model/")
# tokenizer = BertTokenizer.from_pretrained("saved_model/")
# model.to(device)

# Evaluate
evaluate(model, val_loader, device)


In [None]:
STREAMLIT LIBRARY FOR FRONT END IMPLEMENTATION OF THE SENTIMENT ANALYSIS MODEL

In [4]:
!pip install streamlit


Defaulting to user installation because normal site-packages is not writeable
Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting protobuf<6,>=3.20 (from streamlit)
  Downloading protobuf-5.29.4-cp310-abi3-win_amd64.whl.metadata (592 bytes)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-win_amd64.whl.metadata (44 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting narwhals>=1.14.2 (from altair<6,>=4.0->