In [6]:
from tokenizer import Tokenizer
from dataset import IMDB_Dataset
import os
import pandas as pd

VOCAB_JSON = 'vocab.json'
DATASET_CSV = 'IMDB Dataset.csv'
MAX_LENGTH = 512

if not os.path.exists(VOCAB_JSON):
    tokenizer = Tokenizer()
    df = pd.read_csv(DATASET_CSV)
    tokenizer.generate_vocab(df['review'].tolist())
    tokenizer.save_vocab(VOCAB_JSON)
else:
    tokenizer = Tokenizer(vocab_json=VOCAB_JSON)

print(f"Vocab size: {tokenizer.get_vocab_size()}")

dataset = IMDB_Dataset(data_path=DATASET_CSV, max_length=MAX_LENGTH, tokenizer=tokenizer)

Vocab size: 390936


In [2]:
from torch.utils.data import DataLoader
import torch
from model import SemanticClassifier

BATCH_SIZE = 32

dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
model = SemanticClassifier(num_classes=2, vocab_size=tokenizer.get_vocab_size(), max_len=MAX_LENGTH)

device = torch.device(
    "mps" if torch.backends.mps.is_available() else
    "cuda" if torch.cuda.is_available() else
    "cpu"
)

print(f"Using device: {device}")

Using device: mps


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim

NUM_EPOCHS = 5

model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()

for epoch in range(NUM_EPOCHS):
    total_loss = 0

    for input_ids, labels in dataloader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        logits = model(input_ids)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Batch Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Loss: {avg_loss:.4f}")

Epoch 1/5 | Batch Loss: 0.6075
Epoch 1/5 | Batch Loss: 0.7358
Epoch 1/5 | Batch Loss: 0.7253
Epoch 1/5 | Batch Loss: 0.5969
Epoch 1/5 | Batch Loss: 0.5955
Epoch 1/5 | Batch Loss: 0.7046
Epoch 1/5 | Batch Loss: 0.7388
Epoch 1/5 | Batch Loss: 0.7046
Epoch 1/5 | Batch Loss: 0.5997
Epoch 1/5 | Batch Loss: 0.6113
Epoch 1/5 | Batch Loss: 0.6066
Epoch 1/5 | Batch Loss: 0.6055
Epoch 1/5 | Batch Loss: 0.6598
Epoch 1/5 | Batch Loss: 0.7458
Epoch 1/5 | Batch Loss: 0.6693
Epoch 1/5 | Batch Loss: 0.6208
Epoch 1/5 | Batch Loss: 0.6226
Epoch 1/5 | Batch Loss: 0.7472
Epoch 1/5 | Batch Loss: 0.6705
Epoch 1/5 | Batch Loss: 0.6851
Epoch 1/5 | Batch Loss: 0.5927
Epoch 1/5 | Batch Loss: 0.6506
Epoch 1/5 | Batch Loss: 0.6831
Epoch 1/5 | Batch Loss: 0.6779
Epoch 1/5 | Batch Loss: 0.7279
Epoch 1/5 | Batch Loss: 0.6171
Epoch 1/5 | Batch Loss: 0.6688
Epoch 1/5 | Batch Loss: 0.6226
Epoch 1/5 | Batch Loss: 0.6172
Epoch 1/5 | Batch Loss: 0.6929
Epoch 1/5 | Batch Loss: 0.6957
Epoch 1/5 | Batch Loss: 0.6336
Epoch 1/