In [1]:
# If needed in Colab, run the following commands:
# !pip install torch==2.1.0 torchtext==0.16.0
# !pip install "portalocker>=2.0.0"

import torch
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm
import time
import hashlib

In [None]:
# Takes a long time...
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip > /dev/null 2>&1

In [14]:
# Global variables
SEED = 42  # fixed seed for PyTorch's global random number generator
BATCH_SIZE = 8
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 1
DROPOUT = 0.2
NUM_LAYERS = 2
NUM_EPOCHS = 5
GLOVE_PATH = "glove.6B.100d.txt"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed once, before any cell draws random numbers, so that operations relying on
# PyTorch's default generator (weight initialisation, shuffling, random splits)
# start from the same state after every "Restart & Run All".
torch.manual_seed(SEED)
print(DEVICE)

cuda


### Building Vocab using tokenizer

In [15]:
# Load tokenizer
# Module-level tokenizer obtained from torchtext under the name "basic_english";
# yield_tokens and text_pipeline both call it on raw review text.
tokenizer = get_tokenizer("basic_english")

# Build vocab
def yield_tokens(data_iter):
    """Lazily tokenize the text of every two-item record in ``data_iter``.

    Args:
        data_iter: Iterable of two-item records. The first item is ignored;
            the second is passed to the module-level ``tokenizer``.

    Yields:
        The tokenizer's result for each record, in input order.
    """
    yield from (tokenizer(text) for _label, text in data_iter)

# Iterator over the IMDB training split; it is consumed by the vocabulary build below.
train_iter = IMDB(split='train')
# The vocabulary is built from the tokenized training texts, with '<unk>' as the
# only special token.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])  # Default index for unknown words
VOCAB_SIZE = len(vocab)
# NOTE(review): padding reuses the '<unk>' index, so padded positions and
# out-of-vocabulary words share the same id (and embedding row). A dedicated
# '<pad>' special token would keep the two apart - confirm whether that is wanted.
PAD_IDX = vocab['<unk>']

### Checking data and building loaders

In [16]:
# Raw text -> list of vocabulary indices
def text_pipeline(text):
    """Tokenize ``text`` and look the tokens up in the module-level ``vocab``.

    Args:
        text: Raw string handed to the module-level ``tokenizer``.

    Returns:
        Whatever ``vocab`` returns when called with the token list.
    """
    tokens = tokenizer(text)
    return vocab(tokens)

# Raw dataset label -> zero-based label
def label_pipeline(label):
    """Shift a raw label down by one (e.g. 1 -> 0, 2 -> 1).

    NOTE(review): presumably the raw IMDB labels are 1 and 2, which would make
    the result a 0/1 target - confirm for the torchtext version in use.
    """
    zero_based = label - 1
    return zero_based

# Collate function handed to the DataLoaders
def collate_batch(batch):
    """Turn a list of ``(label, text)`` samples into padded tensors.

    Each text goes through ``text_pipeline`` and becomes an int32 tensor; each
    label goes through ``label_pipeline``. Sequences are right-padded with
    ``PAD_IDX`` to the longest one in the batch.

    Args:
        batch: Iterable of ``(label, text)`` pairs.

    Returns:
        Tuple ``(texts, labels)``: ``texts`` is the batch-first padded int32
        tensor, ``labels`` a float32 tensor with one entry per sample.
    """
    # One pass over the batch; for each sample the text is encoded before its label.
    encoded_pairs = [
        (torch.tensor(text_pipeline(text), dtype=torch.int32), label_pipeline(label))
        for label, text in batch
    ]
    sequences = [sequence for sequence, _ in encoded_pairs]
    targets = [target for _, target in encoded_pairs]

    padded = pad_sequence(sequences, batch_first=True, padding_value=PAD_IDX)
    return padded, torch.tensor(targets, dtype=torch.float32)

# Fingerprint of a sample, computed from its text only
def hash_data(data):
    """Return the MD5 hex digest of a sample's UTF-8 encoded text.

    Args:
        data: Two-item record; the first item is ignored, so records that
            differ only in that item hash identically.

    Returns:
        32-character hexadecimal digest string.
    """
    _label, text = data
    digest = hashlib.md5()
    digest.update(text.encode('utf-8'))
    return digest.hexdigest()

# Drop samples whose hash has already been seen
def remove_duplicates(data):
    """Return the items of ``data`` with duplicates removed, order preserved.

    Two items count as duplicates when ``hash_data`` gives them the same hash;
    the first occurrence is the one that is kept.

    Args:
        data: Iterable of samples accepted by ``hash_data``.

    Returns:
        List of the first occurrence of every distinct hash, in input order.
    """
    first_by_hash = {}
    for sample in data:
        # setdefault only stores the sample when its hash is new; dicts keep
        # insertion order, so first occurrences come out in input order.
        first_by_hash.setdefault(hash_data(sample), sample)
    return list(first_by_hash.values())

def verify_no_overlap(train_datas, val_datas, test_datas, hash_fn):
    """Assert that no sample hash is shared between any two of the three splits.

    Args:
        train_datas: Iterable of training samples.
        val_datas: Iterable of validation samples.
        test_datas: Iterable of test samples.
        hash_fn: Callable mapping a sample to a hashable fingerprint.

    Raises:
        AssertionError: When two splits share a fingerprint. The checks run in
            the order train/validation, train/test, validation/test.
    """
    # Fingerprint every split (train first, then validation, then test).
    train_hashes, val_hashes, test_hashes = (
        {hash_fn(sample) for sample in split}
        for split in (train_datas, val_datas, test_datas)
    )

    assert not (train_hashes & val_hashes), "Overlap between train and validation"
    assert not (train_hashes & test_hashes), "Overlap between train and test"
    assert not (val_hashes & test_hashes), "Overlap between validation and test"

    print("The data sets are well disjointed.")

In [17]:
# Pool both IMDB splits and drop duplicate reviews before re-splitting.
train_iter = IMDB(split='train')
test_iter = IMDB(split='test')
all_data = remove_duplicates([*train_iter, *test_iter])
all_hashes = list(map(hash_data, all_data))

# Check for duplicates: every remaining sample must have its own hash.
print(
    "Caution: Duplicate data exists!"
    if len(all_hashes) != len(set(all_hashes))
    else "All data is unique."
)

All data is unique.


In [18]:
# 80% / 10% / 10% split; the test share takes the rounding remainder so the
# three sizes always add up to len(all_data).
TRAIN_SIZE = int(0.8 * len(all_data))
VAL_SIZE = int(0.1 * len(all_data))
TEST_SIZE = len(all_data) - TRAIN_SIZE - VAL_SIZE

# A dedicated, explicitly seeded generator keeps the membership of each split
# identical across kernel restarts and across re-executions of this cell;
# without it, samples move between train and test from one run to the next.
SPLIT_SEED = 42
train_datas, val_datas, test_datas = random_split(
    all_data,
    [TRAIN_SIZE, VAL_SIZE, TEST_SIZE],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
# Redundant once remove_duplicates has run on all_data; kept as a safety net.
verify_no_overlap(train_datas, val_datas, test_datas, hash_fn=hash_data)

The data sets are well disjointed.


In [19]:
# Training batches are reshuffled every epoch; the incomplete last batch is dropped.
train_loader = DataLoader(train_datas, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch, drop_last=True)
# Evaluation does not benefit from shuffling. A fixed order also keeps batch
# composition - and therefore the per-batch padding added by collate_batch -
# identical between evaluations, so validation figures are comparable.
val_loader = DataLoader(val_datas, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch, drop_last=False)
# Test samples are served one at a time, so no padding is added to them.
test_loader = DataLoader(test_datas, batch_size=1, shuffle=False, collate_fn=collate_batch, drop_last=False)

# Print loaders' sizes (number of samples in the underlying datasets, not number of batches)
print(f"Size train loader : {len(train_loader.dataset)}")
print(f"Size validation loader : {len(val_loader.dataset)}")
print(f"Size test loader : {len(test_loader.dataset)}")

Size train loader : 39665
Size validation loader : 4958
Size test loader : 4959


## Loading pre-trained embeddings

In [20]:
def load_glove_embeddings(vocab, path=GLOVE_PATH, embedding_dim=EMBEDDING_DIM):
    """Build an embedding matrix for ``vocab`` from a GloVe text file.

    Each line of the file is expected to hold a word followed by its vector
    components, separated by whitespace. Only words present in ``vocab`` are
    parsed; every other line is skipped, so no full copy of the GloVe table
    is kept in memory.

    Args:
        vocab: Vocabulary exposing ``__len__`` and ``get_stoi()`` (a
            word -> row index mapping).
        path: Path of the GloVe text file.
        embedding_dim: Number of leading components kept per vector. A value
            smaller than the file's dimension truncates the vectors.

    Returns:
        Float tensor of shape ``(len(vocab), embedding_dim)``. Rows of words
        missing from the file stay all-zero.
    """
    word_to_index = vocab.get_stoi()
    weights_matrix = torch.zeros((len(vocab), embedding_dim))

    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines (e.g. a trailing newline)
            idx = word_to_index.get(values[0])
            if idx is None:
                continue  # word not in our vocabulary: nothing to store
            weights_matrix[idx] = torch.tensor(
                [float(val) for val in values[1:embedding_dim + 1]],
                dtype=torch.float32,
            )
    return weights_matrix

# Build the embedding matrix once at module level; the classifier constructors
# below use it as the default value of their `pretrained_embs` argument.
GLOVE_EMBS = load_glove_embeddings(vocab)
# With the default arguments the shape is (len(vocab), EMBEDDING_DIM).
print(GLOVE_EMBS.shape)

torch.Size([100683, 100])


## Define models

In [21]:
class LSTMClassifier(nn.Module):
    """Binary text classifier: embedding -> stacked LSTM -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows of the trainable embedding (ignored when
            ``pretrained`` is true).
        pad_idx: Currently unused; accepted so existing callers keep working.
        embed_size: Width of the trainable embedding (ignored when
            ``pretrained`` is true, where the width of ``pretrained_embs``
            is used instead).
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of output units of the final linear layer.
        dropout: Dropout probability between stacked LSTM layers.
        num_layers: Number of stacked LSTM layers.
        pretrained_embs: 2-D float tensor of pretrained embedding weights.
        pretrained: When true, use ``pretrained_embs`` as a frozen embedding.
    """

    def __init__(self, vocab_size=VOCAB_SIZE, pad_idx=PAD_IDX, embed_size=EMBEDDING_DIM, hidden_size=HIDDEN_DIM,
                 output_size=OUTPUT_DIM, dropout=DROPOUT, num_layers=NUM_LAYERS, pretrained_embs=GLOVE_EMBS, pretrained=False):
        super().__init__()
        if pretrained:
            self.embedding = nn.Embedding.from_pretrained(pretrained_embs, freeze=True) # To ensure weights are not trained
        else:
            self.embedding = nn.Embedding(vocab_size, embed_size)

        # Size the LSTM input from the embedding that was actually built, so a
        # pretrained matrix whose width differs from `embed_size` still works.
        # Inter-layer dropout only exists with more than one layer; passing 0
        # for a single layer avoids PyTorch's warning without changing results.
        self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_size, num_layers=num_layers,
                            dropout=dropout if num_layers > 1 else 0.0, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid scores for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices; batch-first, i.e. presumably
                ``(batch, seq_len)`` as produced by ``collate_batch``.

        Returns:
            Tensor of shape ``(batch, output_size)`` with values in (0, 1).
        """
        embedded = self.embedding(x)
        # Only the final hidden state is needed; the per-step outputs are discarded.
        _, (hidden, _) = self.lstm(embedded)
        # hidden[-1] is the last layer's hidden state after the final time step.
        output = self.fc(hidden[-1])
        return self.sigmoid(output)

In [22]:
class GRUClassifier(nn.Module):
    """Binary text classifier: embedding -> stacked GRU -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows of the trainable embedding (ignored when
            ``pretrained`` is true).
        embed_size: Width of the trainable embedding (ignored when
            ``pretrained`` is true, where the width of ``pretrained_embs``
            is used instead).
        hidden_size: Size of the GRU hidden state.
        output_size: Number of output units of the final linear layer.
        dropout: Dropout probability between stacked GRU layers.
        num_layers: Number of stacked GRU layers.
        pretrained_embs: 2-D float tensor of pretrained embedding weights.
        pretrained: When true, use ``pretrained_embs`` as a frozen embedding.
    """

    def __init__(self, vocab_size=VOCAB_SIZE, embed_size=EMBEDDING_DIM, hidden_size=HIDDEN_DIM,
                 output_size=OUTPUT_DIM, dropout=DROPOUT, num_layers=NUM_LAYERS, pretrained_embs=GLOVE_EMBS, pretrained=False):
        super().__init__()
        if pretrained:
            self.embedding = nn.Embedding.from_pretrained(pretrained_embs, freeze=True) # To ensure weights are not trained
        else:
            self.embedding = nn.Embedding(vocab_size, embed_size)

        # Size the GRU input from the embedding that was actually built, so a
        # pretrained matrix whose width differs from `embed_size` still works.
        # Inter-layer dropout only exists with more than one layer; passing 0
        # for a single layer avoids PyTorch's warning without changing results.
        self.gru = nn.GRU(self.embedding.embedding_dim, hidden_size, num_layers=num_layers,
                          dropout=dropout if num_layers > 1 else 0.0, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid scores for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices; batch-first, i.e. presumably
                ``(batch, seq_len)`` as produced by ``collate_batch``.

        Returns:
            Tensor of shape ``(batch, output_size)`` with values in (0, 1).
        """
        embedded = self.embedding(x)
        # For this unidirectional, batch-first GRU the last layer's final hidden
        # state holds the same values as gru_out[:, -1, :]; reading it directly
        # matches LSTMClassifier and avoids slicing the full output.
        _, hidden = self.gru(embedded)
        output = self.fc(hidden[-1])
        return self.sigmoid(output)

## Train and test functions

In [23]:
# One training pass over a loader
def train_epoch(model, data_loader, criterion, optimizer):
    """Train ``model`` for one epoch and report per-sample loss and accuracy.

    Args:
        model: Module returning one score per sample with shape ``(batch, 1)``.
        data_loader: Iterable of ``(text, labels)`` tensor batches.
        criterion: Loss callable taking ``(scores, labels)``. The returned loss
            is a per-sample mean only if the criterion sums over the batch.
        optimizer: Optimizer updating the model's parameters.

    Returns:
        Tuple ``(loss, accuracy)``, each divided by the number of samples seen.
        A score of at least 0.5 counts as the positive class.
    """
    model.train()
    loss_sum = 0
    correct = 0
    seen = 0
    for inputs, targets in tqdm(data_loader):
        inputs = inputs.to(DEVICE)
        targets = targets.to(DEVICE)

        optimizer.zero_grad()
        # Squeeze only dim 1 so a batch of one sample keeps its batch dimension.
        scores = model(inputs).squeeze(dim=1)
        batch_loss = criterion(scores, targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        predicted_positive = scores >= 0.5
        correct += (predicted_positive == targets).sum().item()
        seen += targets.size(0)
    return loss_sum / seen, correct / seen

# Evaluation pass over a loader (no parameter updates)
def evaluate(model, data_loader, criterion):
    """Score ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Module returning one score per sample with shape ``(batch, 1)``.
        data_loader: Iterable of ``(text, labels)`` tensor batches.
        criterion: Loss callable taking ``(scores, labels)``. The returned loss
            is a per-sample mean only if the criterion sums over the batch.

    Returns:
        Tuple ``(loss, accuracy)``, each divided by the number of samples seen.
        A score of at least 0.5 counts as the positive class.
    """
    model.eval()
    loss_sum = 0
    correct = 0
    seen = 0
    with torch.no_grad():
        for inputs, targets in tqdm(data_loader):
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)

            # Squeeze only dim 1 so a batch of one sample keeps its batch dimension.
            scores = model(inputs).squeeze(dim=1)
            loss_sum += criterion(scores, targets).item()

            predicted_positive = scores >= 0.5
            correct += (predicted_positive == targets).sum().item()
            seen += targets.size(0)
    return loss_sum / seen, correct / seen

def count_trainable_parameters(model):
    """Count the scalar entries of every parameter that requires gradients.

    Args:
        model: Object whose ``parameters()`` yields tensors.

    Returns:
        Total number of elements over parameters with ``requires_grad`` set;
        0 when there are none.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [29]:
# GRU classifier on top of the frozen pretrained embeddings, moved to DEVICE.
model = GRUClassifier(pretrained=True)
model.to(DEVICE)
# reduction='sum' pairs with train_epoch/evaluate, which divide the accumulated
# loss by the number of samples to report a per-sample mean.
criterion = nn.BCELoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), lr=0.001)
print(f"Number of trainable params: {count_trainable_parameters(model)}")

# One training pass followed by one validation pass per epoch.
for epoch in range(NUM_EPOCHS):
    # The reported time is wall-clock and covers both passes.
    start_time = time.time()
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc = evaluate(model, val_loader, criterion)
    end_time = time.time()

    print(f"Epoch: {epoch+1}/{NUM_EPOCHS} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Time: {end_time-start_time:.2f}s")

Number of trainable params: 187521


  0%|          | 0/4958 [00:00<?, ?it/s]

  0%|          | 0/620 [00:00<?, ?it/s]

Epoch: 1/5 | Train Loss: 0.5888 | Train Acc: 0.6591 | Val Loss: 0.4745 | Val Acc: 0.7969 | Time: 37.07s


  0%|          | 0/4958 [00:00<?, ?it/s]

  0%|          | 0/620 [00:00<?, ?it/s]

Epoch: 2/5 | Train Loss: 0.3769 | Train Acc: 0.8345 | Val Loss: 0.3357 | Val Acc: 0.8552 | Time: 42.33s


  0%|          | 0/4958 [00:00<?, ?it/s]

  0%|          | 0/620 [00:00<?, ?it/s]

Epoch: 3/5 | Train Loss: 0.3004 | Train Acc: 0.8725 | Val Loss: 0.2836 | Val Acc: 0.8840 | Time: 43.10s


  0%|          | 0/4958 [00:00<?, ?it/s]

  0%|          | 0/620 [00:00<?, ?it/s]

Epoch: 4/5 | Train Loss: 0.2564 | Train Acc: 0.8956 | Val Loss: 0.2568 | Val Acc: 0.8939 | Time: 43.31s


  0%|          | 0/4958 [00:00<?, ?it/s]

  0%|          | 0/620 [00:00<?, ?it/s]

Epoch: 5/5 | Train Loss: 0.2227 | Train Acc: 0.9100 | Val Loss: 0.2354 | Val Acc: 0.9020 | Time: 43.24s
