In [1]:
import codecs
from datetime import datetime
import time 
import json
from pathlib import Path
import os
import glob
import numpy as np
import math
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from concurrent.futures import ThreadPoolExecutor
import threading

import torch 
import torch.nn.functional as F
from torch import nn, optim, Tensor
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, DistributedSampler, random_split
from torch.nn.utils import clip_grad_norm_
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.multiprocessing as mp
from torchsummary import summary
from transformers import AutoTokenizer

from data_loading_utils import read_lines_from_file_as_data_chunks





# Dataset

In [2]:
class WPDataset(Dataset):
    """
    A class loading clean text from txt files to be used as an input 
    to PyTorch DataLoader.

    Datapoints are sequences of words (tokenized) + label (next token). If the 
    words have not been seen before (i.e, they are not found in the
    'word_to_id' dict), they will be mapped to the unknown word '<UNK>'.
    chunk_size: how much we read from the file at the time - we could play around with it. 
    """
    def __init__(self, filenames, tokenizer, samples_length=5, chunk_size=1000000, artificial_padding=True):
        self.sequences = [] # X
        self.labels = [] # Y 
        self.tokenizer = tokenizer
        self.samples_length = samples_length
        self.artificial_padding = artificial_padding
        self.pad_token_id = tokenizer.pad_token_id  # Get the PAD token ID = 0 
        
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(self.read_file, filename, chunk_size) for filename in filenames]
            for future in futures:
                future.result()  # Ensure all files are processed
        # Convert lists to numpy arrays for faster access and better memory management
        self.sequences = np.array(self.sequences)
        self.labels = np.array(self.labels)

    def read_file(self, filename, chunk_size):
        print("Read in ", filename)
        start_time = time.time()
        try:
            read_lines_from_file_as_data_chunks(filename, chunk_size, self.process_lines)
        except FileNotFoundError:
            print(f"File not found: {filename}")
        except Exception as e:
            print(f"An error occurred: {e}")
        end_time = time.time()  # End the timer
        print(f"Time taken to read {filename}: {end_time - start_time:.2f} seconds")

    def process_lines(self, data, eof, file_name):
        """
        eof: end of file 
        Callback function to process lines read from file.
        """
        if not eof:
            text = data.strip()  # Remove leading/trailing whitespace
            # split sentence into sub-sentences so that it can be passed to tokenizer, which has a max capacity of 512 
            line_chunks = self.split_into_chunks(text) 
            for chunk in line_chunks:
                line_tokens = self.tokenizer.tokenize(chunk) # data is already lower case 
                line_tokens_ids = self.tokenizer.convert_tokens_to_ids(line_tokens)
                self.create_sequences(line_tokens_ids)
        else:
            print(f"Finished reading file: {file_name}")

    def split_into_chunks(self, line, max_length=512):
        """Splits a long line into chunks of max_length tokens."""
        return [line[i:i + max_length] for i in range(0, len(line), max_length)]

    def create_sequences(self, token_ids):
        """
        Create sequences and labels from tokenized text.
        """
        n = self.samples_length
        if self.artificial_padding:
            k = 0 
            while k < len(token_ids) - n:
                for i in range(1, n + 1):
                    seq = token_ids[k:i+k] + [self.pad_token_id] * (n - i)
                    label = token_ids[i + k]
                    self.sequences.append(seq)
                    self.labels.append(label)
                k += n
            remaining_tokens = len(token_ids) - k
            if remaining_tokens > 1:
                for i in range(1, remaining_tokens):
                    seq = token_ids[k:i+k] + [self.pad_token_id] * (n - i)
                    label = token_ids[i + k]
                    self.sequences.append(seq)
                    self.labels.append(label)     
        else: 
            # Ensure all sequences are of length samples_length
            for i in range(self.samples_length, len(token_ids)): # sliding window 
                seq = token_ids[i-self.samples_length:i]
                label = token_ids[i]
                self.sequences.append(seq)
                self.labels.append(label)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx]), torch.tensor(self.labels[idx])

# Model

In [3]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, embedding_dim]``
        """
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class TransformerModel(nn.Module):

    def __init__(self, device, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.1, last_output=True, max_len=5):
        super().__init__()
        self.model_type = 'Transformer'
        self.last_output = last_output
        
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        if last_output:
            self.linear = nn.Linear(d_model, ntoken)
        else:
            self.linear = nn.Linear(d_model*max_len, ntoken)
        self.device = device
        self.to(device)
        
    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[batch_size, seq_len]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[batch_size, seq_len, ntoken]``
        """
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask = nn.Transformer.generate_square_subsequent_mask(src.size(1)).to(src.device)
        output = self.transformer_encoder(src, src_mask)
        if self.last_output:
            output = self.linear(output[:, -1, :])  # Take the last token's output
            return output
        else:
            flattened_transf = output.reshape(src.size(0), 1, -1)  # Flatten the output
            result = self.linear(torch.tanh(flattened_transf))
            return result.squeeze(1)

# Evaluate function

In [15]:
def evaluate(dataloader, model, device, criterion):
    correct, incorrect, total_loss = 0, 0, 0
    for seq, label in dataloader:
        sequence, label = seq.to(device), label.to(device)
        logits = model(sequence)
        _, predicted_word_ids = logits.topk(1)
        assert (label.shape == predicted_word_ids.squeeze(1).shape)
        total_loss += criterion(logits.squeeze(1), label).item()
        comparison = torch.eq(label, predicted_word_ids.squeeze(1))
        count_same_entries = torch.sum(comparison).item()
        #count_same_entries = (label == predicted_word_ids.squeeze(1)).sum().item()
        
        correct += count_same_entries
        incorrect += label.shape[0] - count_same_entries

    print( "Correctly predicted words    : ", correct )
    print( "Incorrectly predicted words  : ", incorrect )
    print( "Accuracy                     : ", correct / (correct + incorrect))
    print( "PPL                          : ", math.exp(total_loss/(correct + incorrect)))
    
    return correct / (correct + incorrect), math.exp(total_loss/(correct + incorrect))

# Data initialization

In [6]:
seq_length = 5
batch_size = 64

In [18]:
# select files with text for training (will also be used for test and validation dataset)
filenames = ['data/clean_data/articles.txt'] #,'data/clean_data/twitter.txt', 'data/clean_data/news_summarization.txt']

# choose tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  
dataset = WPDataset(filenames=filenames, tokenizer=tokenizer, samples_length=seq_length)

Read in  data/clean_data/articles.txt
Finished reading file: data/clean_data/articles.txt
Time taken to read data/clean_data/articles.txt: 1.91 seconds


In [19]:
# set up dataloaders
generator = torch.Generator().manual_seed(42)
training_data, validation_data, test_data = random_split(dataset, [0.8, 0.05, 0.15], generator=generator)

print("There are: ", len(dataset), "total number of samples in the dataset.")
print( "There are: ", len(training_data), " training datapoints and ", tokenizer.vocab_size, "unique tokens in the vocabulary" ) 
print( "There are: ", len(validation_data), " validation datapoints")
print( "There are: ", len(test_data), " testing datapoints")

val_dataloader = DataLoader(validation_data, batch_size=batch_size, drop_last=True, num_workers=4, shuffle=True)
train_dataloader = DataLoader(training_data, batch_size=batch_size, drop_last=True, num_workers=4, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, drop_last=True, num_workers=4, shuffle=True)

There are:  699865 total number of sables in the dataset.
There are:  559893  training datapoints and  30522 unique tokens in the vocabulary


# Hyper-parameters

In [9]:
learning_rate = 0.001
n_epoch = 2

ntoken = tokenizer.vocab_size  # Vocabulary size - BERT
num_processes = 4
d_model = 64  # Model dimension
nhead = 8  # Number of attention heads 12 | 8
d_hid = 512  # Hidden dimension in the feedforward layer - or maybe even 256? often it's 4*d_model
nlayers = 3  # Number of transformer layers
dropout = 0.1  # Dropout rate

# Training

In [10]:
# Reproducibility
np.random.seed(5719)

device = (
"cuda"
if torch.cuda.is_available()
else "mps"
if torch.backends.mps.is_available()
else "cpu"
)
print( "Running on", device )

Running on cuda


In [11]:
# Create model 
model = TransformerModel(device, ntoken=ntoken, d_model=d_model, nhead=nhead, nlayers=nlayers, 
                         d_hid=d_hid, dropout=dropout).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
summary(model)

Layer (type:depth-idx)                        Param #
├─PositionalEncoding: 1-1                     --
|    └─Dropout: 2-1                           --
├─TransformerEncoder: 1-2                     --
|    └─ModuleList: 2-2                        --
|    |    └─TransformerEncoderLayer: 3-1      83,008
|    |    └─TransformerEncoderLayer: 3-2      83,008
|    |    └─TransformerEncoderLayer: 3-3      83,008
├─Embedding: 1-3                              1,953,408
├─Linear: 1-4                                 1,983,930
Total params: 4,186,362
Trainable params: 4,186,362
Non-trainable params: 0


Layer (type:depth-idx)                        Param #
├─PositionalEncoding: 1-1                     --
|    └─Dropout: 2-1                           --
├─TransformerEncoder: 1-2                     --
|    └─ModuleList: 2-2                        --
|    |    └─TransformerEncoderLayer: 3-1      83,008
|    |    └─TransformerEncoderLayer: 3-2      83,008
|    |    └─TransformerEncoderLayer: 3-3      83,008
├─Embedding: 1-3                              1,953,408
├─Linear: 1-4                                 1,983,930
Total params: 4,186,362
Trainable params: 4,186,362
Non-trainable params: 0

In [12]:
# Check if a checkpoint exists
path_to_load = 'weights/transformer_news_articles.pt'

if os.path.exists(path_to_load):
    # Load the saved model state
    model.load_state_dict(torch.load(path_to_load))


In [13]:
model.train()
#prev_accuracy = 0
path_to_save = 'weights/transformer_news_articles_twitter.pt'
for epoch in range(n_epoch):
    total_loss = 0
    hidden = None
    with tqdm(train_dataloader, desc="Epoch {}".format(epoch + 1), mininterval=50) as tepoch:
        for sequence, label in tepoch:
            sequence, label = sequence.to(device), label.to(device)
            optimizer.zero_grad()
            
            logits = model(sequence)       
            loss = criterion(logits.squeeze(1), label)
            loss.backward()
            #clip_grad_norm_(model.parameters(), 5) TODO 
            optimizer.step()
            total_loss += loss.item()
    print("Epoch ", epoch+1, " loss: ", total_loss)
    total_loss = 0    
    print("Saving Model")
    torch.save(model.state_dict(), path_to_save)


Epoch 1:  51%|█████     | 692890/1368650 [12:23:06<12:04:44, 15.54it/s]


KeyboardInterrupt: 

# Evaluation on test set - Experiment 2

In [20]:
model.eval()
print( "Evaluating on the test data..." )
test_accuracy, ppl = evaluate(test_dataloader, model, device, criterion)

Evaluating on the test data...
Number of test sentences:  1640

Correctly predicted words    :  12315
Incorrectly predicted words  :  92645
Accuracy                     :  0.11733041158536585
PPL                          :  1.113897906775641
Test acc:  0.11733041158536585
ppl:  1.113897906775641
