In [1]:
pip install torch-summary

Note: you may need to restart the kernel to use updated packages.


In [2]:
import codecs
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer
from data_loading_utils import read_lines_from_file_as_data_chunks
import time  # Import the time module
import threading
from concurrent.futures import ThreadPoolExecutor
import math
from torch import nn, optim
from torchsummary import summary
from datetime import datetime
from tqdm import tqdm

import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset



# Dataset creation

In [3]:
class WPDataset(Dataset):
    """
    A class loading clean text from txt files to be used as an input 
    to PyTorch DataLoader.

    Datapoints are sequences of words (tokenized) + label (next token). If the 
    words have not been seen before (i.e, they are not found in the
    'word_to_id' dict), they will be mapped to the unknown word '<UNK>'.
    chunk_size: how much we read from the file at the time - we could play around with it. 
    """
    def __init__(self, filenames, tokenizer, samples_length=5, chunk_size=1000000, artificial_padding=True):
        self.sequences = [] # X
        self.labels = [] # Y 
        self.tokenizer = tokenizer
        self.samples_length = samples_length
        self.artificial_padding = artificial_padding
        self.pad_token_id = tokenizer.pad_token_id  # Get the PAD token ID = 0 
        
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(self.read_file, filename, chunk_size) for filename in filenames]
            for future in futures:
                future.result()  # Ensure all files are processed

    def read_file(self, filename, chunk_size):
        print("Read in ", filename)
        start_time = time.time()
        try:
            read_lines_from_file_as_data_chunks(filename, chunk_size, self.process_lines)
        except FileNotFoundError:
            print(f"File not found: {filename}")
        except Exception as e:
            print(f"An error occurred: {e}")
        end_time = time.time()  # End the timer
        print(f"Time taken to read {filename}: {end_time - start_time:.2f} seconds")

    def process_lines(self, data, eof, file_name):
        """
        eof: end of file 
        Callback function to process lines read from file.
        """
        if not eof:
            text = data.strip()  # Remove leading/trailing whitespace
            # split sentence into sub-sentences so that it can be passed to tokenizer, which has a max capacity of 512 
            line_chunks = self.split_into_chunks(text) 
            for chunk in line_chunks:
                line_tokens = self.tokenizer.tokenize(chunk) # data is already lower case 
                line_tokens_ids = self.tokenizer.convert_tokens_to_ids(line_tokens)
                self.create_sequences(line_tokens_ids)
        else:
            print(f"Finished reading file: {file_name}")

    def split_into_chunks(self, line, max_length=512):
        """Splits a long line into chunks of max_length tokens."""
        return [line[i:i + max_length] for i in range(0, len(line), max_length)]

    def create_sequences(self, token_ids):
        """
        Create sequences and labels from tokenized text.
        """
        n = self.samples_length
        if self.artificial_padding:
            k = 0 
            while k < len(token_ids) - n:
                for i in range(1, n + 1):
                    seq = token_ids[k:i+k] + [self.pad_token_id] * (n - i)
                    label = token_ids[i + k]
                    self.sequences.append(seq)
                    self.labels.append(label)
                k += n
            remaining_tokens = len(token_ids) - k
            if remaining_tokens > 1:
                for i in range(1, remaining_tokens):
                    seq = token_ids[k:i+k] + [self.pad_token_id] * (n - i)
                    label = token_ids[i + k]
                    self.sequences.append(seq)
                    self.labels.append(label)     
        else: 
            # Ensure all sequences are of length samples_length
            for i in range(self.samples_length, len(token_ids)): # sliding window 
                seq = token_ids[i-self.samples_length:i]
                label = token_ids[i]
                self.sequences.append(seq)
                self.labels.append(label)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx]), torch.tensor(self.labels[idx])

# Example usage

In [19]:
filenames = ['data/articles.txt'] #,'data/clean_data/news_summarization.txt', 'data/clean_data/twitter.txt', 'data/clean_data/mobile_text.txt']

# Define the tokenizer (using BERT tokenizer as an example)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = WPDataset(filenames, tokenizer)

Read in  data/articles.txt
Finished reading file: data/articles.txt
Time taken to read data/articles.txt: 6.99 seconds


In [20]:
len(dataset.sequences)

699865

In [21]:
for i in range(0,20):
    print(dataset.sequences[i], ' ', dataset.labels[i])
    print('')

[2821, 0, 0, 0, 0]   2129

[2821, 2129, 0, 0, 0]   1996

[2821, 2129, 1996, 0, 0]   19377

[2821, 2129, 1996, 19377, 0]   1038

[2821, 2129, 1996, 19377, 1038]   8017

[8017, 0, 0, 0, 0]   2098

[8017, 2098, 0, 0, 0]   11834

[8017, 2098, 11834, 0, 0]   27014

[8017, 2098, 11834, 27014, 0]   2020

[8017, 2098, 11834, 27014, 2020]   1996

[1996, 0, 0, 0, 0]   2279

[1996, 2279, 0, 0, 0]   2502

[1996, 2279, 2502, 0, 0]   2518

[1996, 2279, 2502, 2518, 0]   2256

[1996, 2279, 2502, 2518, 2256]   8069

[8069, 0, 0, 0, 0]   2020

[8069, 2020, 0, 0, 0]   3712

[8069, 2020, 3712, 0, 0]   2152

[8069, 2020, 3712, 2152, 0]   4408

[8069, 2020, 3712, 2152, 4408]   7168



In [22]:
dataset.labels[1]

1996

# Model 

https://pytorch.org/tutorials/beginner/transformer_tutorial.html  

In [4]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, embedding_dim]``
        """
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [37]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.1, last_output=True, max_len=5):
        super().__init__()
        self.model_type = 'Transformer'
        self.last_output = last_output
        
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        if last_output:
            self.linear = nn.Linear(d_model, ntoken)
        else:
            self.linear = nn.Linear(d_model*max_len, ntoken)
        
    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[batch_size, seq_len]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[batch_size, seq_len, ntoken]``
        """
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask = nn.Transformer.generate_square_subsequent_mask(src.size(1)).to(src.device)
        output = self.transformer_encoder(src, src_mask)
        if self.last_output:
            output = self.linear(output[:, -1, :])  # Take the last token's output
            return output
        else:
            flattened_transf = output.reshape(src.size(0), 1, -1)  # Flatten the output
            result = self.linear(torch.tanh(flattened_transf))
            return result.squeeze(1)

# Model Information

In [59]:
ntoken = 37000
d_model = 512
num_layers = 6 # 6
dim_hid = 2048
seq_length = 5
expansion_factor = 4
n_heads = 8
dropout = 0.1

model = TransformerModel(ntoken=ntoken, d_model=d_model, nhead=n_heads, nlayers=num_layers, d_hid=dim_hid, dropout=dropout)

In [8]:
# Example usage
ntoken = 30522  # Vocabulary size
d_model = 768  # Model dimension
nhead = 12  # Number of attention heads
d_hid = 3072  # Hidden dimension in the feedforward layer
nlayers = 3  # Number of transformer layers
dropout = 0.1  # Dropout rate

model = TransformerModel(ntoken, d_model, nhead, d_hid, nlayers, dropout)

In [9]:
summary(model)

Layer (type:depth-idx)                        Param #
├─PositionalEncoding: 1-1                     --
|    └─Dropout: 2-1                           --
├─TransformerEncoder: 1-2                     --
|    └─ModuleList: 2-2                        --
|    |    └─TransformerEncoderLayer: 3-1      7,087,872
|    |    └─TransformerEncoderLayer: 3-2      7,087,872
|    |    └─TransformerEncoderLayer: 3-3      7,087,872
├─Embedding: 1-3                              23,440,896
├─Linear: 1-4                                 23,471,418
Total params: 68,175,930
Trainable params: 68,175,930
Non-trainable params: 0


Layer (type:depth-idx)                        Param #
├─PositionalEncoding: 1-1                     --
|    └─Dropout: 2-1                           --
├─TransformerEncoder: 1-2                     --
|    └─ModuleList: 2-2                        --
|    |    └─TransformerEncoderLayer: 3-1      7,087,872
|    |    └─TransformerEncoderLayer: 3-2      7,087,872
|    |    └─TransformerEncoderLayer: 3-3      7,087,872
├─Embedding: 1-3                              23,440,896
├─Linear: 1-4                                 23,471,418
Total params: 68,175,930
Trainable params: 68,175,930
Non-trainable params: 0

# Training loop 

In [41]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print( "Running on", device )
batch_size = 64
learning_rate = 1e-3
n_epoch = 5

ntoken = 30522  # Vocabulary size
d_model = 768  # Model dimension
nhead = 12  # Number of attention heads
d_hid = 3072  # Hidden dimension in the feedforward layer
nlayers = 3  # Number of transformer layers - use 3 or 6
dropout = 0.1  # Dropout rate

model = TransformerModel(ntoken=ntoken, d_model=d_model, nhead=nhead, nlayers=nlayers, 
                         d_hid=d_hid, dropout=dropout).to(device)

Running on cuda


In [42]:
filenames = ['data/articles.txt'] #,'data/clean_data/news_summarization.txt', 'data/clean_data/twitter.txt', 'data/clean_data/mobile_text.txt']

# Define the tokenizer (using BERT tokenizer as an example)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = WPDataset(filenames, tokenizer)

# Define the split sizes
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Split the dataset
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

Read in  data/articles.txt
Finished reading file: data/articles.txt
Time taken to read data/articles.txt: 6.61 seconds


In [43]:
def evaluate(dataloader, model, device):
    correct, incorrect = 0, 0
    model.eval()
    for seq, label in dataloader:
        sequence, label = seq.to(device), label.to(device)
        logits = model(sequence).to(device)
        _, predicted_word_ids = logits.topk(1)
        assert (label.shape == predicted_word_ids.squeeze(1).shape)
        comparison = torch.eq(label, predicted_word_ids.squeeze(1))
        count_same_entries = torch.sum(comparison).item()
        count_same_entries = (label == predicted_word_ids.squeeze(1)).sum().item()
        
        correct += count_same_entries
        incorrect += label.shape[0] - count_same_entries

    print("Correctly predicted words    :", correct)
    print("Incorrectly predicted words  :", incorrect)
    return correct/(correct+incorrect)

In [44]:
criterion = nn.CrossEntropyLoss()
model_optimizer = optim.Adam(model.parameters(), lr=learning_rate)

best_val_accuracy = 0
model.train()  # turn on train mode
print(datetime.now().strftime("%X"), "Training starts")
for epoch in range(n_epoch):
    iteration = 0
    total_loss = 0
    with tqdm(train_loader, desc="Epoch {}".format(epoch + 1)) as tepoch:
        for sequence, label in tepoch:
            sequence, label = sequence.to(device), label.to(device)
            model_optimizer.zero_grad()
            
            logits = model(sequence).to(device)        
            loss = criterion(logits.squeeze(1), label)
            loss.backward()
            model_optimizer.step()
            
            iteration += 1
            total_loss += loss 
            # Update tqdm description with the current loss
            tepoch.set_postfix(loss=(total_loss / iteration).item())
        
    print("Epoch", epoch+1, "loss:", total_loss.detach().item()/iteration)
    
    print("Evaluating on the validation data...")
    word_level_accuracy = evaluate(val_loader, model, device)
    print("Validation accuracy:", word_level_accuracy)
    # save best model
    if word_level_accuracy > best_val_accuracy:
        torch.save(model.state_dict(), 'weights/tranformer6.pt')
        best_val_accuracy = word_level_accuracy
        

15:50:24 Training starts


Epoch 1: 100%|██████████| 8749/8749 [06:55<00:00, 21.06it/s, loss=7.18]


Epoch 1 loss: 7.184396520316608
Evaluating on the validation data...
Correctly predicted words    : 6293
Incorrectly predicted words  : 133680
Validation accuracy: 0.04495867060075872


Epoch 2: 100%|██████████| 8749/8749 [07:02<00:00, 20.71it/s, loss=7.08]


Epoch 2 loss: 7.078634879700537
Evaluating on the validation data...
Correctly predicted words    : 6293
Incorrectly predicted words  : 133680
Validation accuracy: 0.04495867060075872


Epoch 3: 100%|██████████| 8749/8749 [06:51<00:00, 21.25it/s, loss=7.07]


Epoch 3 loss: 7.068792683449537
Evaluating on the validation data...
Correctly predicted words    : 6293
Incorrectly predicted words  : 133680
Validation accuracy: 0.04495867060075872


Epoch 4: 100%|██████████| 8749/8749 [06:49<00:00, 21.39it/s, loss=7.06]


Epoch 4 loss: 7.0646172062521435
Evaluating on the validation data...
Correctly predicted words    : 6293
Incorrectly predicted words  : 133680
Validation accuracy: 0.04495867060075872


Epoch 5: 100%|██████████| 8749/8749 [06:48<00:00, 21.44it/s, loss=7.06]


Epoch 5 loss: 7.062300423619842
Evaluating on the validation data...
Correctly predicted words    : 6293
Incorrectly predicted words  : 133680
Validation accuracy: 0.04495867060075872


In [36]:
test_model = TransformerModel(ntoken=ntoken, d_model=d_model, nhead=nhead, nlayers=nlayers, 
                              d_hid=d_hid, dropout=dropout).to(device)
test_model.load_state_dict(torch.load('weights/tranformer.pt'))
evaluate(val_loader, model, device)

torch.Size([64, 1])
Correctly predicted words    : 0
Incorrectly predicted words  : 0


ZeroDivisionError: division by zero

In [None]:
import time

criterion = nn.CrossEntropyLoss()
model_optimizer = optim.Adam(model.parameters(), lr=learning_rate)

def train(model: nn.Module, n_epoch, training_loader) -> None:
    model.train()  # turn on train mode
    print( datetime.now().strftime("%X"), "Training starts" )
    for epoch in range(n_epoch):
    iteration = 0
    for input_tensor, label in training_loader:
        input_tensor, label = input_tensor.to(device), label.to(device)
        charlm_optimizer.zero_grad()
        logits = model(input_tensor).to(device)
        loss = criterion(logits.squeeze(1), label)
        loss.backward()
        charlm_optimizer.step()
        iteration += 1

    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 200
    start_time = time.time()

    num_batches = len(train_data) // bptt
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        output = model(data)
        output_flat = output.view(-1, ntokens)
        loss = criterion(output_flat, targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            seq_len = data.size(0)
            output = model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += seq_len * criterion(output_flat, targets).item()
    return total_loss / (len(eval_data) - 1)

In [None]:
best_val_loss = float('inf')
epochs = 3

with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(model)
        val_loss = evaluate(model, val_data)
        val_ppl = math.exp(val_loss)
        elapsed = time.time() - epoch_start_time
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
            f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
        print('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_params_path)

        scheduler.step()
    model.load_state_dict(torch.load(best_model_params_path)) # load best model states

In [56]:
training_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
model = TransformerModel(d_model=d_model, src_vocab_size=src_vocab_size, target_vocab_size=target_vocab_size, seq_length=seq_length,num_layers=num_layers, expansion_factor=expansion_factor, n_heads=n_heads, dropout=dropout).to(device)

criterion = nn.CrossEntropyLoss()
model_optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [58]:
model.train()
print(datetime.now().strftime("%X"), "Training starts")
for epoch in range(n_epoch) :
    iteration = 0
    for input_tensor, label in training_loader :
        sequence, label = input_tensor.to(device), label.to(device)
        model_optimizer.zero_grad()
        
        # Shift the label to create the target sequence
        #trg_input = torch.cat([torch.full((label.size(0), 1), start_token, dtype=torch.long).to(device), label[:, :-1]], dim=1)

        logits = model(sequence, label).to(device)
        loss = criterion(logits.squeeze(1), label)
        loss.backward()
        model_optimizer.step()
        iteration += 1
    print( datetime.now().strftime("%X"), "End of epoch", epoch+1, ", loss=", loss.detach().item())

14:51:19 Training starts
src shape:  torch.Size([64, 5])
trg shape:  torch.Size([64])
pe_expanded:  torch.Size([64, 5, 512])
x:  torch.Size([64, 5, 512])

pe_expanded:  torch.Size([64, 5, 512])
x:  torch.Size([64, 512])



RuntimeError: The size of tensor a (64) must match the size of tensor b (5) at non-singleton dimension 1

In [None]:
    charlm.eval()
    # Generate 50 characters starting from the input text
    try :
        char_list = list("he took out his wand and"[-MAXLEN:])
        for i in range(300) :
            input_tensor = torch.tensor( [char_to_id[c] for c in char_list] + [char_to_id[PADDING_SYMBOL]]*(MAXLEN-len(char_list))).unsqueeze(0).to(device)
            logits = charlm(input_tensor).squeeze().to(device)
            _, new_character_tensor = logits.topk(1)
            new_character = id_to_char[new_character_tensor.detach().item()]
            print( new_character, end='' )
            if len(char_list) == MAXLEN :
                char_list.pop(0)
            char_list.append( new_character )
        print()
    except KeyError :
        continue
    charlm.train()

In [13]:
# Iterate through the DataLoader
i = 0 
for batch in dataloader:
    sequences, labels = batch
    print(sequences.shape, labels.shape)
    print(sequences)
    print(labels)
    print('')     
    # Your training loop here

ValueError: too many dimensions 'str'

# Testing loading configs:
Total data: 5.862,7 MB

news_summarization.txt: 264MB 
twitter.txt: 551,9 GB

USING news_summarization.txt ONLY 

1. chunk_size=1000000, artifical_padding = False   
   time = 528.20 seconds, memory = 6.44 GB
2. chunk_size=1000000, artifical_padding = True
   time = 571.33 seconds, memory = 6.93 GB 

3. chunk_size=2000000, artifical_padding = True
   time = 561.67 seconds, memory = 6.95 GB
4. chunk_size=500000, artifical_padding = True
   time = 562.89 seconds, memory = 6.95 GB 
   
5. Thread, chunk_size=1000000, artifical_padding = True
   time = 546.93 seconds, memory = 6.86 GB

USING news_summarization.txt AND twitter.txt  

6. Thread, chunk_size=1000000, artifical_padding = True
   time = 1102.74 seconds, memory =  13.61 GB

7. No tread, chunk_size=1000000, artifical_padding = True
    time = 515.84 + 1158.22 , memory = 20.01 GB 