In [1]:
import codecs
from datetime import datetime
import json
from pathlib import Path
import os
import glob
import numpy as np
import torch 
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, DistributedSampler, random_split
from torch.nn.utils import clip_grad_norm_
!pip install torch-summary
from torchsummary import summary
import torch.multiprocessing as mp
from transformers import AutoTokenizer
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
import math

from data_loading_utils import read_lines_from_file_as_data_chunks
import time  # Import the time module
import threading
from concurrent.futures import ThreadPoolExecutor





In [2]:
class WPDataset(Dataset):
    """
    A class loading clean text from txt files to be used as an input 
    to PyTorch DataLoader.

    Datapoints are sequences of words (tokenized) + label (next token). If the 
    words have not been seen before (i.e, they are not found in the
    'word_to_id' dict), they will be mapped to the unknown word '<UNK>'.
    chunk_size: how much we read from the file at the time - we could play around with it. 
    """
    def __init__(self, filenames, tokenizer, samples_length=5, chunk_size=1000000, artificial_padding=True):
        self.sequences = [] # X
        self.labels = [] # Y 
        self.tokenizer = tokenizer
        self.samples_length = samples_length
        self.artificial_padding = artificial_padding
        self.pad_token_id = tokenizer.pad_token_id  # Get the PAD token ID = 0 
        
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(self.read_file, filename, chunk_size) for filename in filenames]
            for future in futures:
                future.result()  # Ensure all files are processed
        # Convert lists to numpy arrays for faster access and better memory management
        self.sequences = np.array(self.sequences)
        self.labels = np.array(self.labels)

    def read_file(self, filename, chunk_size):
        print("Read in ", filename)
        start_time = time.time()
        try:
            read_lines_from_file_as_data_chunks(filename, chunk_size, self.process_lines)
        except FileNotFoundError:
            print(f"File not found: {filename}")
        except Exception as e:
            print(f"An error occurred: {e}")
        end_time = time.time()  # End the timer
        print(f"Time taken to read {filename}: {end_time - start_time:.2f} seconds")

    def process_lines(self, data, eof, file_name):
        """
        eof: end of file 
        Callback function to process lines read from file.
        """
        if not eof:
            text = data.strip()  # Remove leading/trailing whitespace
            # split sentence into sub-sentences so that it can be passed to tokenizer, which has a max capacity of 512 
            line_chunks = self.split_into_chunks(text) 
            for chunk in line_chunks:
                line_tokens = self.tokenizer.tokenize(chunk) # data is already lower case 
                line_tokens_ids = self.tokenizer.convert_tokens_to_ids(line_tokens)
                self.create_sequences(line_tokens_ids)
        else:
            print(f"Finished reading file: {file_name}")

    def split_into_chunks(self, line, max_length=512):
        """Splits a long line into chunks of max_length tokens."""
        return [line[i:i + max_length] for i in range(0, len(line), max_length)]

    def create_sequences(self, token_ids):
        """
        Create sequences and labels from tokenized text.
        """
        n = self.samples_length
        if self.artificial_padding:
            k = 0 
            while k < len(token_ids) - n:
                for i in range(1, n + 1):
                    seq = token_ids[k:i+k] + [self.pad_token_id] * (n - i)
                    label = token_ids[i + k]
                    self.sequences.append(seq)
                    self.labels.append(label)
                k += n
            remaining_tokens = len(token_ids) - k
            if remaining_tokens > 1:
                for i in range(1, remaining_tokens):
                    seq = token_ids[k:i+k] + [self.pad_token_id] * (n - i)
                    label = token_ids[i + k]
                    self.sequences.append(seq)
                    self.labels.append(label)     
        else: 
            # Ensure all sequences are of length samples_length
            for i in range(self.samples_length, len(token_ids)): # sliding window 
                seq = token_ids[i-self.samples_length:i]
                label = token_ids[i]
                self.sequences.append(seq)
                self.labels.append(label)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx]), torch.tensor(self.labels[idx])

In [3]:
class RNN(nn.Module):
    """
    Recurrent Neural Network (RNN) with optional GRU or LSTM units.

    Attributes:
    - no_of_output_symbols (int): Size of the output vocabulary.
    - embedding_size (int): Dimensionality of the embeddings.
    - hidden_size (int): Number of features in the hidden state.
    - num_layers (int): Number of recurrent layers.
    - use_GRU (bool): If True, use GRU; otherwise, use LSTM.
    - dropout (float): Dropout probability.
    - device (torch.device): Device for the model ('cpu', 'mps' or 'cuda').
    """
    def __init__(self, embedding_size, hidden_size, no_of_output_symbols, device, num_layers, use_GRU, dropout):
        super().__init__()
        self.no_of_output_symbols = no_of_output_symbols
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.use_GRU = use_GRU
        self.dropout = dropout

        # initialize layers
        self.embedding = nn.Embedding(no_of_output_symbols, embedding_size)
        if use_GRU == True:
            self.rnn = nn.GRU(embedding_size, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout)
        else:
            self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.output = nn.Linear( hidden_size, no_of_output_symbols )
        self.device = device
        self.to(device)

    def forward(self, x, hidden):
        """
        x is a list of lists of size (batch_size, max_seq_length)
        Each inner list contains word IDs and represents one datapoint (n tokens).
       
        Returns:
        the output from the RNN: logits for the predicted next word, hidden state
        """
        x_emb = self.embedding(x) # x_emb shape: (batch_size, max_seq_length, emb_dim)
        if self.use_GRU:
            output, hidden = self.rnn(x_emb, hidden) # output shape: (batch_size, max_seq_length, hidden)
        else:
            output, (h_n, c_n) = self.rnn(x_emb, hidden)  # LSTM expects a tuple (hidden state, cell state)
            hidden = (h_n, c_n)
            
        return self.output(output[:, -1, :]), hidden # logit shape: (batch_size, 1, vocab_size)
    
 

In [32]:
def evaluate(dataloader, model, device, criterion):
    correct, incorrect, total_loss = 0, 0, 0
    model.eval()
    hidden = None
    for seq, label in dataloader:
        sequence, label = seq.to(device), label.to(device)
        logits, hidden = model(sequence, hidden)
        _, predicted_word_ids = logits.topk(1)
        assert (label.shape == predicted_word_ids.squeeze(1).shape)
        total_loss += criterion(logits.squeeze(1), label).item()
        comparison = torch.eq(label, predicted_word_ids.squeeze(1))
        count_same_entries = torch.sum(comparison).item()
        #count_same_entries = (label == predicted_word_ids.squeeze(1)).sum().item()
        
        correct += count_same_entries
        incorrect += label.shape[0] - count_same_entries

    print( "Correctly predicted words    : ", correct )
    print( "Incorrectly predicted words  : ", incorrect )
    print( "Accuracy                     : ", correct / (correct + incorrect))
    print( "PPL                          : ", math.exp(total_loss/len(dataloader))
    
    return correct / (correct + incorrect), math.exp(total_loss/len(dataloader))
    

In [33]:
# ================ Hyper-parameters ================ #

batch_size = 64
embedding_size = 50 #16
hidden_size = 64 #25
num_layers = 2
seq_length = 5      # number of tokens used as a datapoint
learning_rate = 0.001
epochs = 1
num_processes = 4
use_GRU = True
dropout = 0.5
artificial_padding = True

# ====================== Data ===================== #

# select files with text for training (will also be used for test and validation dataset)
filenames = ['data/clean_data/news_summarization.txt'] 
filenames = ['data/clean_data/twitter.txt']
filenames = ['data/clean_data/articles.txt']

# choose tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Reproducibility
np.random.seed(5719)

device = (
"cuda"
if torch.cuda.is_available()
else "mps"
if torch.backends.mps.is_available()
else "cpu"
)
print( "Running on", device )

# set up dataloaders
dataset = WPDataset(filenames=filenames, tokenizer=tokenizer, samples_length=seq_length, artificial_padding=artificial_padding)


generator = torch.Generator().manual_seed(42)
training_data, validation_data, test_data = random_split(dataset, [0.8, 0.05, 0.15], generator=generator)

print( "There are", len(training_data), " training datapoints and ", tokenizer.vocab_size, "unique tokens in the dataset" ) 
val_dataloader = DataLoader(validation_data, batch_size=batch_size, drop_last=True, num_workers=4, shuffle=True)
train_dataloader = DataLoader(training_data, batch_size=batch_size, drop_last=True, num_workers=4, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, drop_last=True, num_workers=4, shuffle=True)

# ==================== Training ==================== #

rnn_model = RNN(embedding_size, hidden_size, no_of_output_symbols=tokenizer.vocab_size, device=device, num_layers=num_layers, use_GRU=use_GRU, dropout=dropout).to(device)
optimizer = optim.Adam(rnn_model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
summary(rnn_model)

# Check if a checkpoint exists
checkpoint_path = 'best_model.pth'

if os.path.exists(checkpoint_path):
    # Load the saved model state
    rnn_model = torch.load(checkpoint_path)

prev_accuracy = 0

for epoch in range(epochs):
    total_loss = 0
    hidden = None
    with tqdm(train_dataloader, desc="Epoch {}".format(epoch + 1)) as tepoch:
        rnn_model.train()
        for sequence, label in tepoch:
            sequence, label = sequence.to(device), label.to(device)
            optimizer.zero_grad()
            logits, hidden = rnn_model(sequence, hidden)
            if use_GRU:
                hidden = hidden.detach()  # Detach hidden states to avoid backprop through the entire sequence
            else: 
                hidden = tuple([h.detach() for h in hidden])    
            loss = criterion(logits.squeeze(), label)
            loss.backward()
            
            clip_grad_norm_(rnn_model.parameters(), 5)
            optimizer.step()
            total_loss += loss.item()
    print("Epoch", epoch, "loss:", total_loss )
    total_loss = 0
    torch.save(rnn_model, checkpoint_path)
    print("Evaluating on the validation data...")
    accuracy, ppl = evaluate(val_dataloader, rnn_model, device, criterion)
    if accuracy > prev_accuracy:
        prev_accuracy = accuracy
        path = 'best_model.pth'
        torch.save(rnn_model, path)


# Save the model state
checkpoint_path = 'checkpoint_path_epoch_' + str(epoch) + '.pth'
torch.save(rnn_model, checkpoint_path)

# ==================== Evaluation ==================== #

rnn_model.eval()
print( "Evaluating on the test data..." )

print( "Number of test sentences: ", len(test_dataloader) )
print()

test_accuracy, ppl = evaluate(test_dataloader, rnn_model, device, criterion)

# ==================== Save the model  ==================== #

dt = str(datetime.now()).replace(' ','_').replace(':','_').replace('.','_')
newdir = 'model_' + dt
os.mkdir( newdir )
torch.save( rnn_model, os.path.join(newdir, 'rnn.model') )

settings = {
    'epochs': epochs,
    'learning_rate': learning_rate,
    'batch_size': batch_size,
    'hidden_size': hidden_size,
    'embedding_size': embedding_size,
    'num_layers': num_layers,
    'dropout': dropout,
    'use_GRU': use_GRU,
    'test_accuracy': test_accuracy
}
with open( os.path.join(newdir, 'settings.json'), 'w' ) as f:
    json.dump(settings, f)

s = f"accuracy: {test_accuracy}, epochs: {epochs}, num_layers: {num_layers}, use_GRU: {use_GRU}, dropout: {dropout}, embedding_size: {embedding_size}, hidden_size: {hidden_size}, batch_size: {batch_size}, learning_rate: {learning_rate}"
with open("experiments.txt", 'a') as f:
    f.write(s + '\n')  


Running on cuda
Read in  data/clean_data/articles.txt
Finished reading file: data/clean_data/articles.txt
Time taken to read data/clean_data/articles.txt: 1.91 seconds
There are 559893  training datapoints and  30522 unique tokens in the dataset
Layer (type:depth-idx)                   Param #
├─Embedding: 1-1                         1,526,100
├─GRU: 1-2                               47,232
├─Linear: 1-3                            1,983,930
Total params: 3,557,262
Trainable params: 3,557,262
Non-trainable params: 0


Epoch 1: 100%|██████████| 8748/8748 [00:27<00:00, 320.24it/s]

Epoch 0 loss: 52778.69981575012
Evaluating on the validation data...





Correctly predicted words    :  4463
Incorrectly predicted words  :  30481
Accuracy                     :  0.12771863553113552
PPL                          :  1.1003528007311816
Evaluating on the test data...
Number of test sentences:  1640

Correctly predicted words    :  13565
Incorrectly predicted words  :  91395
Accuracy                     :  0.12923971036585366
PPL                          :  1.100034508706035


In [None]:
# ==================== Experiment 1  ==================== #

In [7]:
import nltk
from nltk.corpus import words

class RNNpredictor:
    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def filter_vocab_by_prefix(self, vocab, prefix):
        if prefix == None:
            return vocab
        # return {token: idx for token, idx in vocab.items() if token.startswith(prefix)}
        filtered_vocab = {}
        filtered_subtokens = {}
        for token, idx in vocab.items():
            if token.startswith(prefix):
                filtered_vocab[token] = idx
            elif token.startswith('##') and token[2:].isalpha():
                filtered_subtokens[token] = idx
        return filtered_vocab, filtered_subtokens

    def mask_logits_by_vocab(self, logits, filtered_vocab):
        mask = torch.full_like(logits, float('-inf'))
        for token, idx in filtered_vocab.items():
            mask[idx] = logits[idx]
        return mask

    def mask_logits_by_subword(self, mask, logits, filtered_subwords):
        for token, idx in filtered_subwords.items():
            mask[idx] = logits[idx]
        return mask

    def remove_last_word(self, input_string, cut=True):
        last_space_index = input_string.rfind(' ')
        if last_space_index == -1:
            return None, input_string.lower()
        else:
            if cut:
                # if prompt is longer than seven words, cut it
                words = input_string.lower().split()
                last_seven_words = words[-7:]
                result = ' '.join(last_seven_words)
                if input_string[-1] == " ":
                    result += " "
                last_space_index = result.rfind(' ')
                input_string = result
            return input_string[:last_space_index], input_string[last_space_index + 1:]

    def predict_next_word(self, prompt, number_of_suggestions, max_subwords=3):
        self.model.eval()

        input_text = prompt
        hidden = None
        vocab = self.tokenizer.get_vocab()
        #nltk.download('words')
        english_words = set(words.words())

        # remove last word from prompt (word that is supposed to be predicted)
        prompt, prefix = self.remove_last_word(prompt, True)
        full_prompt, _ = self.remove_last_word(input_text, False)
        if prompt == None:
            tokens = [self.tokenizer.cls_token_id]
        else:
            tokens = self.tokenizer.encode(prompt, add_special_tokens=False)
        input_ids_start = torch.tensor(tokens).unsqueeze(0).to(self.device)  # Add batch dimension
        input_ids = input_ids_start


        first_pass = []
        suggestions = []
        i = 0
        while len(suggestions) < number_of_suggestions:
            generated_subwords = []
            for subword in range(max_subwords):
                if len(suggestions) == 0 and i == 0:
                    with torch.no_grad():
                        outputs, hidden = self.model(input_ids, hidden)
                        next_token_logits = outputs.squeeze()  # Get the logits for the last token

                    # filter by prefix
                    filtered_vocab, filtered_subwords = self.filter_vocab_by_prefix(vocab, prefix)
                    # Mask the logits based on the filtered vocabulary
                    masked_logits = self.mask_logits_by_vocab(next_token_logits, filtered_vocab)
                    # Mask the logits for most common '##'
                    masked_logits = self.mask_logits_by_subword(masked_logits, next_token_logits, filtered_subwords)
                    # Normalize the masked logits to get probabilities
                    probs = torch.softmax(masked_logits, dim=-1)
                    first_pass = probs.topk(len(filtered_vocab)+len(filtered_subwords)).indices.tolist()
                    next_token_id = first_pass[i]
                elif len(generated_subwords) == 0:
                    next_token_id = first_pass[i]
                else:
                    # filter by prefix
                    filtered_vocab, _ = self.filter_vocab_by_prefix(vocab, generated_subwords[-1])
                    if len(filtered_vocab) == 0:
                        break

                    with torch.no_grad():
                        outputs, hidden = self.model(input_ids, hidden)
                        next_token_logits = outputs.squeeze()  # Get the logits for the last toke

                    # Mask the logits based on the filtered vocabulary
                    masked_logits = self.mask_logits_by_vocab(next_token_logits, filtered_vocab)
                    # Find most likely end
                    next_token_id = masked_logits.topk(1).indices.tolist()[0]

                # Decode the generated subwords so far
                subword_text = self.tokenizer.decode([next_token_id], clean_up_tokenization_spaces=True)
                # print("subword", subword_text, subword_text.lower() in english_words)

                # Check if the last token can complete a word
                if not subword_text.startswith('[unused') and subword_text != self.tokenizer.pad_token:
                    if subword == 0:
                        i += 1
                    # is the word complete?
                    if subword_text.lower() in english_words and len(generated_subwords) == 0:
                        suggestions.append(subword_text)
                        break
                    # Check if it's not a continuation of a word
                    if not subword_text.startswith("##") and len(generated_subwords) > 0:
                        break
                    if subword_text.startswith("##"):
                        # is the word complete?
                        if len(generated_subwords) == 0 and prefix + subword_text[2:] in english_words:
                            if prefix + subword_text[2:] not in suggestions:
                                suggestions.append(prefix + subword_text[2:])
                            break
                        else:
                            if len(generated_subwords) > 0:
                                if generated_subwords[-1] + subword_text[2:] in english_words:
                                    if generated_subwords[-1] + subword_text[2:] not in suggestions:
                                        suggestions.append(generated_subwords[-1] + subword_text[2:])
                                    break
                                else:
                                    generated_subwords.append(generated_subwords[-1] + subword_text[2:])
                                    next_token_id_input = self.tokenizer.encode(generated_subwords[-1] + subword_text[2:], add_special_tokens=False)
                                    input_ids = torch.cat([input_ids, torch.tensor([next_token_id_input]).to(self.device)], dim=1).to(self.device)  # Append the predicted token to the input
                            else:
                                generated_subwords.append(prefix + subword_text[2:])
                                next_token_id_input = self.tokenizer.encode(prefix + subword_text[2:], add_special_tokens=False)
                                input_ids = torch.cat([input_ids, torch.tensor([next_token_id_input]).to(self.device)], dim=1).to(self.device)  # Append the predicted token to the input
            input_ids = input_ids_start
        return suggestions

In [8]:
## Test padding and no padding models on sample mobile sentences

def experiment1(with_padding):
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # Reproducibility
    np.random.seed(5719)
    
    device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
    )
    print( "Running on", device )
    
    if with_padding:
        rnn_model = torch.load('Final_with_padding/rnn.model')
    else:
        rnn_model = torch.load('Final_no_padding/rnn.model')
    rnn = RNNpredictor(rnn_model, tokenizer, device)
    
    
    rnn_model.eval()
    hidden = None
    total_loss = 0
    correct_5, incorrect_5 = 0,0
    correct_1, incorrect_1 = 0,0
    with open('samples_mobile_test.txt') as f:
        for line in f:
            sentence = line.strip().split()
            sequence = ' '.join(sentence[:-1])
            sequence += ' '
            target = sentence[-1]
            next_words = rnn.predict_next_word(sequence, 5)
            if target in next_words:
                correct_5 += 1
            else:
                incorrect_5 += 1
            if target == next_words[0]:
                correct_1 += 1
            else: 
                incorrect_1 += 1
    word_accuracy_5 = correct_5 / (correct_5 + incorrect_5)
    if with_padding:
        print("The top-5 word-level accuracy of the model with artificial padding is ", word_accuracy_5)
    else: 
        print("The top-5 word-level accuracy of the model without artificial padding is ", word_accuracy_5)
    
    word_accuracy_1 = correct_1 / (correct_1 + incorrect_1)
    if with_padding:
        print("The top-1 word-level accuracy of the model with artificial padding is ", word_accuracy_1)
    else:
        print("The top-1 word-level accuracy of the model without artificial padding is ", word_accuracy_1)


experiment1(True)
experiment1(False)

Running on cuda


2024-06-04 10:12:56.774088: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-04 10:12:56.804076: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-04 10:12:56.804117: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-04 10:12:56.804937: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-04 10:12:56.809996: I tensorflow/core/platform/cpu_feature_guar

The top-5 word-level accuracy of the model with artificial padding is  0.245
The top-1 word-level accuracy of the model with artificial padding is  0.09
Running on cuda
The top-5 word-level accuracy of the model without artificial padding is  0.22
The top-1 word-level accuracy of the model without artificial padding is  0.1


In [None]:
# ==================== Experiment 2  ==================== #

In [28]:
def evaluate(dataloader, model, device, criterion):
    model.eval()
    correct, incorrect, total_loss = 0, 0, 0
    hidden = None
    for seq, label in dataloader:
        sequence, label = seq.to(device), label.to(device)
        logits, _ = model(sequence, hidden)
        _, predicted_word_ids = logits.topk(1)
        assert (label.shape == predicted_word_ids.squeeze(1).shape)
        total_loss += criterion(logits.squeeze(1), label).item()
        comparison = torch.eq(label, predicted_word_ids.squeeze(1))
        count_same_entries = torch.sum(comparison).item()
        #count_same_entries = (label == predicted_word_ids.squeeze(1)).sum().item()
        
        correct += count_same_entries
        incorrect += label.shape[0] - count_same_entries

    print( "Correctly predicted words    : ", correct )
    print( "Incorrectly predicted words  : ", incorrect )
    print( "Accuracy                     : ", correct / (correct + incorrect))
    print("PPL                           : ", math.exp(total_loss/len(dataloader))
    
    return correct / (correct + incorrect), math.exp(total_loss/len(dataloader))
  

In [30]:
def experiment2evaluation(news, twitter, articles, with_padding):
    # ==================== Model Setup ==================== #
    batch_size = 64
    embedding_size = 50 #16
    hidden_size = 64 #25
    num_layers = 2
    seq_length = 5      # number of tokens used as a datapoint
    learning_rate = 0.001
    epochs = 1
    num_processes = 4
    use_GRU = True
    dropout = 0.5
    artificial_padding = True
    
    
    device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
    )
    print( "Running on", device )

    if with_padding:
        rnn_model = torch.load('Final_with_padding/rnn.model')
    else:
        rnn_model = torch.load('Final_no_padding/rnn.model')
    criterion = nn.CrossEntropyLoss()
    
    # ==================== Data Setup ==================== #
    filenames = []
    
    if news:
        filenames.append('data/clean_data/news_summarization.txt')
    if twitter: 
        filenames.append('data/clean_data/twitter.txt')
    if articles:
        filenames.append('data/clean_data/articles.txt')
    
    # choose tokenizer
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    # Reproducibility
    np.random.seed(5719)

    # set up dataloaders
    dataset = WPDataset(filenames=filenames, tokenizer=tokenizer, samples_length=seq_length, artificial_padding=artificial_padding)
    generator = torch.Generator().manual_seed(42)
    training_data, validation_data, test_data = random_split(dataset, [0.8, 0.05, 0.15], generator=generator)
    
    val_dataloader = DataLoader(validation_data, batch_size=batch_size, drop_last=True, num_workers=4, shuffle=True)
    train_dataloader = DataLoader(training_data, batch_size=batch_size, drop_last=True, num_workers=4, shuffle=True)
    test_dataloader = DataLoader(test_data, batch_size=batch_size, drop_last=True, num_workers=4, shuffle=True)
    
    
    # ==================== Evaluation ==================== #
    rnn_model.eval()
    print( "Evaluating on the test data..." )
    
    print( "Number of test sentences: ", len(test_dataloader) )
    print()
    
    test_accuracy, ppl = evaluate(test_dataloader, rnn_model, device, criterion)

news = False
twitter = True
articles = False
with_padding = True
experiment2evaluation(news, twitter, articles, with_padding)

news = True
twitter = False
articles = False
with_padding = True
experiment2evaluation(news, twitter, articles, with_padding)

Running on cuda
Read in  data/clean_data/twitter.txt
Finished reading file: data/clean_data/twitter.txt
Time taken to read data/clean_data/twitter.txt: 451.49 seconds
Evaluating on the test data...
Number of test sentences:  256621

Correctly predicted words    :  1979975
Incorrectly predicted words  :  14443769
Accuracy                     :  0.12055564188043846
PPL                           :  1.1110886593104148
Running on cuda
Read in  data/clean_data/news_summarization.txt
Finished reading file: data/clean_data/news_summarization.txt
Time taken to read data/clean_data/news_summarization.txt: 189.64 seconds
Evaluating on the test data...
Number of test sentences:  116749

Correctly predicted words    :  1124400
Incorrectly predicted words  :  6347536
Accuracy                     :  0.15048308764957302
PPL                           :  1.1017415763762994
