<a href="https://colab.research.google.com/github/PSchloss12/intro_ai_project/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sources
- https://pytorch.org/tutorials/index.html
- https://towardsdatascience.com/language-modeling-with-lstms-in-pytorch-381a26badcbf
- https://writesonic.com/blog/how-to-train-chatgpt-own-data/
- https://arxiv.org/abs/1708.02182



In [None]:
# x_np = torch.from_numpy(np_array)
# x_data = torch.tensor(data)

# # We move our tensor to the GPU if available
# if torch.cuda.is_available():
#   tensor = tensor.to('cuda')

# # print('First row: ',tensor[0])
# # print('First column: ', tensor[:, 0])
# # print('Last column:', tensor[..., -1])


# # to save
# torch.save(model.state_dict(), PATH)

# # to load
# model = TheModelClass(*args, **kwargs)
# model.load_state_dict(torch.load(PATH))
# model.eval()

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch.optim as optim
import torchtext

import numpy as np
import tqdm

import os
import math

In [None]:
# Pick the fastest backend PyTorch can see: CUDA GPU first, then Apple's
# Metal backend (MPS), and the CPU as the last resort.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [None]:
# class NeuralNetwork(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.flatten = nn.Flatten()
#         self.linear_relu_stack = nn.Sequential(
#             nn.Linear(28*28, 512),
#             nn.ReLU(),
#             nn.Linear(512, 512),
#             nn.ReLU(),
#             nn.Linear(512, 10),
#         )

#     def forward(self, x):
#         x = self.flatten(x)
#         logits = self.linear_relu_stack(x)
#         return logits

In [None]:
# # Create NN and move to device
# model = NeuralNetwork().to(device)
# print(model)
# # get layer weights and biases
# print(f"Model structure: {model}\n\n")
# for name, param in model.named_parameters():
#     print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")

In [None]:
!pip install datasets
import datasets

# feed in data
# X = torch.__(device=device)
# logits = model(X)
# pred_probab = nn.Softmax(dim=1)(logits)
# y_pred = pred_probab.argmax(1)
# print(f"Predicted class: {y_pred}")

def get_data(dataset, vocab, batch_size):
    '''
    Flatten a tokenized dataset into one stream of token ids and fold it
    into ``batch_size`` parallel rows.

    Every example with a non-empty token list contributes its tokens followed
    by one '<eos>' marker; examples with no tokens are skipped entirely. Each
    token is converted to an integer with ``vocab[token]`` (how unknown tokens
    are handled is up to the vocab object, e.g. its default index).

    The input examples are NOT modified, so the function can safely be called
    more than once on the same data.

    Args:
        dataset: iterable of mappings that each hold a list of string tokens
            under the key 'tokens'.
        vocab: object supporting ``vocab[token] -> int``; must know '<eos>'.
        batch_size: number of rows in the returned tensor.

    Returns:
        LongTensor of shape [batch_size, num_batches] where
        ``num_batches = total_ids // batch_size``. Trailing ids that do not
        fill a complete column are dropped. Each row is a contiguous slice of
        the id stream.
    '''
    eos_id = vocab['<eos>']
    data = []
    for example in dataset:
        tokens = example['tokens']
        if not tokens:
            continue
        # encode the sequence, then mark its end with <eos>
        data.extend(vocab[token] for token in tokens)
        data.append(eos_id)
    # one long 1D tensor holding every sequence back to back
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    # drop the remainder so the stream divides evenly, then fold it into
    # a 2D tensor of dimensions [batch_size, num_batches]
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches)
    return data

#  load data
# NOTE(review): both paths point into a Google Drive mount under /content/drive;
# the drive has to be mounted before this cell runs - confirm where that happens.
train_dir = "/content/drive/MyDrive/AI/Project/train"
test_file = "/content/drive/MyDrive/AI/Project/test/red_fairybook_parsed.txt"

# Collect every regular file of the training folder. os.listdir returns entries
# in arbitrary order and also lists sub-directories, so filter to files and sort
# to keep the corpus order (and therefore the batches) reproducible.
files = sorted(
    os.path.join(train_dir, file_name)
    for file_name in os.listdir(train_dir)
    if os.path.isfile(os.path.join(train_dir, file_name))
)
# dataset = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1')

# 'text' loader: one example per line of the input files, stored under the 'text' column
dataset = datasets.load_dataset('text', data_files={'train': files, 'test': test_file})

# tokenize data, basically breaks into words and punctuation here
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')


def tokenize_data(example, tokenizer):
    '''Map one raw example to {'tokens': [...]} using the given tokenizer.'''
    return {'tokens': tokenizer(example['text'])}


# replace the raw 'text' column by the 'tokens' column
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'],
                                fn_kwargs={'tokenizer': tokenizer})

# create vocab of any word that occurs at least 3 times in the training split
# its length will be the number of neurons in the output classification layer
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3)
# manually add an <unk> token and set it as the default index so that whenever we
# request the index of a word the vocabulary doesn't have, we get <unk>
vocab.insert_token('<unk>', 0)
# add <eos> token; it is later appended to each sequence so the model learns to emit it as well
vocab.insert_token('<eos>', 1)
vocab.set_default_index(vocab['<unk>'])



In [None]:
batch_size = 128

# The corpus is loaded with only 'train' and 'test' files, but the training loop
# needs a validation split for LR scheduling and checkpoint selection.
# Hold out the last 10% of the training lines for that purpose. shuffle=False keeps
# the text contiguous, which matters because the hidden state is carried across batches.
# (The vocabulary was built from the full training split, held-out lines included.)
train_valid_split = tokenized_dataset['train'].train_test_split(test_size=0.1, shuffle=False)

train_data = get_data(train_valid_split['train'], vocab, batch_size)
valid_data = get_data(train_valid_split['test'], vocab, batch_size)
test_data = get_data(tokenized_dataset['test'], vocab, batch_size)


In [None]:
class LSTM(nn.Module):
    '''
    Word-level language model: embedding -> stacked LSTM -> linear decoder.

    Inputs are batch-first: ``src`` is a LongTensor of token ids with shape
    [batch, seq_len]; the output holds one score per vocabulary entry for
    every position, shape [batch, seq_len, vocab_size].

    Args:
        vocab_size: number of entries in the vocabulary (size of the output layer).
        embedding_dim: size of the token embeddings (LSTM input size).
        hidden_dim: size of the LSTM hidden/cell state.
        num_layers: number of stacked LSTM layers.
        dropout_rate: dropout probability used after the embedding, between
            LSTM layers and before the decoder.
        tie_weights: share one weight matrix between the embedding and the
            decoder; requires ``embedding_dim == hidden_dim``.
    '''

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate,
                tie_weights):

        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, vocab_size)

        # Make the embedding layer share its weights with the output layer.
        # This reduces the number of parameters. Both matrices are
        # [vocab_size, dim], so the dims must agree.
        if tie_weights:
            assert embedding_dim == hidden_dim, 'cannot tie, check dims'
            self.embedding.weight = self.fc.weight
        self.init_weights()

    def forward(self, src, hidden):
        '''
        Run one batch of token ids through the network.

        Args:
            src: LongTensor [batch, seq_len] of token ids.
            hidden: (h, c) tuple as returned by init_hidden / a previous call.

        Returns:
            (prediction, hidden): scores of shape [batch, seq_len, vocab_size]
            and the updated (h, c) tuple.
        '''
        embedding = self.dropout(self.embedding(src))
        output, hidden = self.lstm(embedding, hidden)
        output = self.dropout(output)
        prediction = self.fc(output)
        return prediction, hidden

    def init_weights(self):
        '''
        Initialise the embedding weights uniformly in [-0.1, 0.1] and the decoder
        and LSTM weight matrices uniformly in [-1/sqrt(H), 1/sqrt(H)]; the
        decoder bias is zeroed.

        With tied weights the embedding and decoder are the same tensor, so the
        decoder range (applied last) is the one that sticks.
        '''
        init_range_emb = 0.1
        init_range_other = 1 / math.sqrt(self.hidden_dim)
        with torch.no_grad():
            self.embedding.weight.uniform_(-init_range_emb, init_range_emb)
            self.fc.weight.uniform_(-init_range_other, init_range_other)
            self.fc.bias.zero_()
            # nn.LSTM.all_weights builds a fresh list on every access, so assigning
            # into it never reaches the module. Update the registered parameters
            # in place instead (this also keeps their [4*H, in] shapes intact).
            for layer in range(self.num_layers):
                for prefix in ('weight_ih', 'weight_hh'):
                    weight = getattr(self.lstm, f'{prefix}_l{layer}')
                    weight.uniform_(-init_range_other, init_range_other)

    def init_hidden(self, batch_size, device):
        '''
        Return an all-zero (hidden, cell) state pair, each of shape
        [num_layers, batch_size, hidden_dim], allocated on ``device``.
        '''
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=device)
        cell = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=device)
        return hidden, cell

    def detach_hidden(self, hidden):
        '''
        Cut the (hidden, cell) pair out of the autograd graph so gradients do
        not flow back into earlier batches when the state is carried over.
        '''
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

In [None]:
def evaluate(model, data, criterion, batch_size, seq_len, device):
    '''
    Compute the average per-token loss of ``model`` on ``data`` without
    updating any weights.

    The data is walked in consecutive windows of ``seq_len`` columns; the
    hidden state is carried from one window to the next.

    Args:
        model: network exposing eval(), init_hidden(), detach_hidden() and
            a forward call ``model(src, hidden) -> (prediction, hidden)``.
        data: 2D tensor of token ids, one contiguous token stream per row
            (the layout produced by get_data).
        criterion: loss taking ([N, vocab], [N]); assumed to return a mean
            over its inputs, as nn.CrossEntropyLoss does by default.
        batch_size: kept for call compatibility; the hidden state is sized
            from the actual number of rows in ``data``.
        seq_len: number of tokens per window.
        device: device the batches are moved to.

    Returns:
        Mean loss per target token (float). ``math.exp`` of it is the perplexity.

    Raises:
        ValueError: if ``data`` is too short to form a single full window
            (fewer than ``seq_len + 1`` columns).
    '''
    epoch_loss = 0
    model.eval()
    # Keep k * seq_len + 1 columns: k full windows plus the one extra column
    # that supplies the target of the last position.
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches - 1) % seq_len]
    num_batches = data.shape[-1]
    num_targets = num_batches - 1
    if num_targets < seq_len:
        raise ValueError(
            f'data needs at least seq_len + 1 = {seq_len + 1} columns to form one window'
        )

    rows = data.shape[0]
    hidden = model.init_hidden(rows, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, num_batches, idx)
            src, target = src.to(device), target.to(device)

            prediction, hidden = model(src, hidden)
            # flatten to [rows * seq_len, vocab] / [rows * seq_len] for the criterion
            prediction = prediction.reshape(rows * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            # weight the window mean by its length so the sum is per column
            epoch_loss += loss.item() * seq_len
    # normalise by the number of predicted positions, not by the column count
    return epoch_loss / num_targets

def get_batch(data, seq_len, num_batches, idx):
    '''
    Slice one (input, target) window out of the batched token matrix.

    The input covers columns ``idx .. idx + seq_len - 1`` of every row; the
    target is the same window shifted one column to the right, i.e. the next
    token for each input position. ``num_batches`` is accepted but not used.
    '''
    window_end = idx + seq_len
    inputs = data[:, idx:window_end]
    next_tokens = data[:, idx + 1:window_end + 1]
    return inputs, next_tokens

def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    '''
    Run one training epoch over ``data`` and return the average per-token loss.

    The data is walked in consecutive windows of ``seq_len`` columns. The
    hidden state is carried from window to window but detached first, so
    back-propagation stops at the window boundary.

    Args:
        model: network exposing train(), init_hidden(), detach_hidden() and
            a forward call ``model(src, hidden) -> (prediction, hidden)``.
        data: 2D tensor of token ids, one contiguous token stream per row
            (the layout produced by get_data).
        optimizer: optimizer over the model's parameters.
        criterion: loss taking ([N, vocab], [N]); assumed to return a mean
            over its inputs, as nn.CrossEntropyLoss does by default.
        batch_size: kept for call compatibility; the hidden state is sized
            from the actual number of rows in ``data``.
        seq_len: number of tokens per window.
        clip: maximum gradient norm passed to clip_grad_norm_.
        device: device the batches are moved to.

    Returns:
        Mean loss per target token (float). ``math.exp`` of it is the perplexity.

    Raises:
        ValueError: if ``data`` is too short to form a single full window
            (fewer than ``seq_len + 1`` columns).
    '''
    epoch_loss = 0
    model.train()
    # Keep k * seq_len + 1 columns: k full windows plus the one extra column
    # that supplies the target of the last position.
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches - 1) % seq_len]
    num_batches = data.shape[-1]
    num_targets = num_batches - 1
    if num_targets < seq_len:
        raise ValueError(
            f'data needs at least seq_len + 1 = {seq_len + 1} columns to form one window'
        )

    rows = data.shape[0]
    hidden = model.init_hidden(rows, device)

    # `tqdm` is imported as a module in this notebook, so the progress-bar class
    # has to be referenced as tqdm.tqdm (calling the module raises TypeError).
    # The last column can't be a src, hence the `num_batches - 1` stop.
    for idx in tqdm.tqdm(range(0, num_batches - 1, seq_len), desc='Training: ', leave=False):
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, num_batches, idx)
        src, target = src.to(device), target.to(device)
        prediction, hidden = model(src, hidden)

        # flatten to [rows * seq_len, vocab] / [rows * seq_len] for the criterion
        prediction = prediction.reshape(rows * seq_len, -1)
        target = target.reshape(-1)
        loss = criterion(prediction, target)

        loss.backward()
        # rescale gradients whose total norm exceeds `clip`
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        # weight the window mean by its length so the sum is per column
        epoch_loss += loss.item() * seq_len
    # normalise by the number of predicted positions, not by the column count
    return epoch_loss / num_targets

def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    '''
    Continue ``prompt`` by sampling up to ``max_seq_len`` tokens from the model.

    The prompt is fed once to warm up the hidden state; afterwards only the
    newly sampled token is fed at each step, because the carried hidden state
    already summarises everything before it.

    Args:
        prompt: text to continue; must yield at least one token.
        max_seq_len: maximum number of tokens to generate.
        temperature: divisor applied to the scores before softmax; values
            below 1 sharpen the distribution, above 1 flatten it. Must be non-zero.
        model: network exposing eval(), init_hidden() and a forward call
            ``model(src, hidden) -> (prediction, hidden)``.
        tokenizer: callable turning a string into a list of tokens.
        vocab: object supporting ``vocab[token] -> int`` and ``get_itos()``.
        device: device the inputs are moved to.
        seed: optional seed passed to torch.manual_seed for repeatable samples.

    Returns:
        List of string tokens: the prompt tokens followed by the generated
        ones. Generation stops early when '<eos>' is sampled (it is not included).

    Raises:
        ValueError: if the prompt produces no tokens.
    '''
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    indices = [vocab[t] for t in tokenizer(prompt)]
    if not indices:
        raise ValueError('prompt must contain at least one token')

    unk_idx = vocab['<unk>']
    eos_idx = vocab['<eos>']
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    # first step consumes the whole prompt, later steps a single token
    next_input = list(indices)
    with torch.no_grad():
        for _ in range(max_seq_len):
            src = torch.LongTensor([next_input]).to(device)
            prediction, hidden = model(src, hidden)
            # distribution over the vocabulary for the position after the last input token
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)
            token = torch.multinomial(probs, num_samples=1).item()

            # never emit <unk>: resample until something else comes up
            while token == unk_idx:
                token = torch.multinomial(probs, num_samples=1).item()

            if token == eos_idx:
                break

            indices.append(token)
            next_input = [token]

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens


# Hyperparameter Tuning & Model Initialization

In [None]:
# ---- hyperparameters ----
# Weight tying is switched on below, which is why the embedding and hidden
# sizes are given the same value.
vocab_size = len(vocab)
embedding_dim = 1024             # 400 in the paper
hidden_dim = 1024                # 1150 in the paper
num_layers = 2                   # 3 in the paper
dropout_rate = 0.65
tie_weights = True
lr = 1e-3                        # They used 30 and a different optimizer

# ---- model, loss and optimizer ----
model = LSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    dropout_rate=dropout_rate,
    tie_weights=tie_weights,
)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# count every scalar weight that will receive gradient updates
num_params = 0
for param in model.parameters():
    if param.requires_grad:
        num_params += param.numel()
print(f'The model has {num_params:,} trainable parameters')



The model has 24,170,525 trainable parameters


# Training & Evaluation

In [None]:
n_epochs = 50
seq_len = 50
clip = 0.25
saved = False                            # True: skip training and only score the stored checkpoint
checkpoint_path = 'best-val-lstm_lm.pt'  # weights of the epoch with the lowest validation loss

# Halve the learning rate as soon as an epoch fails to improve the validation loss.
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

if not saved:
    best_valid_loss = float('inf')

    # valid_data (a held-out split batched like train_data) must already exist
    # in the notebook namespace when this loop runs.
    for epoch in range(n_epochs):
        train_loss = train(model, train_data, optimizer, criterion, batch_size, seq_len, clip, device)
        valid_loss = evaluate(model, valid_data, criterion, batch_size, seq_len, device)

        lr_scheduler.step(valid_loss)

        # keep only the best model seen so far
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), checkpoint_path)

        print(f'Epoch {epoch + 1}/{n_epochs}')
        print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
        print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

# Whether the model was just trained or comes from an earlier run, score the
# best checkpoint on the test split so the final number is always reported.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

TypeError: ignored

In [None]:
# prompt = 'Think about'
# max_seq_len = 30
# seed = 0

# temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
# for temperature in temperatures:
#     generation = generate(prompt, max_seq_len, temperature, model, tokenizer,
#                           vocab, device, seed)
#     print(str(temperature)+'\n'+' '.join(generation)+'\n')