<a href="https://colab.research.google.com/github/Page0526/Pytorch-crash-course/blob/main/deep-neural-networks/nlp/Neural_Language_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task
Our task is to implement a based neural language model.

Training data: WikiText-2

Test data: WikiText-103

Evaluation criteria: Perplexity on the test dataset

Reference implementation: https://colab.research.google.com/drive/1-v8bRLr-UWQnxNA8m7Xpt4-XF8Ki_40T?usp=sharingLinks to an external site.

You are recommended to use the preprocessing function in the reference implementation to avoid distribution shifts.

Submission format: you have to submit the Jupyter Notebook. The notebook should clearly show that you use the WikiText-2 for training and WikiText-103 for testing. The perplexity should be displayed in the output, I will not rerun the code to get the perplexity.

# Sample from Lecturer

In [4]:
# interactive features of VSCode, Colab, Jupyter notebook
!pip install ipympl



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
# %matplotlib widget
# from google.colab import output
# output.enable_custom_widget_manager()

In [None]:
train_data = """A closed-loop controller or feedback controller is a control loop which incorporates feedback, in contrast to an open-loop controller or non-feedback controller. A closed-loop controller uses feedback to control states or outputs of a dynamical system. Its name comes from the information path in the system: process inputs (e.g., voltage applied to an electric motor) have an effect on the process outputs (e.g., speed or torque of the motor), which is measured with sensors and processed by the controller; the result (the control signal) is "fed back" as input to the process, closing the loop"""
test_data = "TBD"

In [None]:
import tqdm
import re

'''
Preprocessing process
1. Lower all characters
2. Remove all characters that are not alpha-bet or number
3. Create vocab dictionary
4. Create token for words
'''
def processing(text: str):
    text = text.lower()
    corpus = re.sub(r'[^a-z\s]', '', text).split()
    vocab = sorted(set(corpus))
    vocab.append('<pad>')
    vocab.append('<unk>')
    word_2_id = {w: i for i, w in enumerate(vocab)}
    id_2_word = {i: w for i, w in enumerate(vocab)}
    text_id = [word_2_id[w] for w in corpus]

    return corpus, vocab, text_id, word_2_id, id_2_word

corpus, vocab, text_id, word_2_id, id_2_word = processing(train_data)

'''
Define a Neural Language Model
1. V = len of the vocab dictionary
2. D = dimension
3. N = context window
'''
V = len(vocab)
D = 100
N = 5
n_hidden = 512
net = nn.Sequential(
    nn.Embedding(num_embeddings=V, embedding_dim=D),
    nn.LeakyReLU(negative_slope=0.15),
    nn.Flatten(),
    nn.Linear(in_features=D*N, out_features=n_hidden),
    nn.LeakyReLU(negative_slope=0.15),
    nn.Linear(in_features=n_hidden, out_features=n_hidden),
    nn.LeakyReLU(negative_slope=0.15),
    nn.Linear(in_features=n_hidden, out_features=V)
)

'''
Loss fn = CrossEntropyLoss
Optimizer = Adam/SGD
EPOCHS = 100
'''
def train(text_id: list, net: nn.Module, lr: float, optimizer: str, nepochs: int, N: int):
    fig,ax = plt.subplots(1,1)
    losses = []
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=lr) if optimizer == "adam" else optim.SGD(net.parameters(), lr=lr)
    for ei in tqdm.trange(nepochs):
        for i in range(len(text_id) - N):
            inputs = torch.LongTensor(text_id[i:i+N]).reshape(1, -1)
            target = torch.LongTensor([text_id[i+N]])
            # inputs.shape -> [1, 5]
            output = net(inputs)
            loss = criterion(output, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        losses.append(loss.item())
    ax.clear()
    ax.plot(losses)
    fig.canvas.draw()
    fig.canvas.flush_events()
    # test()    # perplexity
    return net


def test(text: str, net: nn.Module):
    pass


def text2id(text):
    corpus = text.lower()
    corpus = re.sub(r'[^a-z\s]', '', corpus).split()
    text_id = [word_2_id.get(w, V - 1) for w in corpus]
    return text_id


def predict(net: nn.Module, text: str, N: int):
    text_id = text2id(text)
    if len(text_id) < N:
        text_id = [word_2_id['<pad>']] * (N - len(text_id)) + text_id
        text_id = text_id[-N:]
    # print(f"{text_id=}")
    prob = net(torch.LongTensor(text_id).reshape(1, -1))
    prob = torch.softmax(prob, dim=1)
    # print(f"{prob=}")
    next_word_id = torch.argmax(prob).item()
    # print(f"{next_word_id=}")
    return id_2_word[next_word_id], prob.detach().numpy()[0]

def perplexity(net: nn.Module, text: str):
    pass

In [None]:
len(text_id), V

In [None]:
train(text_id, net, lr=1e-3, optimizer='adam', nepochs=100, N=5)

Support for third party widgets will remain active for the duration of the session. To disable support:

In [None]:
next_word, prob = predict(net, "closedloop", N=5)
print(f"{next_word=}")
print()
print("Prob")
print("\n".join([f"{id_2_word[i]} {prob[i]}" for i in range(len(prob))]))
plt.figure(figsize=(10, 5))
plt.bar(range(len(prob)), prob)
plt.xticks(range(len(prob)), id_2_word.values(), rotation=90)
plt.show()

In [None]:
test_perplexity = perplexity(net, test_data)
print(f"Test perplexity: {test_perplexity}")

# My solution

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset

In [6]:
train_data = load_dataset('Salesforce/wikitext','wikitext-2-v1')
test_data = load_dataset('Salesforce/wikitext','wikitext-103-v1')

In [8]:
train_data, test_data

(DatasetDict({
     test: Dataset({
         features: ['text'],
         num_rows: 4358
     })
     train: Dataset({
         features: ['text'],
         num_rows: 36718
     })
     validation: Dataset({
         features: ['text'],
         num_rows: 3760
     })
 }),
 DatasetDict({
     test: Dataset({
         features: ['text'],
         num_rows: 4358
     })
     train: Dataset({
         features: ['text'],
         num_rows: 1801350
     })
     validation: Dataset({
         features: ['text'],
         num_rows: 3760
     })
 }))

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from collections import Counter

ss = SnowballStemmer('english')
sw = stopwords.words('english')

In [9]:
test_data['test'][0]

{'text': ''}

In [10]:
from torch.nn.utils.rnn import pad_sequence

class NeuralNetworkDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.dataset = self.dataset.map(self.preprocess)
        self.create_vocab()
        self.dataset = self.dataset.map(self.remove_rare_tokens)
        self.dataset = self.dataset.map(self.text2id)
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        row = self.dataset[idx]
        text_ids = row['text_id']
        return text_ids
    
    def preprocess(self, row):
        row['all_tokens'] = [ss.stem(i) for i in 
                             re.split(r" +",
                                  re.sub(r"[^a-z@# ]","",
                                         row['text'].lower()))
                            if (i not in sw) and len(i)]
        return row
    
    def create_vocab(self):
        counts = Counter([i for s in self.dataset['all_tokens'] for i in s])
        counts = {k:v for k, v in counts.items() if v>10} # filtering
        self.vocab = list(counts.keys())
        self.vocab.append('<pad>')
        self.vocab.append('<unk>')
        self.n_v = len(self.vocab)
        self.id2tok = dict(enumerate(self.vocab))
        self.tok2id = {token: id for id, token in self.id2tok.items()}

    def remove_rare_tokens(self, row):
        row['tokens'] = [t for t in row['all_tokens'] if t in self.vocab]
        return row
    
    def text2id(self, row):
        row['text_id'] = [self.tok2id.get(w, self.tok2id['<unk>']) for s in row['all_tokens'] for w in s]
        return row
    
    def collate_fn(self, batch):
        return pad_sequence([torch.tensor(item) for item in batch], batch_first=True, padding_value=self.tok2id['<pad>'])

In [11]:
trainset = NeuralNetworkDataset(train_data['train'])

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
testset = NeuralNetworkDataset(test_data['test'])

In [11]:
len(testset), len(trainset)

(4358, 36718)

In [None]:
trainloader = DataLoader(trainset, batch_size=64, shuffle=True, collate_fn=trainset.collate_fn)
testloader = DataLoader(testset, batch_size=64, shuffle=False, collate_fn=testset.collate_fn)
# test
next(iter(trainloader)), next(iter(testloader))

In [13]:
testset.dataset['text'][3]

' Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the <unk> Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . \n'

In [14]:
testset.dataset['all_tokens'][3]

['robert',
 'boulter',
 'english',
 'film',
 'televis',
 'theatr',
 'actor',
 'guest',
 '@@',
 'star',
 'role',
 'televis',
 'seri',
 'bill',
 'follow',
 'star',
 'role',
 'play',
 'heron',
 'written',
 'simon',
 'stephen',
 'perform',
 'royal',
 'court',
 'theatr',
 'guest',
 'role',
 'televis',
 'seri',
 'judg',
 'john',
 'deed',
 'boulter',
 'land',
 'role',
 'craig',
 'episod',
 'teddi',
 'stori',
 'televis',
 'seri',
 'long',
 'firm',
 'star',
 'alongsid',
 'actor',
 'mark',
 'strong',
 'derek',
 'jacobi',
 'cast',
 'theatr',
 'product',
 'philip',
 'ridley',
 'play',
 'mercuri',
 'fur',
 'perform',
 'drum',
 'theatr',
 'plymouth',
 'unk',
 'chocol',
 'factori',
 'london',
 'direct',
 'john',
 'tiffani',
 'star',
 'alongsid',
 'ben',
 'whishaw',
 'shane',
 'zaza',
 'harri',
 'kent',
 'fraser',
 'ayr',
 'sophi',
 'stanton',
 'domin',
 'hall']

In [15]:
testset[3]

[1235,
 2160,
 1401,
 749,
 1235,
 2160,
 1401,
 2160,
 1431,
 1426,
 2160,
 749,
 1235,
 749,
 1929,
 2160,
 1426,
 2160,
 2160,
 936,
 1436,
 2160,
 1426,
 2160,
 2160,
 749,
 1426,
 749,
 1861,
 2160,
 2160,
 2160,
 936,
 749,
 2160,
 2160,
 1235,
 2160,
 750,
 2160,
 2160,
 1235,
 2160,
 1431,
 749,
 2160,
 2160,
 2160,
 2160,
 2160,
 2160,
 2160,
 1235,
 1235,
 2160,
 1426,
 749,
 2160,
 749,
 1426,
 749,
 1861,
 2160,
 2160,
 2160,
 749,
 1235,
 2160,
 1401,
 2160,
 1426,
 1426,
 1436,
 2160,
 1426,
 1426,
 2160,
 1477,
 2160,
 2160,
 2160,
 1235,
 1235,
 2160,
 1426,
 749,
 637,
 1426,
 2160,
 2160,
 936,
 749,
 1235,
 2160,
 1929,
 1477,
 1235,
 2160,
 2160,
 2160,
 749,
 1929,
 2160,
 2160,
 2160,
 2160,
 1929,
 2160,
 2160,
 749,
 637,
 936,
 749,
 1929,
 637,
 749,
 1235,
 1436,
 2160,
 1235,
 2160,
 1235,
 2160,
 2160,
 2160,
 1426,
 750,
 2160,
 1431,
 1235,
 2160,
 2160,
 936,
 749,
 2160,
 2160,
 1235,
 2160,
 1431,
 749,
 2160,
 2160,
 1235,
 2160,
 1426,
 749,
 2160,
 

In [None]:
class NeuralLanguageModel(nn.Module):
    def __init__(self, v_size, dim, w_size, n_hidden):
        super(NeuralLanguageModel, self).__init__()
        self.model = nn.Sequential(
            nn.Embedding(num_embeddings=v_size, embedding_dim=dim),
            nn.LeakyReLU(negative_slope=0.15),
            nn.Flatten(),
            nn.Linear(in_features=dim*w_size, out_features=n_hidden),
            nn.LeakyReLU(negative_slope=0.15),
            nn.Linear(in_features=n_hidden, out_features=n_hidden),
            nn.LeakyReLU(negative_slope=0.15),
            nn.Linear(in_features=n_hidden, out_features=v_size)
        )

    def forward(self, x):
        return self.model(x)

In [None]:
vocab=trainset.vocab
v_size=len(vocab)

neural_model = NeuralLanguageModel(v_size=v_size, dim=100, w_size=5, n_hidden=64)
neural_model

In [None]:
config =  {
        "shuffle": True,
        "lr": 1e-3,
        "epochs": 3,
        "train_steps":1, 
        "val_steps":1, 
        "checkpoint_frequency": 1
        }

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(neural_model.parameters(), lr = config['lr'])
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
!pip install lightning

In [None]:
import lightning as L
import os
from tqdm import tqdm

class Trainer:
    def __init__(self, model, criterion, vocab, dataloader, optimizer, device, config, w_size):
        self.model = model.to(device)
        self.criterion = criterion
        self.vocab = vocab
        self.dataloader = dataloader
        self.optimizer = optimizer
        self.device = device
        self.w_size = w_size
        self.config = config
        self.loss = {
            "train": [],
            "val": []
        }
        
    def train(self):
        
        for epoch in tqdm(range(self.config['epochs'])):
                self.train_step()
                self.val_step()
                
                print(f"Epoch: {epoch + 1}/{self.config['epochs']}, Train Loss {self.loss['train'][-1]:.5f}, Val Loss {self.loss['val'][-1]:.5f}")

    def train_step(self):
        running_loss = []
        for idx, sentence in enumerate(self.dataloader):
            if sentence.shape[1] == 0:
                continue
            print(f"index {idx}| sentence.shape {sentence.shape}")
            for i in range(sentence.shape[1] - self.w_size):
                inputs = sentence[0][i:i+self.w_size].clone().detach().reshape(1, -1)
                target = sentence[0][i+self.w_size].clone().detach()
                print(f"i {i}|target {target.shape}")
                output = self.model(inputs)
                loss = self.criterion(output, target)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                running_loss.append([loss.item()])
        self.loss['train'].append(np.mean(running_loss))
    
    def val_step(self):
        val_loss = []
        self.model.eval()
        
        with torch.inference_mode():
            for idx, sentence in enumerate(self.dataloader, 1):
                if sentence.shape[1] == 0:
                    continue
                for i in range(sentence.shape[1] - self.w_size):
                    inputs = torch.tensor(sentence[0][i:i+self.w_size], dtype=torch.long, device=device).reshape(1, -1)
                    target = torch.tensor(sentence[0][i+self.w_size], dtype=torch.long, device=device)
                    output = self.model(inputs)
                    loss = self.criterion(output, target)

                    val_loss.append(loss.item())
                
        self.loss['val'].append(np.mean(val_loss))
        
    def save_model(self, save_path):
        model_path = os.path.join(save_dir, "model.pt")
        torch.save(self.model, model_path)

In [32]:
len(trainloader)

574

In [44]:
trainer = Trainer(neural_model, criterion, vocab, trainloader, optimizer, device, config, 5)

trainer.train()

  0%|          | 0/3 [00:00<?, ?it/s]


index 0| sentence.shape torch.Size([64, 558])
i 0|target torch.Size([])


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)