# Predict Next Word

## Prepare the Notebook

In [1]:
# Install Packages

!pip install pytorch-lightning torchmetrics torchviz datasets -q|

/bin/bash: -c: line 2: syntax error: unexpected end of file


In [37]:
# Import Packages

from tqdm import tqdm

import re
import math
import pandas as pd
from datasets import load_dataset

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

nltk.download('stopwords')

import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchmetrics

import warnings

warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Import Dataset

dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')

  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
dataset['train']['text'][9]

" As with previous Valkyira Chronicles games , Valkyria Chronicles III is a tactical role @-@ playing game where players take control of a military unit and take part in missions against enemy forces . Stories are told through comic book @-@ like panels with animated character portraits , with characters speaking partially through voiced speech bubbles and partially through unvoiced text . The player progresses through a series of linear missions , gradually unlocked as maps that can be freely scanned through and replayed as they are unlocked . The route to each story location on the map varies depending on an individual player 's approach : when one option is selected , the other is sealed off to the player . Outside missions , the player characters rest in a camp , where units can be customized and character growth occurs . Alongside the main story missions are character @-@ specific sub missions relating to different squad members . After the game 's completion , additional episodes

## Data Analysis and Preprocessing

In [5]:
tokenizer = torchtext.data.utils.get_tokenizer(nltk.word_tokenize, language='basic-english')
stemmer = SnowballStemmer('english')
englishStopwords = stopwords.words('english')

In [6]:
def clean(text):
    text = text.lower()
    text = re.sub('[^a-z A-Z 0-9-]+', '', text)
    return [stemmer.stem(token).lower() for token in tokenizer(text) if token not in englishStopwords]

In [7]:
smallDataset = dataset

In [8]:
smallDataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [9]:
tokenize_data = lambda example, clean: {'tokens': clean(example['text'])}  
tokenized_dataset = smallDataset.map(tokenize_data, remove_columns=['text'], 
fn_kwargs={'clean': clean})
print(tokenized_dataset['train'][88]['tokens'])

  0%|          | 0/4358 [00:00<?, ?ex/s]

  0%|          | 0/36718 [00:00<?, ?ex/s]

  0%|          | 0/3760 [00:00<?, ?ex/s]

['ammunit', 'brought', 'rapid', 'prepar', 'use', 'laboratori', 'establish', 'littl', 'rock', 'arsenal', 'purpos', 'illustr', 'piti', 'scarciti', 'materi', 'countri', 'fact', 'may', 'state', 'found', 'necessari', 'use', 'public', 'document', 'state', 'librari', 'cartridg', 'paper', 'gunsmith', 'employ', 'conscript', 'tool', 'purchas', 'impress', 'repair', 'damag', 'gun', 'brought', 'equal', 'number', 'found', 'littl', 'rock', 'commenc', 'inspect', 'work', 'observ', 'spirit', 'men', 'decid', 'garrison', '500', 'strong', 'could', 'hold', 'fitch', 'would', 'lead', 'remaind', '-', '1500', '-', 'gen', 'l', 'rust', 'soon', 'shotgun', 'rifl', 'could', 'obtain', 'littl', 'rock', 'instead', 'pike', 'lanc', 'arm', 'two', 'day', 'elaps', 'chang', 'could', 'effect']


In [10]:
type(tokenized_dataset['train'][88]['tokens'])

list

In [11]:
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], 
min_freq=3) 
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)            
vocab.set_default_index(vocab['<unk>'])   
print(len(vocab))                         
print(vocab.get_itos()[:10])            

21261
['<unk>', '<eos>', '-', 'first', 'one', 'also', 'two', 'time', 'year', 'use']


In [12]:
", ".join(vocab.get_itos()[:15])

'<unk>, <eos>, -, first, one, also, two, time, year, use, game, state, new, includ, song'

In [13]:
def getData(dataset, vocab, BATCH_SIZE):
    data = []                                                   
    for example in dataset:
        if example['tokens']:                                      
            tokens = example['tokens'].append('<eos>')             
            tokens = [vocab[token] for token in example['tokens']] 
            data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    numBatches = data.shape[0] // BATCH_SIZE 
    data = data[:numBatches * BATCH_SIZE]                       
    data = data.view(BATCH_SIZE, numBatches)          
    return data

In [14]:
BATCH_SIZE = 128
train_data = getData(tokenized_dataset['train'], vocab, BATCH_SIZE)
valid_data = getData(tokenized_dataset['validation'], vocab, BATCH_SIZE)
test_data = getData(tokenized_dataset['test'], vocab, BATCH_SIZE)

## Training

In [15]:
VOCAB_SIZE = len(vocab)

In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [17]:
class ExtractTensor(nn.Module):
    def forward(self, X):
        output, hidden = X
        return output[-1, :]

In [32]:
vocabSize = len(vocab)
embeddingDim = 1024
hiddenDim = 1024
numLayer = 2
dropoutRate = 0.65              
tieWeights = True                  
lr = 1e-3     
batchSize = 128

In [45]:
class LSTM(nn.Module):
    def __init__(self, vocabSize, embeddingDim, hiddenDim, numLayer, dropoutRate, tieWeights):
        super().__init__()
        self.numLayer = numLayer
        self.hiddenDim = hiddenDim
        self.embeddingDim = embeddingDim
    
        self.embedding = nn.Embedding(vocabSize, embeddingDim)
        self.lstm = nn.LSTM(embeddingDim, hiddenDim, num_layers=numLayer, dropout=dropoutRate, batch_first=True)
        self.dropout = nn.Dropout(dropoutRate)
        self.linear = nn.Linear(hiddenDim, vocabSize)
    
        if tieWeights:
            assert embeddingDim == hiddenDim, 'cannot tie' 'check dims'
            self.embedding.weight = self.linear.weight
        self.initWeights()
    
    def forward(self, src, hidden):
        embedding = self.dropout(self.embedding(src))
        output, hidden = self.lstm(embedding, hidden)
        output = self.dropout(output)
        prediction = self.linear(output)
        return prediction
    
    def initWeights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hiddenDim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.linear.weight.data.uniform_(-init_range_other, init_range_other)
        self.linear.bias.data.zero_()
        for i in range(self.numLayer):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.embeddingDim,
                    self.hiddenDim).uniform_(-init_range_other, init_range_other)
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hiddenDim,
                    self.hiddenDim).uniform_(-init_range_other, init_range_other)

    
    def initHidden(self, batch_size, device):
        hidden = torch.zeros(self.numLayer, batch_size, self.hiddenDim).to(device)
        cell = torch.zeros(self.numLayer, batch_size, self.hiddenDim).to(device)
        return hidden, cell
    
    def detachHidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

model = LSTM(vocabSize, embeddingDim, hiddenDim, numLayer, dropoutRate, tieWeights).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 38,586,125 trainable parameters


In [21]:
def getBatch(data, seqLen, numBatches, idx):
    src = data[:, idx:idx+seqLen]
    target = data[:, idx+1:idx+seqLen+1]
    return src, target

In [66]:
def train(model, data, optimizer, criterion, batchSize, seqLen, clip, device):

    epochLoss = 0
    model.train()
    numBatches = data.shape[-1]
    data = data[:, :numBatches - (numBatches -1) % seqLen]
    numBatches = data.shape[-1]

    hidden = model.initHidden(batchSize, device)

    for idx in tqdm(range(0, numBatches - 1, seqLen), desc='Training: ',leave=False):
        optimizer.zero_grad()
        hidden = model.detachHidden(hidden)

        src, target = getBatch(data, seqLen, numBatches, idx)
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction = model(src, hidden)

        prediction = prediction.reshape(batchSize * seqLen, -1)
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epochLoss += loss.item() * seqLen
    return epochLoss / numBatches

In [67]:
def evaluate(model, data, criterion, batchSize, seqLen, device):

    epochLoss = 0
    model.eval()
    numBatches = data.shape[-1]
    data = data[:, :numBatches - (numBatches -1) % seqLen]
    numBatches = data.shape[-1]

    hidden = model.initHidden(batchSize, device)

    with torch.no_grad():
        for idx in range(0, numBatches - 1, seqLen):
            hidden = model.detachHidden(hidden)
            src, target = getBatch(data, seqLen, numBatches, idx)
            src, target = src.to(device), target.to(device)
            batch_size = src.shape[0]

            prediction = model(src, hidden)
            prediction = prediction.reshape(batchSize * seqLen, -1)
            target = target.reshape(-1)


            loss = criterion(prediction, target)
            epochLoss += loss.item() * seqLen
    return epochLoss / numBatches

In [69]:
nEpochs = 10
seqLen = 50
clip = 0.25
saved = False

lrScheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

if saved:
    model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
    testLoss = evaluate(model, test_data, criterion, batchSize, seqLen, device)
    print(f'Test Perplexity: {math.exp(testLoss):.3f}')
else:
    bestValidLoss = float('inf')

    for epoch in range(nEpochs):
        trainLoss = train(model, train_data, optimizer, criterion, 
                    batchSize, seqLen, clip, device)
        validLoss = evaluate(model, valid_data, criterion, batchSize, 
                    seqLen, device)
        
        lrScheduler.step(validLoss)

        if validLoss < bestValidLoss:
            bestValidLoss = validLoss
            torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

        print(f'\tTrain Perplexity: {math.exp(trainLoss):.3f}')
        print(f'\tValid Perplexity: {math.exp(validLoss):.3f}')

                                                           

	Train Perplexity: 3188.757
	Valid Perplexity: 2449.526


                                                           

	Train Perplexity: 2640.479
	Valid Perplexity: 2027.794


                                                           

	Train Perplexity: 2110.949
	Valid Perplexity: 1749.192


                                                           

	Train Perplexity: 1718.450
	Valid Perplexity: 1509.963


                                                           

	Train Perplexity: 1417.383
	Valid Perplexity: 1378.071


                                                           

	Train Perplexity: 1205.265
	Valid Perplexity: 1281.776


                                                           

	Train Perplexity: 1049.338
	Valid Perplexity: 1219.801


                                                           

	Train Perplexity: 927.260
	Valid Perplexity: 1175.253


                                                           

	Train Perplexity: 832.072
	Valid Perplexity: 1142.512


                                                           

	Train Perplexity: 759.211
	Valid Perplexity: 1114.313


## Evaluating

In [70]:
def generate(prompt, maxSeqLen, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = clean(prompt)
    indices = [vocab[t] for t in tokens]
    batchSize = 1
    hidden = model.initHidden(batchSize, device)
    with torch.no_grad():
        for i in range(maxSeqLen):
            src = torch.LongTensor([indices]).to(device)
            prediction = model(src, hidden)
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']:
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:
                break

            indices.append(prediction)

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [72]:
prompt = 'My name is'
maxSeqLen = 30
seed = 0

temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, maxSeqLen, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
name peopl

0.7
name robert davenport would describ reason issu dictat

0.75
name mention previous attempt liabil chess world peopl

0.8
name mention previous attempt liabil chess world peopl confer umpir investig umpir umpir

1.0
name mainstay 2501 previous attempt pessimist liabil chess film peopl park track investig movi peel



## Notes

1. Don't use Stemmer
2. Don't use Stopwords