# Predict Next Word

## Prepare the Notebook

In [1]:
# Install Packages

!pip install pytorch-lightning torchmetrics torchviz datasets -q

[0m

In [34]:
# Import Packages

import re
import pandas as pd
from datasets import load_dataset

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

nltk.download('stopwords')

import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import pytorch_lightning as pl

import warnings

warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Import Dataset
# Load the raw WikiText-2 corpus (train / validation / test splits).
dataset = load_dataset(path='wikitext', name='wikitext-2-raw-v1')

Downloading builder script:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

Downloading and preparing dataset wikitext/wikitext-2-raw-v1 (download: 4.50 MiB, generated: 12.90 MiB, post-processed: Unknown size, total: 17.40 MiB) to /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...


Downloading data:   0%|          | 0.00/4.72M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset wikitext downloaded and prepared to /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
# Index the row first and the column second: this reads a single example
# instead of materialising the whole `text` column, and matches how rows are
# accessed elsewhere in this notebook.
dataset['train'][9]['text']

" As with previous Valkyira Chronicles games , Valkyria Chronicles III is a tactical role @-@ playing game where players take control of a military unit and take part in missions against enemy forces . Stories are told through comic book @-@ like panels with animated character portraits , with characters speaking partially through voiced speech bubbles and partially through unvoiced text . The player progresses through a series of linear missions , gradually unlocked as maps that can be freely scanned through and replayed as they are unlocked . The route to each story location on the map varies depending on an individual player 's approach : when one option is selected , the other is sealed off to the player . Outside missions , the player characters rest in a camp , where units can be customized and character growth occurs . Alongside the main story missions are character @-@ specific sub missions relating to different squad members . After the game 's completion , additional episodes

## Data Analysis and Preprocessing

In [58]:
# `get_tokenizer` returns a callable tokenizer unchanged, so the former
# `language='basic-english'` argument had no effect; it is dropped because it
# is not a language code and wrongly suggested a different tokenizer was used.
# NOTE(review): `nltk.word_tokenize` needs NLTK's punkt data, which this
# notebook does not download explicitly -- confirm it is available.
tokenizer = torchtext.data.utils.get_tokenizer(nltk.word_tokenize)
stemmer = SnowballStemmer('english')
englishStopwords = stopwords.words('english')

In [59]:
def clean(text):
    """Normalise a raw text line into a list of stemmed, stopword-free tokens.

    The text is lower-cased, every run of characters other than letters,
    digits, spaces and hyphens is removed, and the remainder is split with the
    notebook-level `tokenizer`. Tokens found in `englishStopwords` are dropped
    and each remaining token is stemmed with `stemmer`.

    Args:
        text: Raw string to clean.

    Returns:
        A list of stemmed token strings.
    """
    text = text.lower()
    text = re.sub('[^a-z A-Z 0-9-]+', '', text)
    # The tokenizer built in the notebook is bound to `tokenizer`; the name
    # `tokenize` used here before is never defined, so a fresh kernel raised
    # NameError on the first call.
    return [
        stemmer.stem(token).lower()
        for token in tokenizer(text)
        if token not in englishStopwords
    ]

In [77]:
# Alias for the full dataset object; no subsetting is applied here.
smallDataset = dataset

In [78]:
# Display the available splits with their features and row counts.
smallDataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [79]:
def tokenize_data(example, clean):
    """Map one dataset row to its cleaned token list, stored under 'tokens'."""
    return {'tokens': clean(example['text'])}


# Replace the raw 'text' column of every split with the cleaned tokens.
tokenized_dataset = smallDataset.map(
    tokenize_data,
    remove_columns=['text'],
    fn_kwargs={'clean': clean},
)
print(tokenized_dataset['train'][88]['tokens'])

  0%|          | 0/4358 [00:00<?, ?ex/s]

  0%|          | 0/36718 [00:00<?, ?ex/s]

  0%|          | 0/3760 [00:00<?, ?ex/s]

['ammunit', 'brought', 'rapid', 'prepar', 'use', 'laboratori', 'establish', 'littl', 'rock', 'arsenal', 'purpos', 'illustr', 'piti', 'scarciti', 'materi', 'countri', 'fact', 'may', 'state', 'found', 'necessari', 'use', 'public', 'document', 'state', 'librari', 'cartridg', 'paper', 'gunsmith', 'employ', 'conscript', 'tool', 'purchas', 'impress', 'repair', 'damag', 'gun', 'brought', 'equal', 'number', 'found', 'littl', 'rock', 'commenc', 'inspect', 'work', 'observ', 'spirit', 'men', 'decid', 'garrison', '500', 'strong', 'could', 'hold', 'fitch', 'would', 'lead', 'remaind', '-', '1500', '-', 'gen', 'l', 'rust', 'soon', 'shotgun', 'rifl', 'could', 'obtain', 'littl', 'rock', 'instead', 'pike', 'lanc', 'arm', 'two', 'day', 'elaps', 'chang', 'could', 'effect']


In [80]:
# Check the Python type of one tokenized training row.
type(tokenized_dataset['train'][88]['tokens'])

list

##### Build the Vocabulary

In [81]:
# Build the vocabulary from the training tokens only, with a minimum
# frequency of 3.
vocab = torchtext.vocab.build_vocab_from_iterator(
    tokenized_dataset['train']['tokens'],
    min_freq=3,
)
# Put the special tokens at indices 0 and 1.
for special_index, special_token in enumerate(('<unk>', '<eos>')):
    vocab.insert_token(special_token, special_index)
# Lookups of unknown tokens resolve to the index of '<unk>'.
vocab.set_default_index(vocab['<unk>'])
print(len(vocab))
print(vocab.get_itos()[:10])

21261
['<unk>', '<eos>', '-', 'first', 'one', 'also', 'two', 'time', 'year', 'use']


In [84]:
# Show the first 15 vocabulary entries as one comma-separated string.
", ".join(vocab.get_itos()[:15])

'<unk>, <eos>, -, first, one, also, two, time, year, use, game, state, new, includ, song'

In [91]:
def getData(dataset, vocab, BATCH_SIZE):
    """Flatten a tokenized split into a ``(BATCH_SIZE, numBatches)`` tensor of ids.

    Every non-empty example contributes its token ids followed by the id of
    ``'<eos>'``; examples without tokens are skipped. The ids are concatenated,
    trailing ids that do not fill a complete column are dropped, and the
    result is reshaped so each row is one contiguous slice of the stream.

    Args:
        dataset: Iterable of mappings, each holding a list of token strings
            under the ``'tokens'`` key.
        vocab: Object that maps a token string to its integer id through
            ``vocab[token]``.
        BATCH_SIZE: Number of rows of the returned tensor; must be positive.

    Returns:
        A ``torch.int64`` tensor of shape ``(BATCH_SIZE, numBatches)`` with
        ``numBatches = total_ids // BATCH_SIZE``.

    Raises:
        ValueError: If ``BATCH_SIZE`` is not a positive number.
    """
    if BATCH_SIZE <= 0:
        raise ValueError(f'BATCH_SIZE must be positive, got {BATCH_SIZE}')

    ids = []
    for example in dataset:
        tokens = example['tokens']
        if tokens:
            # Append the end-of-sequence id to the output only. The earlier
            # `example['tokens'].append('<eos>')` returned None and modified
            # the caller's list, so repeated calls on in-memory data kept
            # adding extra '<eos>' markers.
            ids.extend(vocab[token] for token in tokens)
            ids.append(vocab['<eos>'])

    data = torch.tensor(ids, dtype=torch.long)
    numBatches = data.shape[0] // BATCH_SIZE
    # Drop the tail that does not fill a whole column before reshaping.
    data = data[:numBatches * BATCH_SIZE]
    return data.view(BATCH_SIZE, numBatches)

In [92]:
BATCH_SIZE = 128
# Convert each split to ids and fold it into BATCH_SIZE rows.
train_data, valid_data, test_data = (
    getData(tokenized_dataset[split_name], vocab, BATCH_SIZE)
    for split_name in ('train', 'validation', 'test')
)

## Training

In [108]:
# Number of entries in the vocabulary, special tokens included.
VOCAB_SIZE = len(vocab)

In [None]:
# The assignment was left without a right-hand side (a SyntaxError).
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [117]:
class ExtractTensor(nn.Module):
    """Keep only the last slice along dim 0 of a recurrent layer's output.

    Accepts the two-element pair a recurrent layer returns and discards the
    second element, so the layer can be followed by further modules inside
    ``nn.Sequential``.
    """

    def forward(self, X):
        """Return ``output[-1, :]`` for the pair ``X = (output, state)``.

        NOTE(review): for a 2-D ``output`` this is its last row; for a 3-D
        batch-first output it would be the last batch element rather than the
        last time step -- confirm the inputs are unbatched.
        """
        sequence_output, _state = X
        last_slice = sequence_output[-1, :]
        return last_slice

In [None]:
class LSTM(nn.Module):
    """Language model: embedding -> stacked LSTM -> linear layer over the vocabulary.

    NOTE: another cell in this notebook also defines a class named ``LSTM``;
    whichever of the two cells runs last is the one bound to the name.

    Args:
        vocabSize: Number of rows of the embedding and of output logits.
        embeddingDim: Size of each embedding vector.
        hiddenDim: Hidden size of the LSTM.
        numLayer: Number of stacked LSTM layers.
        dropoutRate: Dropout probability used on the embeddings, between LSTM
            layers and on the LSTM output.
        tieWeights: If true, the embedding and the output layer share one
            weight matrix (requires ``embeddingDim == hiddenDim``).
    """

    def __init__(self, vocabSize, embeddingDim, hiddenDim, numLayer, dropoutRate, tieWeights):
        super().__init__()
        self.numLayer = numLayer
        self.hiddenDim = hiddenDim
        self.embeddingDim = embeddingDim

        self.embedding = nn.Embedding(vocabSize, embeddingDim)
        # Fixes: `nn.LSTm` typo and `dropout, dropoutRate`, which passed a
        # positional argument after a keyword argument (a SyntaxError).
        self.lstm = nn.LSTM(embeddingDim, hiddenDim, num_layers=numLayer,
                            dropout=dropoutRate, batch_first=True)
        self.dropout = nn.Dropout(dropoutRate)
        # Fix: the parameter is `vocabSize`; `vocab_size` was undefined here.
        self.linear = nn.Linear(hiddenDim, vocabSize)

        if tieWeights:
            # Fix: `assert` takes one message expression, not two.
            assert embeddingDim == hiddenDim, 'cannot tie, check dims'
            self.embedding.weight = self.linear.weight
        self.initWeights()

    def forward(self, src, hidden):
        """Compute logits for every position of ``src``.

        Args:
            src: Integer tensor of token ids; batch-first because the LSTM is
                built with ``batch_first=True``.
            hidden: ``(h, c)`` state pair handed to the LSTM.

        Returns:
            ``(prediction, hidden)``: the output of the linear layer and the
            LSTM state after the last step.
        """
        embedded = self.dropout(self.embedding(src))
        output, hidden = self.lstm(embedded, hidden)
        prediction = self.linear(self.dropout(output))
        return prediction, hidden

    def initWeights(self):
        """Re-initialise embedding, output layer and LSTM weight matrices in place."""
        embRange = 0.1
        otherRange = self.hiddenDim ** -0.5  # 1 / sqrt(hiddenDim)
        with torch.no_grad():
            self.embedding.weight.uniform_(-embRange, embRange)
            # With tied weights this is the same tensor as the embedding, so
            # this range is the one that ends up applied.
            self.linear.weight.uniform_(-otherRange, otherRange)
            self.linear.bias.zero_()
            for name, param in self.lstm.named_parameters():
                if name.startswith(('weight_ih', 'weight_hh')):
                    param.uniform_(-otherRange, otherRange)

In [None]:
class LSTM(nn.Module):
    """Language model: embedding -> stacked LSTM -> linear layer over the vocabulary.

    Args:
        vocab_size: Number of rows of the embedding and of output logits.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
        dropout_rate: Dropout probability used on the embeddings, between LSTM
            layers and on the LSTM output.
        tie_weights: If true, the embedding and the output layer share one
            weight matrix (requires ``embedding_dim == hidden_dim``).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate,
                 tie_weights):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, vocab_size)

        if tie_weights:
            assert embedding_dim == hidden_dim, 'cannot tie, check dims'
            self.embedding.weight = self.fc.weight
        self.init_weights()

    def forward(self, src, hidden):
        """Compute logits for every position of ``src``.

        Args:
            src: Integer tensor of token ids; batch-first because the LSTM is
                built with ``batch_first=True``.
            hidden: ``(h, c)`` state pair handed to the LSTM.

        Returns:
            ``(prediction, hidden)``: the output of the linear layer and the
            LSTM state after the last step.
        """
        embedding = self.dropout(self.embedding(src))
        output, hidden = self.lstm(embedding, hidden)
        output = self.dropout(output)
        prediction = self.fc(output)
        return prediction, hidden

    def init_weights(self):
        """Re-initialise embedding, output layer and LSTM weight matrices in place."""
        init_range_emb = 0.1
        # 1 / sqrt(hidden_dim), written without `math`, which this notebook
        # never imports (the earlier `math.sqrt` call raised NameError).
        init_range_other = self.hidden_dim ** -0.5
        with torch.no_grad():
            self.embedding.weight.uniform_(-init_range_emb, init_range_emb)
            # With tied weights this is the same tensor as the embedding, so
            # this range is the one that ends up applied.
            self.fc.weight.uniform_(-init_range_other, init_range_other)
            self.fc.bias.zero_()
            # `self.lstm.all_weights` is rebuilt as a new list on each access,
            # so assigning into it never changed the real parameters (and the
            # replacement tensors had the wrong shapes). Fill the registered
            # input-hidden and hidden-hidden matrices in place instead.
            for name, param in self.lstm.named_parameters():
                if name.startswith(('weight_ih', 'weight_hh')):
                    param.uniform_(-init_range_other, init_range_other)

    def init_hidden(self, batch_size, device):
        """Return zero ``(hidden, cell)`` states of shape (num_layers, batch_size, hidden_dim) on ``device``."""
        state_shape = (self.num_layers, batch_size, self.hidden_dim)
        hidden = torch.zeros(state_shape, device=device)
        cell = torch.zeros(state_shape, device=device)
        return hidden, cell

    def detach_hidden(self, hidden):
        """Return the ``(hidden, cell)`` pair detached from the autograd graph."""
        hidden, cell = hidden
        return hidden.detach(), cell.detach()

In [115]:
# Sequential model for the shape check in the next cell:
# embedding -> 2-layer LSTM -> last slice of the LSTM output.
SAMPLE_WIDTH = 1024  # used for both the embedding size and the LSTM hidden size
sampleModel = nn.Sequential(
    nn.Embedding(VOCAB_SIZE, SAMPLE_WIDTH),
    nn.LSTM(SAMPLE_WIDTH, SAMPLE_WIDTH, num_layers=2, batch_first=True),
    ExtractTensor(),
)

In [116]:
# Run only the first row of `train_data` through the model and print the
# input and output sizes.
for value in train_data[:1]:
    print(value.shape)
    output = sampleModel(value)
    print(output.shape)

torch.Size([8422])
torch.Size([1024])
