In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator



In [3]:
from torch import nn

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
with open('data/paragraphs.txt', 'r') as file:
    para = file.read()

In [6]:
paragraphs = [x for x in para.split('\n')]

In [7]:
xtrain, xtest = train_test_split(paragraphs, test_size=0.2, random_state=42)

In [8]:
tokenizer = get_tokenizer('basic_english')

def get_tokens(paragraphs):
    for x in paragraphs:
        yield tokenizer(x)

In [9]:
tokens = get_tokens(xtrain)

In [10]:
vocab = build_vocab_from_iterator(tokens, specials=['<unk>'])

In [11]:
vocab.set_default_index(vocab['<unk>'])

In [12]:
data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in xtrain]

In [13]:
data

[tensor([    1,  3358,  5964,     8,    46,    94,    26, 45652, 24204,    32,
         30428,   783,     2,   978,   112,     7, 64627,    38,     2,   408,
            42,    43,  3749,     8,  4947,     3]),
 tensor([    5,  6296,     2,   930,    15,    19,   748, 28464,     2,    44,
             9,     1,  1001,   245,     4,     1,   159,     6,    41, 61153,
          5457,     2,   695,    26,  3888,     4,     1,  1164,     4,  3615,
            15,    19, 61373,     4,  1367,  5604]),
 tensor([  27,   60,   56,    7,  950,  195,  115,    4,    1, 1765,   79,  583,
           59,    5,    1, 7401,   10,    7,   69,    3,   53,   37,   58,   68,
           14,   67,   21,    3]),
 tensor([  794,  2360,     8,  1970, 15817,    28,    11,  7346,  2124,    22,
             1,   301, 11032,   266,    22, 50566,   316,     6,     1,   301,
           480, 19310,   405,    22, 45629,   301,   480,   737,     3,   135,
         19160,   664,    29, 15817,     6,    29,  2291,   943, 

Padding is done to normalize all the tensor data in dataset. 

In [28]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [15]:
PAD_IDX = 0

In [16]:
padded = pad_sequence(data, padding_value=PAD_IDX)

In [17]:
padded

tensor([[    1,     5,    27,  ...,    11, 21609, 32004],
        [ 3358,  6296,    60,  ...,   805,   921,  2183],
        [ 5964,     2,    56,  ...,   155,  2115,     8],
        ...,
        [    0,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,     0,     0,     0]])

The below line creates a 1D pytorch tensor containing the length of each sentence sequence.

In [18]:
lengths = torch.tensor([len(s) for s in data])    
lengths

tensor([26, 36, 28,  ..., 80, 55, 74])

An embedding is a way to map word indices (integers) into dense vectors of real numbers that capture semantic meaning

In [23]:
vocab_size = padded.max().item() + 1

In [24]:
embed = nn.Embedding(vocab_size, 32, padding_idx=PAD_IDX)

In [25]:
embed

Embedding(71741, 32, padding_idx=0)

In [26]:
print(padded.max()) 

tensor(71740)


In [27]:
packed = pack_padded_sequence(embed(padded), lengths, enforce_sorted=False)

In [29]:
lstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=1)
packed_out, hidden = lstm(packed)

out, _ = pad_packed_sequence(packed_out) 