In [19]:
import nltk
nltk.__version__

'3.9.1'

In [20]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [44]:
# fetching the data :
# Read the corpus through a context manager so the file handle is closed as
# soon as the text is loaded instead of being left open.
with open('general_speak2.txt','r') as corpus_file:
    data = corpus_file.read()
# Remove literal backslash-n sequences (the two characters '\' and 'n').
# Real newline characters are kept: the sentences are split on them further down.
data = data.replace('\\n','')
data



In [35]:
# Fetch NLTK's 'punkt_tab' resource (presumably the tokenizer data that
# word_tokenize relies on - confirm against the NLTK version in use).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [45]:
# tokenizing the data using nltk
from nltk.tokenize import word_tokenize
# The whole corpus is lower-cased before tokenizing, so tokenization is case-insensitive.
tokens = word_tokenize(data.lower())
# Display the flat token list for inspection.
tokens

['the',
 'sun',
 'sets',
 'slowly',
 'behind',
 'the',
 'mountain',
 ',',
 'casting',
 'a',
 'warm',
 'glow',
 'over',
 'the',
 'valley',
 '.',
 'how',
 'are',
 'you',
 'today',
 '?',
 'i',
 'hope',
 'you',
 "'re",
 'having',
 'a',
 'wonderful',
 'day',
 '!',
 'the',
 'scientist',
 'conducted',
 'experiments',
 'to',
 'uncover',
 'the',
 'mysteries',
 'of',
 'the',
 'universe',
 '.',
 'what',
 'time',
 'does',
 'the',
 'meeting',
 'start',
 '?',
 'i',
 'need',
 'to',
 'prepare',
 'my',
 'notes',
 '.',
 'she',
 'walked',
 'through',
 'the',
 'forest',
 ',',
 'listening',
 'to',
 'the',
 'birds',
 'chirping',
 'happily',
 '.',
 'technology',
 'has',
 'transformed',
 'the',
 'way',
 'we',
 'communicate',
 'with',
 'each',
 'other',
 '.',
 'can',
 'you',
 'help',
 'me',
 'find',
 'a',
 'good',
 'book',
 'to',
 'read',
 'this',
 'weekend',
 '?',
 'the',
 'old',
 'house',
 'on',
 'the',
 'hill',
 'has',
 'a',
 'strange',
 ',',
 'eerie',
 'atmosphere',
 'at',
 'night',
 '.',
 'let',
 "'s",
 '

In [46]:
# creating vocabulary :
from collections import Counter
# Counter keeps exactly one key per distinct token, so iterating over it
# visits every token once.
count = Counter(tokens)
# Index 0 is reserved for '<UNK>'; each distinct token then receives the next
# free index, which is simply the current size of the mapping.
vocab = {'<UNK>':0}
for unique_token in count:
    vocab[unique_token] = len(vocab)
vocab

{'<UNK>': 0,
 'the': 1,
 'sun': 2,
 'sets': 3,
 'slowly': 4,
 'behind': 5,
 'mountain': 6,
 ',': 7,
 'casting': 8,
 'a': 9,
 'warm': 10,
 'glow': 11,
 'over': 12,
 'valley': 13,
 '.': 14,
 'how': 15,
 'are': 16,
 'you': 17,
 'today': 18,
 '?': 19,
 'i': 20,
 'hope': 21,
 "'re": 22,
 'having': 23,
 'wonderful': 24,
 'day': 25,
 '!': 26,
 'scientist': 27,
 'conducted': 28,
 'experiments': 29,
 'to': 30,
 'uncover': 31,
 'mysteries': 32,
 'of': 33,
 'universe': 34,
 'what': 35,
 'time': 36,
 'does': 37,
 'meeting': 38,
 'start': 39,
 'need': 40,
 'prepare': 41,
 'my': 42,
 'notes': 43,
 'she': 44,
 'walked': 45,
 'through': 46,
 'forest': 47,
 'listening': 48,
 'birds': 49,
 'chirping': 50,
 'happily': 51,
 'technology': 52,
 'has': 53,
 'transformed': 54,
 'way': 55,
 'we': 56,
 'communicate': 57,
 'with': 58,
 'each': 59,
 'other': 60,
 'can': 61,
 'help': 62,
 'me': 63,
 'find': 64,
 'good': 65,
 'book': 66,
 'read': 67,
 'this': 68,
 'weekend': 69,
 'old': 70,
 'house': 71,
 'on': 72,

In [47]:
# vocab should hold exactly one entry more than count: the reserved '<UNK>' slot.
len(vocab), len(count)

(1889, 1888)

In [48]:
# fetching all sentences in the data :
# One sentence per line of the corpus. Blank / whitespace-only lines (for
# example the empty string produced by a trailing newline) are skipped so they
# do not end up as empty "sentences".
sentences = [line for line in data.split('\n') if line.strip()]

In [49]:
def token_to_index(token:str,vocab):
    """Convert raw text into a list of vocabulary indices.

    The text is lower-cased and split with ``word_tokenize``; every resulting
    token is replaced by its index in ``vocab``. Tokens that are not in
    ``vocab`` are replaced by the index stored under ``'<UNK>'``.

    Args:
        token: The text to encode (a whole sentence can be passed, despite the name).
        vocab: Mapping from token string to integer index. ``'<UNK>'`` is
            looked up whenever an out-of-vocabulary token is met.

    Returns:
        A list with one index per token, in token order.
    """
    indices = []
    for word in word_tokenize(token.lower()):
        if word in vocab:
            indices.append(vocab[word])
        else:
            # Out-of-vocabulary token: fall back to the unknown-token index.
            indices.append(vocab['<UNK>'])
    return indices

In [50]:
# tokenizing sentences :
# Encode every sentence into its list of vocabulary indices (one list per sentence).
tokenized_sentences = [token_to_index(sentence,vocab) for sentence in sentences]
tokenized_sentences

[[1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 12, 1, 13, 14],
 [15, 16, 17, 18, 19, 20, 21, 17, 22, 23, 9, 24, 25, 26],
 [1, 27, 28, 29, 30, 31, 1, 32, 33, 1, 34, 14],
 [35, 36, 37, 1, 38, 39, 19, 20, 40, 30, 41, 42, 43, 14],
 [44, 45, 46, 1, 47, 7, 48, 30, 1, 49, 50, 51, 14],
 [52, 53, 54, 1, 55, 56, 57, 58, 59, 60, 14],
 [61, 17, 62, 63, 64, 9, 65, 66, 30, 67, 68, 69, 19],
 [1, 70, 71, 72, 1, 73, 53, 9, 74, 7, 75, 76, 77, 78, 14],
 [79, 80, 81, 9, 82, 30, 1, 83, 68, 84, 85, 86, 87, 88, 89, 26],
 [1, 90, 91, 9, 92, 93, 94, 95, 96, 97, 1, 98, 14],
 [99, 100, 1, 101, 102, 103, 18, 19, 86, 80, 104, 105, 26],
 [106, 107, 108, 109, 1, 110, 7, 111, 30, 112, 9, 113, 114, 14],
 [1, 115, 116, 117, 118, 119, 120, 7, 121, 122, 119, 1, 123, 14],
 [35, 16, 1, 124, 33, 125, 19, 20, 126, 30, 127, 128, 129, 86, 14],
 [125,
  130,
  131,
  132,
  7,
  133,
  134,
  135,
  7,
  136,
  137,
  138,
  139,
  140,
  7,
  141,
  86,
  142,
  109,
  143,
  144,
  7,
  145,
  30,
  1,
  146,
  14],
 [1, 147, 80, 14

In [51]:
# Show the first raw sentence next to its index encoding.
sentences[0],tokenized_sentences[0]

('The sun sets slowly behind the mountain, casting a warm glow over the valley.',
 [1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 12, 1, 13, 14])

In [52]:
# generating training sequence :
# Every non-empty prefix of each tokenized sentence becomes one training
# sequence (the final index of each prefix is split off as the target later on).
train_sequence = [
    sentence_ids[:prefix_len]
    for sentence_ids in tokenized_sentences
    for prefix_len in range(1, len(sentence_ids) + 1)
]
train_sequence

[[1],
 [1, 2],
 [1, 2, 3],
 [1, 2, 3, 4],
 [1, 2, 3, 4, 5],
 [1, 2, 3, 4, 5, 1],
 [1, 2, 3, 4, 5, 1, 6],
 [1, 2, 3, 4, 5, 1, 6, 7],
 [1, 2, 3, 4, 5, 1, 6, 7, 8],
 [1, 2, 3, 4, 5, 1, 6, 7, 8, 9],
 [1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10],
 [1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11],
 [1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 12],
 [1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 12, 1],
 [1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 12, 1, 13],
 [1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 12, 1, 13, 14],
 [15],
 [15, 16],
 [15, 16, 17],
 [15, 16, 17, 18],
 [15, 16, 17, 18, 19],
 [15, 16, 17, 18, 19, 20],
 [15, 16, 17, 18, 19, 20, 21],
 [15, 16, 17, 18, 19, 20, 21, 17],
 [15, 16, 17, 18, 19, 20, 21, 17, 22],
 [15, 16, 17, 18, 19, 20, 21, 17, 22, 23],
 [15, 16, 17, 18, 19, 20, 21, 17, 22, 23, 9],
 [15, 16, 17, 18, 19, 20, 21, 17, 22, 23, 9, 24],
 [15, 16, 17, 18, 19, 20, 21, 17, 22, 23, 9, 24, 25],
 [15, 16, 17, 18, 19, 20, 21, 17, 22, 23, 9, 24, 25, 26],
 [1],
 [1, 27],
 [1, 27, 28],
 [1, 27, 28, 29],
 [1, 27, 28, 29, 30]

In [53]:
# applying padding at the beginning of each sequence in train_sequence
# finding the length of the longest sequence :
max_size = max(len(t) for t in train_sequence)
# Left-pad with index 0 so that every sequence ends up with max_size entries.
# Note: 0 is also the index assigned to '<UNK>' in vocab.
for position, sequence_ids in enumerate(train_sequence):
    missing = max_size - len(sequence_ids)
    train_sequence[position] = [0] * missing + sequence_ids
train_sequence

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  3],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  3,
  4],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  3,
  4,
  5],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,


In [54]:
# fetching sequence and targets :
train_sequence = torch.tensor(train_sequence)
print(train_sequence.shape)
# The last column is the token to predict; every column before it is the
# (left-padded) context that is fed to the model.
sequences = train_sequence[:, :-1]
targets = train_sequence[:, -1]
sequences.shape, targets.shape

torch.Size([11973, 34])


(torch.Size([11973, 33]), torch.Size([11973]))

In [55]:
# Peek at the context tensor and the target tensor.
sequences,targets

(tensor([[   0,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    1],
         [   0,    0,    0,  ...,    0,    1,    2],
         ...,
         [   0,    0,    0,  ...,    7,  341,    1],
         [   0,    0,    0,  ...,  341,    1,  408],
         [   0,    0,    0,  ...,    1,  408, 1154]]),
 tensor([   1,    2,    3,  ...,  408, 1154,   14]))

In [56]:
# making custom dataset :
class CustomDataset(Dataset):
    """Dataset pairing each context sequence with its target index.

    All three constructor arguments are stored on the instance unchanged.
    ``vocab`` is only kept for reference; neither ``__len__`` nor
    ``__getitem__`` reads it.
    """

    def __init__(self,sequences,targets,vocab):
        """Store the inputs.

        Args:
            sequences: Indexable collection of model inputs, one per sample.
            targets: Indexable collection of targets; its ``shape[0]`` is used
                as the number of samples.
            vocab: Token-to-index mapping kept alongside the data.
        """
        self.sequences, self.targets, self.vocab = sequences, targets, vocab

    def __len__(self):
        """Return the number of samples (first dimension of ``targets``)."""
        sample_count = self.targets.shape[0]
        return sample_count

    def __getitem__(self,index):
        """Return the ``(sequence, target)`` pair stored at ``index``."""
        features = self.sequences[index]
        label = self.targets[index]
        return features, label

In [57]:
# Wrap the context tensor, the target tensor and the vocab in a CustomDataset.
dataset = CustomDataset(sequences,targets,vocab)

In [58]:
# Number of (sequence, target) samples in the dataset.
len(dataset)

11973

In [59]:
# Spot-check one sample as returned by the dataset's __getitem__.
dataset[1001]

(tensor([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,  20, 191, 217, 485,  30, 127,   9,
         220, 486,  85, 487, 488]),
 tensor(19))

In [60]:
# Same index read straight from the underlying tensors, for comparison with dataset[1001].
sequences[1001],targets[1001]

(tensor([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,  20, 191, 217, 485,  30, 127,   9,
         220, 486,  85, 487, 488]),
 tensor(19))

In [61]:
# creating the dataloader :
# Batches of 32 samples, drawn in shuffled order.
dataloader = DataLoader(dataset = dataset, batch_size = 32, shuffle = True)
# len(dataloader) is the number of batches per pass over the dataset.
len(dataloader)

375

In [63]:
# LSTM next-token model (instantiated further down with an embedding size of 150)
class lstm_model(nn.Module):
    """Embedding -> LSTM -> linear projection onto the vocabulary.

    Args:
        embeddings: Size of each token embedding vector.
        hidden: Size of the LSTM hidden state.
        vocab_size: Number of entries in the vocabulary; used both as the
            number of embeddings and as the size of the output layer.
    """

    def __init__(self,embeddings,hidden,vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.embd = nn.Embedding(vocab_size, embeddings)
        self.lstm = nn.LSTM(embeddings, hidden, batch_first=True)
        self.fully_connected = nn.Linear(hidden, vocab_size)

    def forward(self,text):
        """Score every vocabulary entry as the next token.

        Args:
            text: Integer tensor of token indices, batch dimension first.

        Returns:
            The linear layer applied to the LSTM's final hidden state. That
            state keeps its leading layer dimension, so the result has shape
            (1, batch, vocab_size) and callers squeeze dimension 0.
        """
        embedded = self.embd(text)
        # nn.LSTM returns (all hidden states, (final hidden state, final cell state));
        # only the final hidden state is needed here.
        _, (final_hidden, _) = self.lstm(embedded)
        return self.fully_connected(final_hidden)

In [65]:
# Prefer the GPU when one is available and fall back to the CPU otherwise, so
# this cell does not raise on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Embedding size 150, hidden size 250, one output score per vocabulary entry.
model = lstm_model(150,250,len(vocab))
model.to(device)

lstm_model(
  (embd): Embedding(1889, 150)
  (lstm): LSTM(150, 250, batch_first=True)
  (fully_connected): Linear(in_features=250, out_features=1889, bias=True)
)

In [67]:
# check if all layers have expected output shapes...
# Follow whatever device the model currently lives on instead of assuming 'cuda'.
model_device = next(model.parameters()).device
# This is a shape probe only, so no autograd graph is needed.
with torch.no_grad():
    x = dataset[100][0].unsqueeze(0).to(model_device)  # add a batch dimension of 1
    x = model.embd(x)
    print(x.shape)
    hidden,final = model.lstm(x)
    print(hidden.shape,final[0].shape,final[1].shape)
    logits_shape = model.fully_connected(final[0]).shape
logits_shape

torch.Size([1, 33, 150])
torch.Size([1, 33, 250]) torch.Size([1, 1, 250]) torch.Size([1, 1, 250])


torch.Size([1, 1, 1889])

In [68]:
# Cross-entropy loss for the next-token scores; Adam (lr = 0.001) over all model parameters.
loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params = model.parameters(), lr = 0.001)

In [69]:
epochs = 20
# Send batches to whatever device the model lives on instead of assuming 'cuda'.
model_device = next(model.parameters()).device
model.train()
for epoch in range(epochs):
    # Plain Python float: accumulating the loss tensor itself would keep every
    # batch's autograd graph alive for the whole epoch.
    epoch_loss = 0.0
    # Unpack into dedicated names; the previous loop variable `data` overwrote
    # the global corpus string of the same name.
    for batch_sequences, batch_targets in dataloader:
        optimizer.zero_grad()
        # The model returns (1, batch, vocab); drop the leading layer dimension.
        pred = model(batch_sequences.to(model_device)).squeeze(0)
        # Targets are already (batch,). Squeezing them would turn a batch of
        # size 1 into a 0-dim tensor that no longer matches pred.
        loss = loss_fun(pred, batch_targets.to(model_device))
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Report the mean batch loss after every second epoch.
    if epoch %2 == 1:
        print(f'Epoch : {epoch+1} | train_loss : {epoch_loss/len(dataloader)}')

Epoch : 2 | train_loss : 3.0774924755096436
Epoch : 4 | train_loss : 1.9271507263183594
Epoch : 6 | train_loss : 1.226694107055664
Epoch : 8 | train_loss : 0.8798696398735046
Epoch : 10 | train_loss : 0.7546950578689575
Epoch : 12 | train_loss : 0.7110781073570251
Epoch : 14 | train_loss : 0.6898009181022644
Epoch : 16 | train_loss : 0.6733013987541199
Epoch : 18 | train_loss : 0.6680858731269836
Epoch : 20 | train_loss : 0.6616161465644836


In [71]:
# making prediction :
def predict(model,vocab,text):
    """Predict the single most likely next token for ``text``.

    The text is encoded with ``token_to_index``, left-padded with index 0 to
    the context length used for training (``max_size - 1``), run through
    ``model``, and the vocabulary entry with the highest score is returned.

    Args:
        model: Network returning scores of shape (1, batch, vocab_size), as
            ``lstm_model`` does.
        vocab: Mapping from token string to integer index.
        text: Prompt to continue.

    Returns:
        The predicted next token as a string.

    Raises:
        KeyError: If the predicted index has no entry in ``vocab``.
    """
    # Training contexts had max_size - 1 entries (the last column was the target).
    context_len = max_size - 1
    # tokenize to indices :
    tokens = token_to_index(text,vocab)
    # Keep only the most recent tokens when the prompt is longer than the
    # training context; a negative pad count would otherwise silently produce
    # an unpadded, over-long input.
    if len(tokens) > context_len:
        tokens = tokens[len(tokens) - context_len:]
    # add paddings :
    padded_token_sequence = [0]*(context_len - len(tokens)) + tokens
    # Run on whatever device the model lives on instead of assuming 'cuda'.
    model_device = next(model.parameters()).device
    input_sequence = torch.tensor(padded_token_sequence, dtype=torch.long, device=model_device)
    pred = model(input_sequence.unsqueeze(0)).squeeze(0)  # (1, vocab_size)
    _, best = torch.max(pred,dim=1)
    index = int(best.item())
    # Invert the mapping explicitly rather than relying on dict insertion order.
    index_to_token = {idx: tok for tok, idx in vocab.items()}
    return index_to_token[index]

In [76]:
# Generation: repeatedly predict the next token and append it to the prompt,
# printing the growing text after every step.
model.eval()
with torch.no_grad():
    sequence = 20  # number of tokens to generate
    input_seq = 'Recycling conserves natural resources'
    for i in range(sequence):
        output = predict(model,vocab,input_seq)
        # Tokens are joined with a single space, so punctuation shows up space-separated.
        input_seq += ' '+output
        print(input_seq)

Recycling conserves natural resources ,
Recycling conserves natural resources , reduces
Recycling conserves natural resources , reduces landfill
Recycling conserves natural resources , reduces landfill waste
Recycling conserves natural resources , reduces landfill waste ,
Recycling conserves natural resources , reduces landfill waste , and
Recycling conserves natural resources , reduces landfill waste , and lowers
Recycling conserves natural resources , reduces landfill waste , and lowers greenhouse
Recycling conserves natural resources , reduces landfill waste , and lowers greenhouse gas
Recycling conserves natural resources , reduces landfill waste , and lowers greenhouse gas emissions
Recycling conserves natural resources , reduces landfill waste , and lowers greenhouse gas emissions ,
Recycling conserves natural resources , reduces landfill waste , and lowers greenhouse gas emissions , making
Recycling conserves natural resources , reduces landfill waste , and lowers greenhouse gas