In [1]:
import nltk
nltk.__version__

'3.9.1'

In [2]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [3]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes, and the explicit encoding avoids
# depending on the platform's default codec.
with open('general_speak.txt', 'r', encoding='utf-8') as corpus_file:
    data = corpus_file.read()
# Remove literal backslash-n sequences (the two characters '\' and 'n').
# Real newline characters are kept: the corpus is later split on them.
data = data.replace('\\n','')

In [4]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
# tokenizing the data using nltk
from nltk.tokenize import word_tokenize
tokens = word_tokenize(data.lower())

In [6]:
# Build the token -> index vocabulary.
# Index 0 is reserved for '<UNK>'; each distinct token then takes the next
# free index, in the order the Counter yields its keys.
from collections import Counter
count = Counter(tokens)  # token -> number of occurrences (one key per distinct token)
vocab = {'<UNK>': 0}
for token in count:
    vocab[token] = len(vocab)

In [7]:
len(vocab), len(count)

(1889, 1888)

In [8]:
# fetching all sentences in the data :
sentences = data.split('\n')

In [9]:
# text -> lowercase tokens -> vocabulary indices
def token_to_index(token:str,vocab):
    """Convert raw text into a list of vocabulary indices.

    Args:
        token: Text to encode (a whole sentence or phrase, despite the name).
        vocab: Mapping from token string to integer index. It must contain
            '<UNK>' whenever the text holds a token that is not in the mapping.

    Returns:
        One index per token produced by ``word_tokenize`` on the lowercased
        text; tokens missing from ``vocab`` map to ``vocab['<UNK>']``.
    """
    indices = []
    for word in word_tokenize(token.lower()):
        if word in vocab:
            indices.append(vocab[word])
        else:
            # Unknown words share the single '<UNK>' index.
            indices.append(vocab['<UNK>'])
    return indices

In [10]:
# Encode every sentence as a list of vocabulary indices, then show the first one.
tokenized_sentences = [token_to_index(sentence, vocab) for sentence in sentences]
tokenized_sentences[0]

[1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 12, 1, 13, 14]

In [11]:
sentences[0],tokenized_sentences[0]

('The sun sets slowly behind the mountain, casting a warm glow over the valley.',
 [1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 12, 1, 13, 14])

In [12]:
# Build the training sequences: every non-empty prefix of each encoded
# sentence becomes one sequence (prefix lengths 1 .. len(sentence)).
train_sequence = [
    encoded[:end]
    for encoded in tokenized_sentences
    for end in range(1, len(encoded) + 1)
]

In [13]:
# Left-pad every training sequence with 0 so that all sequences have the
# length of the longest one.
max_size = max(map(len, train_sequence))
for position, seq in enumerate(train_sequence):
    # The list is updated in place; each slot receives a new padded list.
    train_sequence[position] = [0] * (max_size - len(seq)) + seq

In [16]:
# Convert the padded sequences to a tensor, then split every row into its
# context (all columns but the last) and its target (the last column).
train_sequence = torch.tensor(train_sequence)
print(train_sequence.shape)
sequences = train_sequence[:, :-1]
targets = train_sequence[:, -1]
sequences.shape, targets.shape

torch.Size([11973, 34])


(torch.Size([11973, 33]), torch.Size([11973]))

In [14]:
# Custom dataset pairing each input sequence with its target token.
class CustomDataset(Dataset):
    """Map-style dataset of (input sequence, target token) pairs.

    Args:
        sequences: Indexable collection of input sequences, one per example.
        targets: Indexable collection of targets, aligned with ``sequences``.
        vocab: Token -> index mapping; stored on the instance but not used
            by the dataset itself.

    Raises:
        ValueError: If ``sequences`` and ``targets`` differ in length.
    """
    def __init__(self,sequences,targets,vocab):
        # Fail early: a length mismatch would otherwise only surface as an
        # index error part-way through iterating a DataLoader.
        if len(sequences) != len(targets):
            raise ValueError(
                f'sequences and targets must have the same length, '
                f'got {len(sequences)} and {len(targets)}'
            )
        self.sequences = sequences
        self.targets = targets
        self.vocab = vocab
    def __len__(self):
        """Return the number of examples."""
        return len(self.targets)
    def __getitem__(self,index):
        """Return the ``(sequence, target)`` pair stored at ``index``."""
        return self.sequences[index] , self.targets[index]

In [17]:
dataset = CustomDataset(sequences,targets,vocab)

In [18]:
len(dataset)

11973

In [21]:
# (training sequence , target token)
dataset[1001]  # show any item of the dataset

(tensor([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,  20, 191, 217, 485,  30, 127,   9,
         220, 486,  85, 487, 488]),
 tensor(19))

In [23]:
# Wrap the dataset in a DataLoader that yields shuffled mini-batches of 32
# examples, then show the number of batches per epoch.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
len(dataloader)

375

In [24]:
# Embedding -> LSTM -> linear projection onto the vocabulary.
class lstm_model(nn.Module):
    """Next-token model: token embedding, one LSTM, and a linear output layer.

    Args:
        embeddings: Size of each token embedding vector.
        hidden: Size of the LSTM hidden state.
        vocab_size: Number of entries in the vocabulary; used both as the
            embedding table size and as the output dimension.
    """
    def __init__(self,embeddings,hidden,vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.embd = nn.Embedding(vocab_size, embeddings)
        self.lstm = nn.LSTM(embeddings, hidden, batch_first=True)
        self.fully_connected = nn.Linear(hidden, vocab_size)
    def forward(self,text):
        """Score every vocabulary entry as the next token.

        Args:
            text: Integer tensor of token indices, batch dimension first.

        Returns:
            The linear layer applied to the LSTM's final hidden state. The
            per-step outputs and the final cell state are discarded. The
            result keeps the leading axis of the final hidden state, so
            callers see (1, batch, vocab_size).
        """
        embedded = self.embd(text)
        # nn.LSTM returns (all hidden states, (final hidden state, final cell state)).
        _, (last_hidden, _) = self.lstm(embedded)
        return self.fully_connected(last_hidden)

In [25]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# this cell does not crash on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Embedding size 150, hidden size 250, one output score per vocabulary entry.
model = lstm_model(150,250,len(vocab))
model.to(device)

lstm_model(
  (embd): Embedding(1889, 150)
  (lstm): LSTM(150, 250, batch_first=True)
  (fully_connected): Linear(in_features=250, out_features=1889, bias=True)
)

In [26]:
# Sanity check: push one dataset example through each layer and print the
# resulting shapes.
# Follow the device the model already lives on instead of assuming a GPU.
check_device = next(model.parameters()).device
# This is only an inspection, so no autograd graph needs to be recorded.
with torch.no_grad():
    x = dataset[100][0].unsqueeze(0).to(check_device)  # add a batch axis of size 1
    x = model.embd(x)
    print(x.shape)
    hidden,final = model.lstm(x)  # final = (final hidden state, final cell state)
    print(hidden.shape,final[0].shape,final[1].shape)
    check_logits = model.fully_connected(final[0])
check_logits.shape

torch.Size([1, 33, 150])
torch.Size([1, 33, 250]) torch.Size([1, 1, 250]) torch.Size([1, 1, 250])


torch.Size([1, 1, 1889])

In [28]:
# Loss: cross entropy over the vocabulary scores.
# Optimizer: Adam over all model parameters with learning rate 1e-3.
loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [29]:
# Train for 20 epochs and report the mean batch loss after every second epoch.
epochs = 20
# Follow the device the model's parameters live on instead of hard-coding one.
train_device = next(model.parameters()).device
model.train()
for epoch in range(epochs):
    epoch_loss = 0.0
    # Dedicated batch names keep the notebook-level `data` (the raw corpus
    # text) from being overwritten by the loop variable.
    for batch_sequences, batch_targets in dataloader:
        batch_sequences = batch_sequences.to(train_device)
        batch_targets = batch_targets.to(train_device)
        optimizer.zero_grad()
        # The model output carries a leading axis of size 1; drop only that axis.
        pred = model(batch_sequences).squeeze(0)
        # Targets are left 1-D: squeezing them would turn a batch of one into
        # a 0-d tensor that no longer matches the 2-D prediction.
        loss = loss_fun(pred, batch_targets)
        loss.backward()
        optimizer.step()
        # Accumulate a plain float so no autograd history is carried across batches.
        epoch_loss += loss.item()
    if epoch %2 == 1:
        print(f'Epoch : {epoch+1} | train_loss : {epoch_loss/len(dataloader)}')

Epoch : 2 | train_loss : 3.0516974925994873
Epoch : 4 | train_loss : 1.9016647338867188
Epoch : 6 | train_loss : 1.2000373601913452
Epoch : 8 | train_loss : 0.871799647808075
Epoch : 10 | train_loss : 0.7572396397590637
Epoch : 12 | train_loss : 0.7110127806663513
Epoch : 14 | train_loss : 0.6853783130645752
Epoch : 16 | train_loss : 0.67520672082901
Epoch : 18 | train_loss : 0.6670853495597839
Epoch : 20 | train_loss : 0.6646623015403748


In [30]:
# Predict the next token for a prompt.
def predict(model,vocab,text,max_len=None):
    """Return the single most likely next token for ``text``.

    Args:
        model: Network called with a (1, max_len) tensor of token indices.
            Its output is expected to squeeze to (1, vocab_size), as the
            ``lstm_model`` defined in this notebook produces.
        vocab: Mapping from token string to integer index.
        text: Prompt to continue.
        max_len: Number of token positions fed to the model. Defaults to
            ``max_size - 1``, the input length used for the training
            sequences.

    Returns:
        The vocabulary token whose index received the highest score.
    """
    if max_len is None:
        max_len = max_size - 1

    # Tokenize to indices.
    tokens = token_to_index(text,vocab)
    # Keep only the most recent tokens when the prompt is too long; without
    # this the pad count below goes negative, which silently means "no
    # padding" and feeds the model a longer input than it was trained on.
    if len(tokens) > max_len:
        tokens = tokens[len(tokens) - max_len:]
    # Left-pad with 0 up to the fixed input length.
    padded_token_sequence = [0]*(max_len - len(tokens)) + tokens

    # Run on whichever device the model lives on.
    model_device = next(model.parameters()).device
    input_sequence = torch.tensor(padded_token_sequence, dtype=torch.long, device=model_device)

    with torch.no_grad():
        pred = model(input_sequence.unsqueeze(0)).squeeze(0)

    _, index = torch.max(pred,dim=1)  # position of the highest score
    # Look the token up by its stored index rather than by dict ordering.
    index_to_token = {idx: tok for tok, idx in vocab.items()}
    return index_to_token[int(index[0])]

In [31]:
# Greedy generation: extend the prompt by one predicted token per step for
# 20 steps, printing the growing text after each step.
model.eval()
sequence = 20
input_seq = 'Recycling conserves natural resources'
with torch.no_grad():
    for step in range(sequence):
        output = predict(model, vocab, input_seq)
        input_seq = ' '.join([input_seq, output])
        print(input_seq)

Recycling conserves natural resources ,
Recycling conserves natural resources , reduces
Recycling conserves natural resources , reduces landfill
Recycling conserves natural resources , reduces landfill waste
Recycling conserves natural resources , reduces landfill waste ,
Recycling conserves natural resources , reduces landfill waste , and
Recycling conserves natural resources , reduces landfill waste , and lowers
Recycling conserves natural resources , reduces landfill waste , and lowers greenhouse
Recycling conserves natural resources , reduces landfill waste , and lowers greenhouse gas
Recycling conserves natural resources , reduces landfill waste , and lowers greenhouse gas emissions
Recycling conserves natural resources , reduces landfill waste , and lowers greenhouse gas emissions ,
Recycling conserves natural resources , reduces landfill waste , and lowers greenhouse gas emissions , making
Recycling conserves natural resources , reduces landfill waste , and lowers greenhouse gas