In [1]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:

import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import nltk
from nltk.tokenize import word_tokenize

In [4]:
device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [5]:
document = """The sun peeked over the horizon. The sky turned golden. Birds stretched their wings. A cool breeze swept through the trees. Leaves rustled. The air smelled fresh.

A cat sat on the fence. It watched the street. A car rolled past. Its engine hummed softly. A cyclist swerved to avoid a puddle. The road was still wet from last night’s rain.

Emma walked outside. She zipped up her coat. Her breath formed a cloud. The city was waking up. A bakery smelled of warm bread. A man sipped his coffee. A child tugged at his mother’s sleeve.

The bus arrived. People rushed in. Emma took a window seat. The bus moved. Buildings blurred past. A woman read a book. A man checked his watch. A teenager tapped on his phone.

Traffic lights turned red. Cars lined up. A truck honked loudly. A pigeon fluttered away. The bus stopped. More passengers entered. Emma looked outside. The park was empty. The fountain was dry.

A signboard caught her eye. “Missing Person.” The face looked familiar. She frowned. Where had she seen it before? She searched her memory. Nothing came. The bus moved again.

At the next stop, she got off. The street smelled of coffee. Vendors arranged their goods. A boy chased a balloon. A street musician tuned his guitar. A police siren wailed in the distance.

She walked into the library. It was quiet. A librarian smiled at her. She picked a book. The pages felt rough. A note slipped out. She picked it up. It had only three words: “Meet me outside.”

She looked around. No one was watching. Her heart pounded. She took a deep breath. She folded the note. She put it in her pocket.

Outside, the wind was stronger. The trees swayed. A dog barked. She scanned the street. No one seemed unusual. But something felt wrong.

A figure stood near the bench. He wore a dark coat. His hands were in his pockets. She took a step forward. He turned. Their eyes met.

Everything suddenly made sense."""

In [6]:
sentences = document.split('.')
sentences = [sentence.strip() for sentence in sentences]

for i in sentences:
  print(i)


The sun peeked over the horizon
The sky turned golden
Birds stretched their wings
A cool breeze swept through the trees
Leaves rustled
The air smelled fresh
A cat sat on the fence
It watched the street
A car rolled past
Its engine hummed softly
A cyclist swerved to avoid a puddle
The road was still wet from last night’s rain
Emma walked outside
She zipped up her coat
Her breath formed a cloud
The city was waking up
A bakery smelled of warm bread
A man sipped his coffee
A child tugged at his mother’s sleeve
The bus arrived
People rushed in
Emma took a window seat
The bus moved
Buildings blurred past
A woman read a book
A man checked his watch
A teenager tapped on his phone
Traffic lights turned red
Cars lined up
A truck honked loudly
A pigeon fluttered away
The bus stopped
More passengers entered
Emma looked outside
The park was empty
The fountain was dry
A signboard caught her eye
“Missing Person
” The face looked familiar
She frowned
Where had she seen it before? She searched her memo

In [7]:
word_tokenize(sentences[0])

['The', 'sun', 'peeked', 'over', 'the', 'horizon']

In [44]:

vocab = {}
cnt=0
for sentence in sentences:
  words = word_tokenize(sentence)
  for word in words:
    if word not in vocab:

      vocab[word] = cnt
      cnt+=1

print(f"len of vocab : {len(vocab)}")


len of vocab : 221


In [45]:
def vocab_and_indexing(a,vocab):
  arr=[]
  for words in word_tokenize(a):
    arr.append(vocab[words])
  return arr

In [46]:


def padding(arr,arr_len):
  return arr+[0]*(arr_len-len(arr))

In [47]:
x=[]
y=[]
for sentence in sentences:
  arr = vocab_and_indexing(sentence,vocab)
  #print(arr)
  for i in range(1,len(arr)):
    x.append(padding(arr[:i],16))
    y.append(arr[i])




In [48]:
x= torch.tensor(x,dtype=torch.float32)
y= torch.tensor(y,dtype=torch.long)

In [49]:
print(y.max())
x.shape,y.shape


tensor(220)


(torch.Size([277, 16]), torch.Size([277]))

In [50]:

class custom_dataset(Dataset):
  def __init__(self,x,y):
    self.x = x
    self.y = y
    #print(self.x.shape)
  def __len__(self):
    return self.x.shape[0]
  def __getitem__(self,idx):
    return self.x[idx],self.y[idx]

In [51]:
data = custom_dataset(x,y)
data= DataLoader(data,batch_size=3,shuffle=False)

In [52]:
for a,b in data:
  break
a=torch.tensor(a,dtype=torch.long)
b=torch.tensor(b,dtype=torch.long)
a= nn.Embedding(len(vocab),100)(a)
a.shape

  a=torch.tensor(a,dtype=torch.long)
  b=torch.tensor(b,dtype=torch.long)


torch.Size([3, 16, 100])

In [53]:
len_vocab= len(vocab)
len_vocab

221

In [83]:
class my_model(nn.Module):
  def __init__(self,len_vocab):
    super(my_model,self).__init__()
    self.embedding = nn.Embedding(len_vocab,100,padding_idx=0)
    self.lstm = nn.LSTM(100,128,batch_first=True)
    self.linear = nn.Linear(128,len_vocab)
  def forward(self,x):
    x = self.embedding(x)
    intermeddiate_hidden_states ,(final_hidden_state,final_layer_output) = self.lstm(x)
    x = self.linear(final_hidden_state.squeeze(0))
    return x

In [84]:

model = my_model(len_vocab)

In [85]:
optim = torch.optim.ADAM(model.parameters(),lr=0.10)
loss_fn = nn.CrossEntropyLoss()

In [86]:
print(f"Min index in x: {x.min()}, Max index in x: {x.max()}")


Min index in x: 0, Max index in x: 98


In [87]:
len(vocab)

221

In [92]:
for i in range(100):
    net_loss = 0
    for x, y in data:
        # Move data to the device first
        x = x.long()
        y = y.long()

        # Now pass the data to the model
        output = model(x)


        loss = loss_fn(output, y)

        optim.zero_grad()
        loss.backward()
        optim.step()
        net_loss += loss.item()
    print(f"epoch : {i} loss : {net_loss/len(data)}")


epoch : 0 loss : 5.266613991029801
epoch : 1 loss : 5.246615727742513
epoch : 2 loss : 5.22968041512274
epoch : 3 loss : 5.214925822391305
epoch : 4 loss : 5.202412297648769
epoch : 5 loss : 5.192342655633086
epoch : 6 loss : 5.184626876666981
epoch : 7 loss : 5.178765271299628
epoch : 8 loss : 5.174146252293741
epoch : 9 loss : 5.170327796730944
epoch : 10 loss : 5.167058478119553
epoch : 11 loss : 5.164196701459987
epoch : 12 loss : 5.161654549260294
epoch : 13 loss : 5.159371565747005
epoch : 14 loss : 5.157303507610034
epoch : 15 loss : 5.155416657847743
epoch : 16 loss : 5.153684277688304
epoch : 17 loss : 5.152084709495626
epoch : 18 loss : 5.150600248767484
epoch : 19 loss : 5.149216328897784
epoch : 20 loss : 5.147920480338476
epoch : 21 loss : 5.146702433145174
epoch : 22 loss : 5.145553117157311
epoch : 23 loss : 5.144464795307447
epoch : 24 loss : 5.143430776493524
epoch : 25 loss : 5.1424449951417985
epoch : 26 loss : 5.14150191891578
epoch : 27 loss : 5.140596753807478
epo

In [99]:
input = "took a "


In [100]:

def predict(input):
  text = vocab_and_indexing(input,vocab)
  text = torch.tensor(padding(text,16),dtype=torch.long)
  text = text.unsqueeze(0)
  #print(text)
  output = model(text)
  output = torch.argmax(output,dim=1)
  word=[i for i in vocab.keys()][output]
  return word

In [101]:
for i in range(5):
  output = predict(input)
  input = input + " " + output
  print(input)


took a  a
took a  a a
took a  a a a
took a  a a a a
took a  a a a a a
