In [158]:
import torch
import pprint
from torch.utils.data import DataLoader
from functools import partial
pp = pprint.PrettyPrinter()

In [159]:
# corpus = ['Lily will go to America on Saturday!',
#              'My sister sits next to Sarah on the math class',
#              'My mom will never forget her trip to Japan with her family',
#              'he will marry Nghia on June 16th, wont Thanh',
#              'My mom went shopping with Olivia, which is my aunt, on Sunday afternoon, did she?',
#             'What did Anna do last night?',
#              'The cop will catch up the burglar, whose name is Kaka, is a famouse footballer.',
#              'Why are you going to Hoang Sa island?']

In [160]:
corpus = [
    "Jennifer and Michael visited the Eiffel Tower in Paris last summer.",
    "John and Emily plan to travel to London for their anniversary.",
    "Elon Musk is the CEO of SpaceX and Tesla, Inc.",
    "Dr. Smith will speak at the conference in New York next week.",
    "Alice works as a software engineer at Google.",
    "The famous painter, Pablo Picasso, was born in Spain.",
    "I met Michelle Obama at the charity event last month.",
    "Steve Jobs co-founded Apple Inc. with Steve Wozniak.",
    "Mount Everest is the highest peak in the world, located in Nepal.",
    "My favorite actor is Tom Hanks.",
    "Taylor Swift won the Grammy Award for Album of the Year.",
    "The Great Wall of China is a UNESCO World Heritage Site.",
    "Queen Elizabeth II has been the reigning monarch of the United Kingdom since 1952.",
    "Stephen Hawking was a renowned theoretical physicist.",
    "Harrison Ford played the role of Indiana Jones in the movie series.",
]


In [161]:
#names = ["lily","sarah","thanh","nghia","olivia",'anna','kaka']
names = ['jennifer','michael','john','emily','elon','musk','smith','alice','pablo','picasso','michelle','steve','jobs','steve','wozniak','tom','hanks','taylor','swift','elizabeth','stephen','hawking','harrison','ford']

In [162]:
def custom_tokenization(sentence):
  #lowercase the word
  tokens = sentence.lower()
  #remove punctuation
  punctuation = ['?','!',",",'.']
  for p in punctuation:
    tokens = tokens.replace(p,'')
  #split to token
  tokens = tokens.split()
  return tokens

In [163]:
train_sentences = [custom_tokenization(sentence) for sentence in corpus] #test tokenization
print(train_sentences)

[['jennifer', 'and', 'michael', 'visited', 'the', 'eiffel', 'tower', 'in', 'paris', 'last', 'summer'], ['john', 'and', 'emily', 'plan', 'to', 'travel', 'to', 'london', 'for', 'their', 'anniversary'], ['elon', 'musk', 'is', 'the', 'ceo', 'of', 'spacex', 'and', 'tesla', 'inc'], ['dr', 'smith', 'will', 'speak', 'at', 'the', 'conference', 'in', 'new', 'york', 'next', 'week'], ['alice', 'works', 'as', 'a', 'software', 'engineer', 'at', 'google'], ['the', 'famous', 'painter', 'pablo', 'picasso', 'was', 'born', 'in', 'spain'], ['i', 'met', 'michelle', 'obama', 'at', 'the', 'charity', 'event', 'last', 'month'], ['steve', 'jobs', 'co-founded', 'apple', 'inc', 'with', 'steve', 'wozniak'], ['mount', 'everest', 'is', 'the', 'highest', 'peak', 'in', 'the', 'world', 'located', 'in', 'nepal'], ['my', 'favorite', 'actor', 'is', 'tom', 'hanks'], ['taylor', 'swift', 'won', 'the', 'grammy', 'award', 'for', 'album', 'of', 'the', 'year'], ['the', 'great', 'wall', 'of', 'china', 'is', 'a', 'unesco', 'world'

In [164]:
train_labels = [[1 if word in names else 0 for word in sentence] for sentence in train_sentences] #test label
pp.pprint(train_labels)

[[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 0, 0, 0, 0, 1, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 1],
 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 0, 0, 0, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [165]:
vocabulary = set(sorted([word for sentence in train_sentences for word in sentence]))

In [166]:
# Add the unknown token and padden token so that we can use window slide comfortably
vocabulary.add("<unk>")
vocabulary.add("<pad>")

In [167]:
ind2word = {ind:word for ind,word in enumerate(vocabulary)}
word2ind = {word:ind for ind,word in enumerate(vocabulary)}
def token2id(sentence):
  # ind = []
  # for sentence in sentences:
  #   index = []
  #   for word in sentence:
  #     print(word)
  #     index.append(word2ind[word])
  #   ind.append(index)
  # return ind
  return [word2ind[word] if word in vocabulary else word2ind["<unk>"] for word in sentence]
print(train_sentences)
train_sentences_ind = [token2id(sentence) for sentence in train_sentences]
pp.pprint(train_sentences_ind)
# pp.pprint(ind2word[vocab[0][1]])

[['jennifer', 'and', 'michael', 'visited', 'the', 'eiffel', 'tower', 'in', 'paris', 'last', 'summer'], ['john', 'and', 'emily', 'plan', 'to', 'travel', 'to', 'london', 'for', 'their', 'anniversary'], ['elon', 'musk', 'is', 'the', 'ceo', 'of', 'spacex', 'and', 'tesla', 'inc'], ['dr', 'smith', 'will', 'speak', 'at', 'the', 'conference', 'in', 'new', 'york', 'next', 'week'], ['alice', 'works', 'as', 'a', 'software', 'engineer', 'at', 'google'], ['the', 'famous', 'painter', 'pablo', 'picasso', 'was', 'born', 'in', 'spain'], ['i', 'met', 'michelle', 'obama', 'at', 'the', 'charity', 'event', 'last', 'month'], ['steve', 'jobs', 'co-founded', 'apple', 'inc', 'with', 'steve', 'wozniak'], ['mount', 'everest', 'is', 'the', 'highest', 'peak', 'in', 'the', 'world', 'located', 'in', 'nepal'], ['my', 'favorite', 'actor', 'is', 'tom', 'hanks'], ['taylor', 'swift', 'won', 'the', 'grammy', 'award', 'for', 'album', 'of', 'the', 'year'], ['the', 'great', 'wall', 'of', 'china', 'is', 'a', 'unesco', 'world'

In [168]:
def padded_sentence(sentence,window_slide = 2):
  pad_sentence = window_slide*[word2ind["<pad>"]]
  return pad_sentence+sentence+pad_sentence

In [169]:
pad_ind = word2ind["<pad>"]
unk_ind = word2ind["<unk>"]

In [170]:
def custom_fn(batches,window_size,pad_ind):
  x_batch, y_batch = zip(*batches)
  # x_batch
  x_batch = [custom_tokenization(x) for x in x_batch] #tokenization
  x_batch = [token2id(x) for x in x_batch]
  x_batch = [padded_sentence(x_batch,window_size) for x_batch in x_batch] #fill pad to either side of x_batch
  x_batch = [torch.tensor(x_batch) for x_batch in x_batch]
  padded_x_batch = torch.nn.utils.rnn.pad_sequence(x_batch,batch_first = True,padding_value = pad_ind)
  #label
  lengths = [len(y) for y in y_batch] if len(batches)>1 else [len(y_batch)]
  y_batch = [torch.tensor(y_batch) for y_batch in y_batch]
  padded_y_batch = torch.nn.utils.rnn.pad_sequence(y_batch,batch_first = True,padding_value = 0)
  return padded_x_batch,padded_y_batch, lengths

In [171]:
batch_size = 2
shuffle = True
window_size = 2
pad_ind = word2ind['<pad>']
collate_fn = partial(custom_fn,window_size = window_size,pad_ind = pad_ind)
data = list(zip(corpus,train_labels))
data = DataLoader(data,batch_size = batch_size, shuffle = shuffle,collate_fn = collate_fn)
for x_batch, y_batch, _ in data:
  print(x_batch)
  print(y_batch)

tensor([[ 89,  89,  23, 112,  98,   3,  66,  50,  90, 110,  10,  83,  89,  89,
          89,  89,  89,  89],
        [ 89,  89, 102,  35,  96,  77,   9,  50,  22,  24,  86,  50,  88,  67,
          19,   4,  89,  89]])
tensor([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[ 89,  89,  21,  27,  41,  50,  82,  86,  42,   1,  28,  50, 105,  59,
          89,  89],
        [ 89,  89,  52,  85,  68,  50,  17,  81,  28,  50, 106,  49,  28,  13,
          89,  89]])
tensor([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[ 89,  89, 109,  73,  68,  50,  97,  86,  12,   2,   6,  32,  89,  89,
          89],
        [ 89,  89,  50,  55,  79,  86,  93,  68,  69,  58, 106,  78,  39,  89,
          89]])
tensor([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[ 89,  89,  56,  29,  33,  69, 104,  43,  66,  64,  89,  89],
        [ 89,  89,  46,  30,  11,  99,

In [172]:
import torch.nn as nn
class name_reg(nn.Module):
  def __init__(self, hidden_dim, output_dim, hyperparams, vocab_size, embed_dim):
    super(location_reg,self).__init__()
    self.freeze_embed = hyperparams["freeze_embed"]
    self.window_size = hyperparams["window_size"]
    # Embedding layer
    self.embed = nn.Embedding(vocab_size,embed_dim,_freeze = self.freeze_embed)
    # Hidden layer
    self.hidden = nn.Sequential(
        nn.Linear((self.window_size*2+1)*embed_dim,hidden_dim),
        nn.Tanh(),
    )
    # Output layer (B,L~,H)
    self.output = nn.Linear(hidden_dim,1)
    self.probability = nn.Sigmoid()
  def forward(self,batch):
    B, L = batch.shape
    batch_unfold = batch.unfold(1,self.window_size*2+1,1) # B,L~,W
    _, adjusted_length, _ = batch_unfold.size()
    out = self.embed(batch_unfold)
    out = out.view(B,adjusted_length,-1)
    out = self.hidden(out)
    out = self.output(out)
    out = self.probability(out)
    out = out.view(B,-1)
    return out

In [173]:
def loss_func(x_batch,y_batch,lengths):
  bce = nn.BCELoss()
  # print(x_batch)
  # print(y_batch)
  loss = bce(x_batch,y_batch)
  loss /= torch.sum(torch.tensor(lengths)).float()
  return loss

In [174]:
hidden_dim = 64
output_dim = 128
embed_dim = 32
hyperparams = {
    "freeze_embed":False,
    'window_size':2,
    'learning_rate':5e-4,
}
vocab_size = len(vocabulary)
model = location_reg(hidden_dim,output_dim,hyperparams,vocab_size,embed_dim)

In [175]:
optimizer = torch.optim.Adam(model.parameters(),hyperparams["learning_rate"])
def train_epoch(model,loader,optimizer):
  total_loss = 0
  for x_batch, y_batch, lengths in loader:
    optimizer.zero_grad()
    output = model(x_batch)
    loss = loss_func(output.float(),y_batch.float(),lengths)
    loss.backward()
    optimizer.step() # optimizer the parameters
    total_loss+=loss.item()
  return total_loss
def train(model,loader,optimizer,num_epochs = 1000):
  for e in range(num_epochs):
    loss = train_epoch(model,loader,optimizer)
    if (e%100==0): print("Loss at epoch %d is: %.8f" %(e,loss))


In [176]:
train(model,data,optimizer)

Loss at epoch 0 is: 0.89187053
Loss at epoch 100 is: 0.07151346
Loss at epoch 200 is: 0.00719415
Loss at epoch 300 is: 0.00456269
Loss at epoch 400 is: 0.00086311
Loss at epoch 500 is: 0.00093357
Loss at epoch 600 is: 0.00070348
Loss at epoch 700 is: 0.00010459
Loss at epoch 800 is: 0.00010680
Loss at epoch 900 is: 0.00006843


In [178]:
# For prediction
test_sent = ["Lisa will come to Thailand on Sunday","I met Jordan at the Washington airport",'What did Anna do last night?']
test_label = [[1,0,0,0,0,0,0],[0,0,1,0,0,0,0],[0,0,1,0,0,0]]
test_batch = list(zip(test_sent,test_label))
batch_size = 2
print(test_batch)
shuffle = True
test_ld = DataLoader(test_batch,batch_size = batch_size,shuffle = shuffle,collate_fn = collate_fn)
for x_batch, y_batch , _ in test_ld:
  print(model(x_batch))
  print(y_batch)

[('Lisa will come to Thailand on Sunday', [1, 0, 0, 0, 0, 0, 0]), ('I met Jordan at the Washington airport', [0, 0, 1, 0, 0, 0, 0]), ('What did Anna do last night?', [0, 0, 1, 0, 0, 0])]
tensor([[1.2942e-03, 6.6517e-02, 3.0294e-01, 2.1764e-06, 2.0040e-06, 2.4670e-06,
         1.3860e-05],
        [9.8770e-01, 9.5520e-01, 9.6007e-01, 8.0925e-02, 7.5799e-01, 1.0093e-01,
         4.9570e-01]], grad_fn=<ViewBackward0>)
tensor([[0, 0, 1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0]])
tensor([[0.9488, 0.9945, 0.9973, 0.5066, 0.0379, 0.0095]],
       grad_fn=<ViewBackward0>)
tensor([[0, 0, 1, 0, 0, 0]])
