In [25]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
!apt-get install -y git

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.10).
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.


In [27]:
!git clone https://github.com/Tanio253/Simple-NER-from-scatch.git

Cloning into 'Simple-NER-from-scatch'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (3/3), done.
fatal: destination path 'Simple-NER-from-scatch' already exists and is not an empty directory.


In [28]:
import torch
import pprint
from torch.utils.data import DataLoader
from functools import partial
pp = pprint.PrettyPrinter()

In [29]:
corpus = ['Lily will go to America on Saturday!',
             'Tom sits next to Sarah on the math class',
             'Soyeon will never forget her trip to Japan with her family',
             'Thanh will marry Nghia on June 16th, wont he',
             'My mom went shopping with Olivia, which is my aunt, on Sunday afternoon, did she?']

In [30]:
names = ["lily","tom","sarah","soyeon","thanh","nghia","olivia"]

In [31]:
def custom_tokenization(sentence):
  #lowercase the word
  tokens = sentence.lower()
  #remove punctuation
  punctuation = ['?','!',","]
  for p in punctuation:
    tokens = tokens.replace(p,'')
  #split to token
  tokens = tokens.split()
  return tokens

In [32]:
train_sentences = [custom_tokenization(sentence) for sentence in corpus]
print(train_sentences)

[['lily', 'will', 'go', 'to', 'america', 'on', 'saturday'], ['tom', 'sits', 'next', 'to', 'sarah', 'on', 'the', 'math', 'class'], ['soyeon', 'will', 'never', 'forget', 'her', 'trip', 'to', 'japan', 'with', 'her', 'family'], ['thanh', 'will', 'marry', 'nghia', 'on', 'june', '16th', 'wont', 'he'], ['my', 'mom', 'went', 'shopping', 'with', 'olivia', 'which', 'is', 'my', 'aunt', 'on', 'sunday', 'afternoon', 'did', 'she']]


In [33]:
train_labels = [[1 if word in names else 0 for word in sentence] for sentence in train_sentences]
pp.pprint(train_labels)

[[1, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [34]:
vocabulary = set(sorted([word for sentence in train_sentences for word in sentence]))

In [35]:
# Add the unknown token and padden token so that we can use window slide comfortably
vocabulary.add("<unk>")
vocabulary.add("<pad>")

In [36]:
ind2word = {ind:word for ind,word in enumerate(vocabulary)}
word2ind = {word:ind for ind,word in enumerate(vocabulary)}
def token2id(sentence):
  # ind = []
  # for sentence in sentences:
  #   index = []
  #   for word in sentence:
  #     print(word)
  #     index.append(word2ind[word])
  #   ind.append(index)
  # return ind
  return [word2ind[word] if word in vocabulary else word2ind["<unk>"] for word in sentence]
print(train_sentences)
train_sentences_ind = [token2id(sentence) for sentence in train_sentences]
print(train_sentences_ind)
# pp.pprint(ind2word[vocab[0][1]])

[['lily', 'will', 'go', 'to', 'america', 'on', 'saturday'], ['tom', 'sits', 'next', 'to', 'sarah', 'on', 'the', 'math', 'class'], ['soyeon', 'will', 'never', 'forget', 'her', 'trip', 'to', 'japan', 'with', 'her', 'family'], ['thanh', 'will', 'marry', 'nghia', 'on', 'june', '16th', 'wont', 'he'], ['my', 'mom', 'went', 'shopping', 'with', 'olivia', 'which', 'is', 'my', 'aunt', 'on', 'sunday', 'afternoon', 'did', 'she']]
[[14, 25, 26, 39, 20, 22, 3], [31, 12, 29, 39, 38, 22, 16, 21, 1], [18, 25, 40, 35, 24, 42, 39, 28, 37, 24, 36], [2, 25, 7, 10, 22, 15, 41, 4, 8], [23, 27, 17, 32, 37, 9, 34, 0, 23, 30, 22, 13, 5, 19, 11]]


In [37]:
def padded_sentence(sentence,window_slide = 2):
  pad_sentence = window_slide*[word2ind["<pad>"]]
  return pad_sentence+sentence+pad_sentence

In [38]:
pad_ind = word2ind["<pad>"]
unk_ind = word2ind["<unk>"]

In [39]:
def custom_fn(batches,window_size,pad_ind):
  x_batch, y_batch = zip(*batches)
  # x_batch
  x_batch = [custom_tokenization(x) for x in x_batch] #tokenization
  x_batch = [token2id(x) for x in x_batch]
  x_batch = [padded_sentence(x_batch,window_size) for x_batch in x_batch] #fill pad to either side of x_batch
  x_batch = [torch.tensor(x_batch) for x_batch in x_batch]
  padded_x_batch = torch.nn.utils.rnn.pad_sequence(x_batch,batch_first = True,padding_value = pad_ind)
  #label
  lengths = [len(y) for y in y_batch] if len(batches)>1 else [len(y_batch)]
  y_batch = [torch.tensor(y_batch) for y_batch in y_batch]
  padded_y_batch = torch.nn.utils.rnn.pad_sequence(y_batch,batch_first = True,padding_value = 0)
  return padded_x_batch,padded_y_batch, lengths

In [40]:
train_sentences = corpus.copy()
batch_size = 2
shuffle = True
window_size = 2
pad_ind = word2ind['<pad>']
collate_fn = partial(custom_fn,window_size = window_size,pad_ind = pad_ind)
data = list(zip(train_sentences,train_labels))
data = DataLoader(data,batch_size = batch_size, shuffle = shuffle,collate_fn = collate_fn)
for x_batch, y_batch, _ in data:
  print(x_batch)
  print(y_batch)

tensor([[33, 33, 23, 27, 17, 32, 37,  9, 34,  0, 23, 30, 22, 13,  5, 19, 11, 33,
         33],
        [33, 33, 18, 25, 40, 35, 24, 42, 39, 28, 37, 24, 36, 33, 33, 33, 33, 33,
         33]])
tensor([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[33, 33,  2, 25,  7, 10, 22, 15, 41,  4,  8, 33, 33],
        [33, 33, 31, 12, 29, 39, 38, 22, 16, 21,  1, 33, 33]])
tensor([[1, 0, 0, 1, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0, 0, 0, 0]])
tensor([[33, 33, 14, 25, 26, 39, 20, 22,  3, 33, 33]])
tensor([[1, 0, 0, 0, 0, 0, 0]])


In [41]:
import torch.nn as nn
class location_reg(nn.Module):
  def __init__(self, hidden_dim, output_dim, hyperparams, vocab_size, embed_dim):
    super(location_reg,self).__init__()
    self.freeze_embed = hyperparams["freeze_embed"]
    self.window_size = hyperparams["window_size"]
    # Embedding layer
    self.embed = nn.Embedding(vocab_size,embed_dim,_freeze = self.freeze_embed)
    # Hidden layer
    self.hidden = nn.Sequential(
        nn.Linear((self.window_size*2+1)*embed_dim,hidden_dim),
        nn.Tanh(),
    )
    # Output layer (B,L~,H)
    self.output = nn.Linear(hidden_dim,1)
    self.probability = nn.Sigmoid()
  def forward(self,batch):
    B, L = batch.shape
    batch_unfold = batch.unfold(1,self.window_size*2+1,1) # B,L~,W
    _, adjusted_length, _ = batch_unfold.size()
    out = self.embed(batch_unfold)
    out = out.view(B,adjusted_length,-1)
    out = self.hidden(out)
    out = self.output(out)
    out = self.probability(out)
    out = out.view(B,-1)
    return out

In [42]:
def loss_func(x_batch,y_batch,lengths):
  bce = nn.BCELoss()
  # print(x_batch)
  # print(y_batch)
  loss = bce(x_batch,y_batch)
  loss /= torch.sum(torch.tensor(lengths)).float()
  return loss

In [43]:
hidden_dim = 64
output_dim = 128
embed_dim = 32
hyperparams = {
    "freeze_embed":False,
    'window_size':2,
    'learning_rate':5e-4,
}
vocab_size = len(vocabulary)
model = location_reg(hidden_dim,output_dim,hyperparams,vocab_size,embed_dim)

In [44]:
optimizer = torch.optim.Adam(model.parameters(),hyperparams["learning_rate"])
def train_epoch(model,loader,optimizer):
  total_loss = 0
  for x_batch, y_batch, lengths in loader:
    optimizer.zero_grad()
    output = model(x_batch)
    loss = loss_func(output.float(),y_batch.float(),lengths)
    loss.backward()
    optimizer.step() # optimizer the parameters
    total_loss+=loss.item()
  return total_loss
def train(model,loader,optimizer,num_epochs = 1000):
  for e in range(num_epochs):
    loss = train_epoch(model,loader,optimizer)
    if (e%100==0): print("Loss at epoch %d is: %.8f" %(e,loss))


In [45]:
train(model,data,optimizer)

Loss at epoch 0 is: 0.81916051
Loss at epoch 100 is: 0.02390070
Loss at epoch 200 is: 0.00428474
Loss at epoch 300 is: 0.00190794
Loss at epoch 400 is: 0.00229514
Loss at epoch 500 is: 0.00148804
Loss at epoch 600 is: 0.00081343
Loss at epoch 700 is: 0.00071269
Loss at epoch 800 is: 0.00055020
Loss at epoch 900 is: 0.00043352


In [46]:
# For prediction
test_sent = ["Lisa will come to Thailand on Sunday","Helios is now the king of Jordan"]
test_label = [[1,0,0,0,0,0,0],[1,0,0,0,0,0,0]]
test_batch = list(zip(test_sent,test_label))
batch_size = 2
print(test_batch)
shuffle = True
test_ld = DataLoader(test_batch,batch_size = batch_size,shuffle = shuffle,collate_fn = collate_fn)
for x_batch, y_batch , _ in test_ld:
  print(model(x_batch))
  print(y_batch)

[('Lisa will come to Thailand on Sunday', [1, 0, 0, 0, 0, 0, 0]), ('Helios is now the king of Jordan', [1, 0, 0, 0, 0, 0, 0])]
tensor([[1.1395e-01, 9.9435e-01, 1.8362e-02, 9.1821e-01, 9.6556e-01, 3.5669e-01,
         5.6825e-04],
        [9.7992e-01, 1.2436e-01, 1.3700e-03, 1.1360e-02, 1.9920e-01, 2.0421e-04,
         8.1018e-05]], grad_fn=<ViewBackward0>)
tensor([[1, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0]])
