# Data Pre-processing

In [None]:
# data from https://www.cs.utexas.edu/~ai-lab/pubs/cocktail-ecml-01.pdf
# and https://www.cs.utexas.edu/users/ml/nldata.html 

import regex as re
from nltk.stem import SnowballStemmer
from urllib.request import urlopen
from contextlib import closing
from sklearn.model_selection import train_test_split

ss = SnowballStemmer('english')

# Location of the raw "jobs" corpus; every line holds one question together with its query.
JOBS_DATA_URL = 'ftp://ftp.cs.utexas.edu/pub/mooney/nl-ilp-data/jobsystem/jobqueries640'

inputs = []   # one list of stemmed question tokens per example
queries = []  # one list of query tokens per example, wrapped in <s> ... </s>

# Download the raw data and split every line into its question part and its query part
with closing(urlopen(JOBS_DATA_URL)) as r:
  for raw_line in r.readlines():
    line = raw_line.decode('utf-8').lower()
    # Only the first '],' separates question from query (maxsplit=1 keeps a later '],' from
    # breaking the unpacking). The names avoid rebinding the builtin `input` at notebook scope.
    question_part, query_part = line.split('],', 1)

    # Question side: drop the first 7 and the last 2 characters, split on commas,
    # then stem every word with nltk's Snowball stemmer.
    question_tokens = [ss.stem(word) for word in question_part[7:-2].split(',')]
    inputs.append(question_tokens)

    # Query side: strip surrounding '.' / newline characters, then tokenize into words
    # (optionally containing an apostrophe) and single punctuation characters.
    # https://stackoverflow.com/questions/43092970/tokenize-by-using-regular-expressions-parenthesis
    query_tokens = re.findall(r"\w+(?:'\w+)?|[^\w\s]", query_part.strip('.\n'))
    queries.append(["<s>"] + query_tokens + ["</s>"])

# Hold out 140 instances for testing; everything else is used for training.
# random_state is fixed so the split is the same on every run.
inputs_train, inputs_test, queries_train, queries_test = train_test_split(inputs, queries, test_size=140, random_state=8)

In [None]:
# Preview the first 5 training pairs: the stemmed question tokens, then the query tokens
for i in range(5):
  print(inputs_train[i], queries_train[i], sep="\n")

['what', 'job', 'are', 'there', 'use', 'tcl/tk']
['<s>', 'answer', '(', '_1973', ',', '(', 'job', '(', '_1973', ')', ',', 'language', '(', '_1973', ',', '_1990', ')', ',', 'const', '(', '_1990', ',', "'", 'tcl', '/', 'tk', "'", ')', ')', ')', ')', '</s>']
['show', 'me', 'the', 'job', 'use', 'c++', 'that', 'requir', 'a', 'bscs', 'but', 'desir', 'a', 'mscs']
['<s>', 'answer', '(', 'c', ',', '(', 'job', '(', 'c', ')', ',', 'language', '(', 'c', ',', 'l', ')', ',', 'const', '(', 'l', ',', "'", 'c', '+', '+', "'", ')', ',', 'req_deg', '(', 'c', ',', 'd', ')', ',', 'const', '(', 'd', ',', "'", 'bscs', "'", ')', ',', 'des_deg', '(', 'c', ',', 'e', ')', ',', 'const', '(', 'e', ',', "'", 'mscs', "'", ')', ')', ')', ')', '</s>']
['what', 'job', 'are', 'there', 'for', 'a', 'network', 'specialist']
['<s>', 'answer', '(', '_3359', ',', '(', 'job', '(', '_3359', ')', ',', 'area', '(', '_3359', ',', '_3378', ')', ',', 'const', '(', '_3378', ',', "'", 'networking', "'", ')', ')', ')', ')', '</s>']
['g

In [None]:
from collections import Counter

# A question word needs at least this many occurrences in the training set to get its own
# index; rarer words are looked up as <UNK> later on.
MIN_INPUT_TOKEN_COUNT = 2

# Token frequencies of the training questions. Kept under its own name so that `input_vocab`
# only ever refers to the final vocabulary list.
input_token_counts = Counter()
for tokens in inputs_train:
  input_token_counts.update(tokens)

# Build the input word2idx and idx2word tables
input_word2idx = {}
for word, count in input_token_counts.items():
  if count >= MIN_INPUT_TOKEN_COUNT:
    input_word2idx[word] = len(input_word2idx)
input_word2idx['<UNK>'] = len(input_word2idx) # index used for unknown words
input_word2idx['<PAD>'] = len(input_word2idx) # index used for padding
input_idx2word = {idx: word for word, idx in input_word2idx.items()} # inverse of word2idx

input_vocab = list(input_word2idx.keys())

# Build the query word2idx and idx2word tables.
# Every query token seen in training is kept (no frequency cut-off); <UNK> and <PAD> are
# registered with a count of 0 so that they receive indices as well.
query_vocab = Counter()
for tokens in queries_train:
  query_vocab.update(tokens)
query_vocab['<UNK>'] = 0
query_vocab['<PAD>'] = 0
query_idx2word = {idx: word for idx, word in enumerate(query_vocab.keys())}
query_word2idx = {word: idx for idx, word in query_idx2word.items()}

In [None]:
# Show the first 10 entries of every lookup table (input word2idx / idx2word, then query word2idx / idx2word)
for lookup in (input_word2idx, input_idx2word, query_word2idx, query_idx2word):
  print(list(lookup.items())[:10])

[('what', 0), ('job', 1), ('are', 2), ('there', 3), ('use', 4), ('show', 5), ('me', 6), ('the', 7), ('c++', 8), ('that', 9)]
[(0, 'what'), (1, 'job'), (2, 'are'), (3, 'there'), (4, 'use'), (5, 'show'), (6, 'me'), (7, 'the'), (8, 'c++'), (9, 'that')]
[('<s>', 0), ('answer', 1), ('(', 2), ('_1973', 3), (',', 4), ('job', 5), (')', 6), ('language', 7), ('_1990', 8), ('const', 9)]
[(0, '<s>'), (1, 'answer'), (2, '('), (3, '_1973'), (4, ','), (5, 'job'), (6, ')'), (7, 'language'), (8, '_1990'), (9, 'const')]


In [None]:
# Feature construction: replace every token by its vocabulary index, falling back to <UNK>
input_unk_idx = input_word2idx['<UNK>']
query_unk_idx = query_word2idx['<UNK>']

inputs_train_tokens = [[input_word2idx.get(tok, input_unk_idx) for tok in seq] for seq in inputs_train]
inputs_test_tokens = [[input_word2idx.get(tok, input_unk_idx) for tok in seq] for seq in inputs_test]

queries_train_tokens = [[query_word2idx.get(tok, query_unk_idx) for tok in seq] for seq in queries_train]
queries_test_tokens = [[query_word2idx.get(tok, query_unk_idx) for tok in seq] for seq in queries_test]

# Add paddings to each feature
def pad(input_seq, max_len, pad_token_idx):
  """Return a copy of `input_seq` that is exactly `max_len` items long.

  Sequences longer than `max_len` are cut off at `max_len`; shorter ones are
  filled up on the right with `pad_token_idx`. The argument itself is not modified.
  """
  clipped = input_seq[:max_len]
  n_missing = max_len - len(clipped)
  return clipped + [pad_token_idx] * n_missing

# Questions: the fixed length is the longest training question (pad() truncates anything longer)
inputs_max_target_len = max(map(len, inputs_train_tokens))
input_pad_idx = input_word2idx['<PAD>']
inputs_train_tokens = [pad(seq, inputs_max_target_len, input_pad_idx) for seq in inputs_train_tokens]
inputs_test_tokens = [pad(seq, inputs_max_target_len, input_pad_idx) for seq in inputs_test_tokens]

# Queries: the fixed length is 1.5x the longest training query, rounded down
queries_max_target_len = int(max(map(len, queries_train_tokens)) * 1.5)
query_pad_idx = query_word2idx['<PAD>']
queries_train_tokens = [pad(seq, queries_max_target_len, query_pad_idx) for seq in queries_train_tokens]
queries_test_tokens = [pad(seq, queries_max_target_len, query_pad_idx) for seq in queries_test_tokens]

In [None]:
# For each split print: the number of examples, the padded feature length, and the first example
for token_rows in (inputs_train_tokens, inputs_test_tokens, queries_train_tokens, queries_test_tokens):
  first_row = token_rows[0]
  print(len(token_rows), len(first_row), first_row)

501 22 [0, 1, 2, 3, 4, 227, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228]
140 22 [2, 3, 48, 1, 20, 227, 227, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228]
501 166 [0, 1, 2, 3, 4, 2, 5, 2, 3, 6, 4, 7, 2, 3, 4, 8, 6, 4, 9, 2, 8, 4, 10, 11, 12, 13, 10, 6, 6, 6, 6, 14, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502]
1

# Data Loading

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, default_collate

class JobsDataset(Dataset):
  """Pairs each input sequence with the query sequence stored at the same position."""

  def __init__(self, inputs, queries):
    # Both containers are stored as given (no copy); they are indexed in parallel.
    self.inputs, self.queries = inputs, queries

  def __len__(self):
    # The number of examples is taken from the inputs alone.
    return len(self.inputs)

  def __getitem__(self, idx):
    source, target = self.inputs[idx], self.queries[idx]
    return source, target

def build_datasets():
  """Wrap the padded index sequences of the notebook into a train and a test dataset.

  Reads the notebook-level lists `inputs_train_tokens`, `queries_train_tokens`,
  `inputs_test_tokens` and `queries_test_tokens`.

  Returns:
    A `(train_dataset, test_dataset)` pair of `JobsDataset` objects.
  """
  return (
    JobsDataset(inputs=inputs_train_tokens, queries=queries_train_tokens),
    JobsDataset(inputs=inputs_test_tokens, queries=queries_test_tokens),
  )

def collate(batch):
  """Turn a list of (input, query) examples into two stacked tensors.

  `default_collate` is applied to the batch first; the input side and the query side
  of its result are then each stacked along a new leading dimension. For the
  fixed-length index lists used in this notebook this gives tensors laid out as
  [sequence length, batch size].
  """
  src, tgt = map(torch.stack, default_collate(batch))
  return src, tgt

def build_dataloaders(dataset_train, dataset_test, train_batch_size):
  """Create the train and test dataloaders, both batching with `collate`.

  The train loader shuffles and uses `train_batch_size`; the test loader keeps the
  dataset order and yields one example per batch.

  Returns:
    A `(train_dataloader, test_dataloader)` pair.
  """
  shared_options = {'collate_fn': collate}
  train_loader = DataLoader(dataset_train, batch_size=train_batch_size, shuffle=True, **shared_options)
  test_loader = DataLoader(dataset_test, batch_size=1, shuffle=False, **shared_options)
  return train_loader, test_loader

In [None]:
# Build the datasets
jobs_train, jobs_test = build_datasets()
# Sanity check: padded (input, query) lengths of the first example of each split.
# The index is written out explicitly instead of relying on a loop variable left over
# from an earlier cell, so this cell does not depend on hidden kernel state.
print(len(jobs_train[0][0]), len(jobs_train[0][1]))
print(len(jobs_test[0][0]), len(jobs_test[0][1]))

# Build the dataloaders
train_batch_size = 128
dataloader_train, dataloader_test = build_dataloaders(jobs_train, jobs_test, train_batch_size)

22 166
22 166


# Todo: Define model

## Encoder Implementation

In [None]:
import torch.nn as nn
import random

class Encoder(nn.Module):
    """Embeds the input token indices and runs them through a multi-layer LSTM."""

    def __init__(self, input_dim: int, emb_dim: int, hid_dim: int, n_layers: int):
        super().__init__()

        # input_dim is the size of the input vocabulary
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers)

    def forward(self, enc_input: torch.LongTensor):
        '''
        Args:
            enc_input: token indices, [feature length, batch size]

        Returns:
            outputs: LSTM output at every position, [feature length, batch size, hid dim]
            hidden:  final hidden state,            [n layers, batch size, hid dim]
            cell:    final cell state,              [n layers, batch size, hid dim]
        '''
        embedded = self.embedding(enc_input) # [feature length, batch size, emb dim]
        outputs, final_state = self.lstm(embedded)
        hidden, cell = final_state

        return outputs, hidden, cell

In [None]:
# Run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyper-parameters used by the shape checks in the following cells
input_dim = len(input_vocab)
output_dim = len(query_vocab)
emb_dim, hidden_dim, num_layers = 128, 256, 2

# Pull a single batch out of dataloader_test
input, query = next(iter(dataloader_test))
print("input's shape = {}, query's shape = {}".format(input.shape, query.shape))

# Push that batch through a fresh encoder and look at the shapes that come back
encoder = Encoder(input_dim, emb_dim, hidden_dim, num_layers).to(device)
enc_outputs, hidden, cell = encoder(input.to(device))
print(encoder)
encoder_shapes = (enc_outputs.shape, hidden.shape, cell.shape)
print("enc_outputs's shape = {}, hidden's shape = {}, cell's shape = {}".format(*encoder_shapes))

input's shape = torch.Size([22, 1]), query's shape = torch.Size([166, 1])
Encoder(
  (embedding): Embedding(229, 128)
  (lstm): LSTM(128, 256, num_layers=2)
)
enc_outputs's shape = torch.Size([22, 1, 256]), hidden's shape = torch.Size([2, 1, 256]), cell's shape = torch.Size([2, 1, 256])


## Decoder Implementation

In [None]:
class Decoder(nn.Module):
    """Embeds query token indices and advances a multi-layer LSTM from a given state."""

    def __init__(self, output_dim: int, emb_dim: int, hid_dim: int, n_layers: int):
        super().__init__()

        # Kept as attributes so that other modules can read the sizes back.
        self.hid_dim, self.output_dim = hid_dim, output_dim

        # output_dim is the size of the query vocabulary
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers)

    def forward(self, dec_input: torch.LongTensor, hidden: torch.FloatTensor, cell: torch.FloatTensor):
        '''
        Args:
            dec_input: token indices,          [feature length, batch size]
            hidden:    LSTM hidden state to start from, [n layers, batch size, hid dim]
            cell:      LSTM cell state to start from,   [n layers, batch size, hid dim]

        Returns:
            outputs: LSTM output at every position, [feature length, batch size, hid dim]
            hidden:  updated hidden state,          [n layers, batch size, hid dim]
            cell:    updated cell state,            [n layers, batch size, hid dim]
        '''
        embedded = self.embedding(dec_input) # [feature length, batch size, emb dim]
        outputs, next_state = self.lstm(embedded, (hidden, cell))
        hidden, cell = next_state

        return outputs, hidden, cell


In [None]:
# Feed the whole query batch through a fresh decoder, starting from the current
# `hidden` / `cell` state, and look at the shapes that come back
decoder = Decoder(output_dim, emb_dim, hidden_dim, num_layers).to(device)
dec_outputs, hidden, cell = decoder(query.to(device), hidden, cell)
print(decoder)
decoder_shapes = (dec_outputs.shape, hidden.shape, cell.shape)
print("dec_outputs's shape = {}, hidden's shape = {}, cell's shape = {}".format(*decoder_shapes))

Decoder(
  (embedding): Embedding(503, 128)
  (lstm): LSTM(128, 256, num_layers=2)
)
dec_outputs's shape = torch.Size([166, 1, 256]), hidden's shape = torch.Size([2, 1, 256]), cell's shape = torch.Size([2, 1, 256])


In [None]:
# Dot-product (Luong-style) attention followed by the output projection
class Attention(nn.Module):
    """Attends over the encoder outputs and maps the result to query-vocabulary logits.

    The top-layer decoder hidden state is scored against every encoder output with a
    dot product, the scores are normalised over the source positions, and the resulting
    context vector is combined with the decoder outputs:

        output = w0(tanh(w1(dec_outputs) + w2(context)))
    """

    def __init__(self, hid_dim: int, output_dim: int):
        super().__init__()

        self.w0 = nn.Linear(hid_dim, output_dim) # projects onto the query vocabulary
        self.w1 = nn.Linear(hid_dim, hid_dim)    # applied to the decoder outputs
        self.w2 = nn.Linear(hid_dim, hid_dim)    # applied to the context vector
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=0)         # normalises over the source positions

    def forward(self, hidden: torch.FloatTensor, enc_outputs: torch.FloatTensor, dec_outputs: torch.FloatTensor):
        '''
        Args:
            hidden:      decoder hidden state, [n layers, batch size, hid dim]
            enc_outputs: encoder outputs,      [input length, batch size, hid dim]
            dec_outputs: decoder outputs,      [query length, batch size, hid dim]

        Returns:
            Logits over the query vocabulary,  [query length, batch size, output dim]
        '''
        # Current hidden state of the decoder's top layer
        h_l_t = hidden[-1] # [batch size, hid dim]

        # One scalar score per source position: the dot product h_k . h_t.
        # The element-wise product has to be summed over the hidden dimension; feeding the
        # un-summed product to the softmax would normalise every hidden feature separately
        # instead of producing a single weight per encoder position.
        energy = torch.sum(enc_outputs * h_l_t, dim=-1, keepdim=True) # [input length, batch size, 1]
        score = self.softmax(energy)

        # Context vector: attention-weighted sum of the encoder outputs
        context_vec = torch.sum(score * enc_outputs, dim=0) # [batch size, hid dim]
        output = self.w0(self.tanh(self.w1(dec_outputs) + self.w2(context_vec)))

        return output

In [None]:
# Run the attention layer on the states produced by the checks above and show its output shape
att_layer = Attention(hidden_dim, output_dim).to(device)
att_output = att_layer(hidden, enc_outputs, dec_outputs)
print(att_layer, att_output.shape, sep="\n")

Attention(
  (w0): Linear(in_features=256, out_features=503, bias=True)
  (w1): Linear(in_features=256, out_features=256, bias=True)
  (w2): Linear(in_features=256, out_features=256, bias=True)
  (tanh): Tanh()
  (softmax): Softmax(dim=0)
)
torch.Size([166, 1, 503])


In [None]:
class Seq2Seq(nn.Module):
    """Encoder, decoder and attention layer combined into one step-by-step decoding loop."""

    def __init__(self, encoder: Encoder, decoder: Decoder, attention: Attention, device: torch.device):
        super().__init__()
        self.encoder, self.decoder, self.attention = encoder, decoder, attention
        self.device = device

    def forward(self, input_batch: torch.LongTensor, query_batch: torch.LongTensor, teacher_forcing_ratio: float=1.0):
        '''
        Args:
            input_batch: input token indices, [input length, batch size]
            query_batch: query token indices, [query length, batch size]; row 0 is fed to
                         the decoder as its first input
            teacher_forcing_ratio: probability, drawn independently at every step, of feeding
                         the true token rather than the model's own prediction as next input

        Returns:
            Logits of shape [query length, batch size, query vocab size]; row 0 is never
            written and stays all zeros.
        '''
        n_steps, n_examples = query_batch.shape
        vocab_size = self.decoder.output_dim

        # Buffer that collects the logits of every decoding step
        outputs = torch.zeros(n_steps, n_examples, vocab_size, device=self.device)

        # The encoder's final hidden and cell state initialise the decoder
        enc_outputs, hidden, cell = self.encoder(input_batch)

        # Decode one position at a time, starting from row 0 of the query batch
        step_input = query_batch[0].unsqueeze(0)
        for step in range(1, n_steps):
            dec_outputs, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[step] = self.attention(hidden, enc_outputs, dec_outputs)

            # Pick the next decoder input: the true token (teacher forcing) or the best prediction
            predicted = torch.argmax(outputs[step], dim=-1)
            use_truth = random.random() < teacher_forcing_ratio
            next_tokens = query_batch[step] if use_truth else predicted
            step_input = next_tokens.unsqueeze(0)

        return outputs


In [None]:
# Assemble the full model from the parts built above and run the test batch through it
seq2seq = Seq2Seq(encoder, decoder, att_layer, device).to(device)
outputs = seq2seq(input.to(device), query.to(device))
print(seq2seq)
print("outputs's shape = {}".format(outputs.shape))

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(229, 128)
    (lstm): LSTM(128, 256, num_layers=2)
  )
  (decoder): Decoder(
    (embedding): Embedding(503, 128)
    (lstm): LSTM(128, 256, num_layers=2)
  )
  (attention): Attention(
    (w0): Linear(in_features=256, out_features=503, bias=True)
    (w1): Linear(in_features=256, out_features=256, bias=True)
    (w2): Linear(in_features=256, out_features=256, bias=True)
    (tanh): Tanh()
    (softmax): Softmax(dim=0)
  )
)
outputs's shape = torch.Size([166, 1, 503])


In [None]:
def create_model(input_dim, output_dim, device, emb_dim=128, hidden_dim=256, num_layers=2):
    """Build a freshly initialised Seq2Seq model on `device`.

    Args:
        input_dim: size of the input vocabulary (rows of the encoder embedding).
        output_dim: size of the query vocabulary (rows of the decoder embedding and
            width of the attention layer's output).
        device: device every sub-module is moved to.
        emb_dim: embedding size shared by encoder and decoder.
        hidden_dim: LSTM hidden size shared by encoder, decoder and attention.
        num_layers: number of stacked LSTM layers in encoder and decoder.

    Returns:
        The assembled `Seq2Seq` module.
    """
    # Construction order is kept as attention, encoder, decoder so that the random
    # initialisation under a fixed seed does not change.
    attention = Attention(hidden_dim, output_dim).to(device)
    encoder = Encoder(input_dim, emb_dim, hidden_dim, num_layers).to(device)
    decoder = Decoder(output_dim, emb_dim, hidden_dim, num_layers).to(device)
    seq2seq = Seq2Seq(encoder, decoder, attention, device).to(device)

    return seq2seq

# Todo: Training and testing loops

In [None]:
# Indices of the special query tokens: start of sentence, end of sentence, padding
QUERY_SOS_INDEX, QUERY_EOS_INDEX, QUERY_PAD_INDEX = (
    query_word2idx[token] for token in ('<s>', '</s>', '<PAD>')
)

print(QUERY_SOS_INDEX, QUERY_EOS_INDEX, QUERY_PAD_INDEX)

0 14 502


In [None]:
from timeit import default_timer as timer
def train(model, train_dataloader, num_epochs, device="cuda", lr=0.003):
    """Train `model` with Adam and cross-entropy loss, printing one summary line per epoch.

    Args:
        model: module called as `model(input_batch, query_batch)` that returns logits of
            shape [query length, batch size, vocab size].
        train_dataloader: iterable of `(input_batch, query_batch)` tensor pairs.
        num_epochs: number of passes over `train_dataloader`.
        device: device the batches and the loss module are moved to. The model itself is
            not moved here; it is expected to live on `device` already.
        lr: learning rate passed to Adam.

    Returns:
        The same `model` object, trained in place.
    """
    # Padding positions of the target do not contribute to the loss.
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=QUERY_PAD_INDEX).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(num_epochs):
        start_time = timer()

        model.train()
        epoch_loss = 0.0
        num_batches = 0

        # Iterate over the batches
        for input_batch, query_batch in train_dataloader:
            # Move the data to the requested device
            input_batch, query_batch = input_batch.to(device), query_batch.to(device)

            # Clear the gradients of the previous step
            optimizer.zero_grad()

            # Forward pass
            logits = model(input_batch, query_batch)

            # Row 0 is skipped on both sides: it is the start token, which the model never predicts.
            loss = loss_fn(logits[1:].reshape(-1, logits.shape[-1]), query_batch[1:].reshape(-1))

            # Backward pass and parameter update
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        end_time = timer()

        # Mean loss per batch; an empty dataloader reports NaN instead of dividing by zero.
        train_loss = epoch_loss / num_batches if num_batches else float("nan")
        print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Epoch time = {(end_time - start_time):.3f}s")

    return model


In [None]:
def evaluate(model, dataloader, device="cuda"):
  """Compute token-level accuracy of `model` when decoding without teacher forcing.

  Every example is compared position by position, starting after the start token and
  stopping at the first `</s>` of the true query, so neither `</s>` nor the padding
  after it is scored. The predictions and true queries of the first 10 examples are
  printed. Batches of any size are supported.

  Args:
    model: module called as `model(input_batch, query_batch, teacher_forcing_ratio=0)`
      that returns logits of shape [query length, batch size, vocab size].
    dataloader: iterable of `(input_batch, query_batch)` tensor pairs.
    device: device the batches are moved to.

  Returns:
    Fraction of scored positions predicted correctly, or 0.0 when nothing was scored.
  """
  model.eval()
  cnt, total = 0, 0
  max_printed = 10  # how many examples are echoed for inspection
  num_seen = 0
  with torch.no_grad():
    for input_batch, query_batch in dataloader:
      input_batch, query_batch = input_batch.to(device), query_batch.to(device)
      logits = model(input_batch, query_batch, teacher_forcing_ratio=0)
      pred = torch.argmax(logits, dim=-1) # most likely token per position, [query length, batch size]

      # Drop row 0 (start token) and switch to one row per example; tolist() copies each
      # tensor to Python in one go rather than once per token.
      pred_rows = pred[1:].t().tolist()
      true_rows = query_batch[1:].t().tolist()

      for pred_row, true_row in zip(pred_rows, true_rows):
        preds = []
        truth = []
        for p, q in zip(pred_row, true_row):
          if q == QUERY_EOS_INDEX: # no need to count </s> or the paddings after it
            break

          if p == q:
            cnt += 1

          total += 1

          preds.append(query_idx2word[p])
          truth.append(query_idx2word[q])

        num_seen += 1
        if num_seen <= max_printed:
          print("Predictions  ", " ".join(preds))
          print("True Queries ", " ".join(truth))

  # Guard against an empty dataloader (or queries that hold nothing before </s>).
  acc = cnt / total if total else 0.0
  return acc

# Run this!

Your outputs should look something like this (not exactly the same numbers, just in a similar ballpark and format).

```
Epoch: 1, Train loss: 4.590
Epoch: 2, Train loss: 1.871
Epoch: 3, Train loss: 1.424
...
Test Accuracy: 0.5195115804672241
```



In [None]:
def main():
    """Run the whole pipeline: build the data, train for 15 epochs, report test accuracy.

    Returns:
        The trained model.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Data
    train_set, test_set = build_datasets()
    train_loader, test_loader = build_dataloaders(train_set, test_set, train_batch_size=64)

    # Model and training
    untrained = create_model(input_dim = len(input_vocab), output_dim = len(query_vocab), device=device)
    model = train(untrained, train_loader, num_epochs=15, device=device)

    # Evaluation on the held-out split
    test_accuracy = evaluate(model, test_loader, device=device)
    print(f'Test Accuracy: {test_accuracy}')
    return model

m = main()


Epoch: 0, Train loss: 4.059, Epoch time = 5.462s
Epoch: 1, Train loss: 2.982, Epoch time = 2.916s
Epoch: 2, Train loss: 2.856, Epoch time = 2.817s
Epoch: 3, Train loss: 2.698, Epoch time = 2.801s
Epoch: 4, Train loss: 2.404, Epoch time = 2.811s
Epoch: 5, Train loss: 2.086, Epoch time = 2.793s
Epoch: 6, Train loss: 1.807, Epoch time = 2.782s
Epoch: 7, Train loss: 1.584, Epoch time = 2.776s
Epoch: 8, Train loss: 1.422, Epoch time = 2.818s
Epoch: 9, Train loss: 1.314, Epoch time = 2.759s
Epoch: 10, Train loss: 1.227, Epoch time = 2.724s
Epoch: 11, Train loss: 1.157, Epoch time = 2.739s
Epoch: 12, Train loss: 1.099, Epoch time = 2.804s
Epoch: 13, Train loss: 1.043, Epoch time = 2.845s
Epoch: 14, Train loss: 0.999, Epoch time = 2.769s
Predictions   answer ( a , ( job ( a ) , loc ( a , c ) , const ( c , ' bscs ' ) ) ) ) </s> (
True Queries  answer ( <UNK> , ( job ( <UNK> ) , area ( <UNK> , <UNK> ) , const ( <UNK> , ' tcp / ip ' ) ) ) )
Predictions   answer ( a , ( job ( a ) , loc ( a , c ) ,


Google Drive link for the file: [**link**](https://colab.research.google.com/drive/1MTy_Yxjbwxil7rAentYmy3G6eLUT3m9J?usp=share_link)

Video link: [**link**](https://drive.google.com/file/d/1vqQFgroQEtV3l_qL1ugmV8fSEI8HwcGg/view?usp=share_link)