# Data Pre-processing

In [1]:
# data from https://www.cs.utexas.edu/~ai-lab/pubs/cocktail-ecml-01.pdf
# and https://www.cs.utexas.edu/users/ml/nldata.html 

import regex as re
from nltk.stem import SnowballStemmer
from urllib.request import urlopen
from contextlib import closing
from sklearn.model_selection import train_test_split

ss = SnowballStemmer('english')

inputs = []   # one list of stemmed question tokens per example
queries = []  # one list of logical-form tokens per example, wrapped in <s> ... </s>

# Extract the raw data from the URL
with closing(urlopen('ftp://ftp.cs.utexas.edu/pub/mooney/nl-ilp-data/jobsystem/jobqueries640')) as r:
  for raw_line in r.readlines():
    line = raw_line.decode('utf-8').lower()
    if not line.strip():
      # Tolerate blank/trailing lines instead of failing on the unpacking below
      continue

    # Split only at the FIRST '],' (the end of the word list) so that a '],'
    # occurring inside the logical form cannot break the two-way unpacking.
    # `sentence` is used instead of `input` to avoid shadowing the builtin.
    sentence, query = line.split('],', 1)

    # Parse the question: drop the 7-character prefix and the last 2 characters
    # (presumably `parse([` and the trailing punctuation token -- verify against
    # the raw file), split on commas and stem every word with nltk.
    # No <s>/</s> markers are added on the input side.
    words = sentence[7:-2].split(',')
    inputs.append([ss.stem(word) for word in words])

    # Parse the query: strip the trailing '.'/newline, tokenize into
    # words and single punctuation characters, then add sentence markers.
    query = query.strip('.\n')
    # https://stackoverflow.com/questions/43092970/tokenize-by-using-regular-expressions-parenthesis
    query_tokens = re.findall(r"\w+(?:'\w+)?|[^\w\s]", query)
    queries.append(["<s>"] + query_tokens + ["</s>"])

# Hold out 140 instances for testing; all remaining instances are used for training
inputs_train, inputs_test, queries_train, queries_test = train_test_split(inputs, queries, test_size=140, random_state=8)

In [2]:
# Show the first 5 training pairs: stemmed question tokens, then query tokens.
# The loop variable stays named `i`; as a notebook global it outlives this cell.
for i in range(5):
  print(inputs_train[i], queries_train[i], sep='\n')

['what', 'job', 'are', 'there', 'use', 'tcl/tk']
['<s>', 'answer', '(', '_1973', ',', '(', 'job', '(', '_1973', ')', ',', 'language', '(', '_1973', ',', '_1990', ')', ',', 'const', '(', '_1990', ',', "'", 'tcl', '/', 'tk', "'", ')', ')', ')', ')', '</s>']
['show', 'me', 'the', 'job', 'use', 'c++', 'that', 'requir', 'a', 'bscs', 'but', 'desir', 'a', 'mscs']
['<s>', 'answer', '(', 'c', ',', '(', 'job', '(', 'c', ')', ',', 'language', '(', 'c', ',', 'l', ')', ',', 'const', '(', 'l', ',', "'", 'c', '+', '+', "'", ')', ',', 'req_deg', '(', 'c', ',', 'd', ')', ',', 'const', '(', 'd', ',', "'", 'bscs', "'", ')', ',', 'des_deg', '(', 'c', ',', 'e', ')', ',', 'const', '(', 'e', ',', "'", 'mscs', "'", ')', ')', ')', ')', '</s>']
['what', 'job', 'are', 'there', 'for', 'a', 'network', 'specialist']
['<s>', 'answer', '(', '_3359', ',', '(', 'job', '(', '_3359', ')', ',', 'area', '(', '_3359', ',', '_3378', ')', ',', 'const', '(', '_3378', ',', "'", 'networking', "'", ')', ')', ')', ')', '</s>']
['g

In [3]:
from collections import Counter

# Frequency of every stemmed token in the training questions
input_counts = Counter(token for sentence in inputs_train for token in sentence)

# Input word2idx / idx2word: only tokens seen at least twice get their own index,
# numbered in first-seen order; the two special tokens take the last two indices.
input_word2idx = {}
for token, count in input_counts.items():
  if count < 2:
    continue
  input_word2idx[token] = len(input_word2idx)
for special in ('<UNK>', '<PAD>'):  # unknown-word index first, then padding index
  input_word2idx[special] = len(input_word2idx)
input_idx2word = {idx: token for token, idx in input_word2idx.items()}

input_vocab = list(input_word2idx)

# Query word2idx / idx2word: every training query token is kept (no frequency cut-off)
query_vocab = Counter(token for logical_form in queries_train for token in logical_form)
for special in ('<UNK>', '<PAD>'):
  query_vocab[special] = 0
query_idx2word = dict(enumerate(query_vocab))
query_word2idx = {token: idx for idx, token in query_idx2word.items()}

In [4]:
# Peek at the first 10 entries of each lookup table (input tables first, then query tables)
for mapping in (input_word2idx, input_idx2word, query_word2idx, query_idx2word):
  print(list(mapping.items())[:10])

[('what', 0), ('job', 1), ('are', 2), ('there', 3), ('use', 4), ('show', 5), ('me', 6), ('the', 7), ('c++', 8), ('that', 9)]
[(0, 'what'), (1, 'job'), (2, 'are'), (3, 'there'), (4, 'use'), (5, 'show'), (6, 'me'), (7, 'the'), (8, 'c++'), (9, 'that')]
[('<s>', 0), ('answer', 1), ('(', 2), ('_1973', 3), (',', 4), ('job', 5), (')', 6), ('language', 7), ('_1990', 8), ('const', 9)]
[(0, '<s>'), (1, 'answer'), (2, '('), (3, '_1973'), (4, ','), (5, 'job'), (6, ')'), (7, 'language'), (8, '_1990'), (9, 'const')]


In [5]:
# Feature Construction
def _encode(sentences, word2idx):
  """Map every token to its index, falling back to the '<UNK>' index for unseen tokens."""
  unk_idx = word2idx['<UNK>']
  return [[word2idx.get(token, unk_idx) for token in sentence] for sentence in sentences]

inputs_train_tokens = _encode(inputs_train, input_word2idx)
inputs_test_tokens = _encode(inputs_test, input_word2idx)

queries_train_tokens = _encode(queries_train, query_word2idx)
queries_test_tokens = _encode(queries_test, query_word2idx)

# Add paddings to each feature
def pad(input_seq, max_len, pad_token_idx):
  """Return a copy of `input_seq` that is exactly `max_len` items long.

  Sequences longer than `max_len` are truncated on the right; shorter ones
  are extended on the right with `pad_token_idx`.
  """
  truncated = input_seq[:max_len]
  n_missing = max_len - len(truncated)
  return truncated + [pad_token_idx] * n_missing

# Questions: pad (or truncate) both splits to the longest TRAINING question
inputs_max_target_len = max(map(len, inputs_train_tokens))
input_pad_idx = input_word2idx['<PAD>']
inputs_train_tokens = [pad(seq, inputs_max_target_len, input_pad_idx) for seq in inputs_train_tokens]
inputs_test_tokens = [pad(seq, inputs_max_target_len, input_pad_idx) for seq in inputs_test_tokens]

# Queries: allow 1.5x the longest TRAINING query before truncation kicks in
queries_max_target_len = int(1.5 * max(map(len, queries_train_tokens)))
query_pad_idx = query_word2idx['<PAD>']
queries_train_tokens = [pad(seq, queries_max_target_len, query_pad_idx) for seq in queries_train_tokens]
queries_test_tokens = [pad(seq, queries_max_target_len, query_pad_idx) for seq in queries_test_tokens]

In [6]:
# For each padded split print: number of examples, padded length, first example
for token_rows in (inputs_train_tokens, inputs_test_tokens, queries_train_tokens, queries_test_tokens):
  first_row = token_rows[0]
  print(len(token_rows), len(first_row), first_row)

501 22 [0, 1, 2, 3, 4, 227, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228]
140 22 [2, 3, 48, 1, 20, 227, 227, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228]
501 166 [0, 1, 2, 3, 4, 2, 5, 2, 3, 6, 4, 7, 2, 3, 4, 8, 6, 4, 9, 2, 8, 4, 10, 11, 12, 13, 10, 6, 6, 6, 6, 14, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502]
1

# Data Loading

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader, default_collate

class JobsDataset(Dataset):
  """Pairs the i-th item of `inputs` with the i-th item of `queries`.

  Both arguments are stored as given; the dataset length is taken from `inputs`.
  """

  def __init__(self, inputs, queries):
    self.inputs, self.queries = inputs, queries

  def __len__(self):
    n_examples = len(self.inputs)
    return n_examples

  def __getitem__(self, idx):
    # Returned as an (input, query) tuple
    source, target = self.inputs[idx], self.queries[idx]
    return source, target

def build_datasets():
  """Wrap the padded train/test token lists (notebook globals) in `JobsDataset`s.

  Returns:
    (train dataset, test dataset)
  """
  splits = (
      (inputs_train_tokens, queries_train_tokens),
      (inputs_test_tokens, queries_test_tokens),
  )
  jobs_train, jobs_test = [JobsDataset(inputs=src, queries=tgt) for src, tgt in splits]
  return jobs_train, jobs_test

def collate(batch):
  """Collate (input, query) samples into two stacked tensors.

  For samples made of equal-length lists of ints, `default_collate` yields one
  tensor per sequence position; stacking those gives sequence-first tensors
  (position along dim 0, batch along dim 1).
  """
  src_columns, tgt_columns = default_collate(batch)
  stacked_src = torch.stack(src_columns)
  stacked_tgt = torch.stack(tgt_columns)
  return stacked_src, stacked_tgt

def build_dataloaders(dataset_train, dataset_test, train_batch_size):
  """Create the train and test `DataLoader`s, both using `collate`.

  The training loader shuffles and uses `train_batch_size`; the test loader
  keeps the dataset order and yields one example per batch.

  Returns:
    (train dataloader, test dataloader)
  """
  train_loader = DataLoader(
      dataset_train,
      batch_size=train_batch_size,
      shuffle=True,
      collate_fn=collate,
  )
  test_loader = DataLoader(
      dataset_test,
      batch_size=1,
      shuffle=False,
      collate_fn=collate,
  )
  return train_loader, test_loader

In [8]:
# Build the datasets and sanity-check the (input length, query length) of the
# first example in each split. The index is written explicitly: the previous
# version read `i`, a loop variable left over from an earlier preview cell,
# which only works when that cell has been run first.
jobs_train, jobs_test = build_datasets()
print(len(jobs_train[0][0]), len(jobs_train[0][1]))
print(len(jobs_test[0][0]), len(jobs_test[0][1]))

# Build the dataloaders
train_batch_size = 128
dataloader_train, dataloader_test = build_dataloaders(jobs_train, jobs_test, train_batch_size)

22 166
22 166


# Todo: Define model

In [9]:
import torch.nn as nn
import random

class Encoder(nn.Module):
    """Embedding layer followed by a multi-layer LSTM over the source tokens.

    Args:
        input_dim: size of the input (question) vocabulary.
        emb_dim: dimension of the token embeddings.
        hid_dim: dimension of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout passed to the LSTM.
    """

    def __init__(self, input_dim: int, emb_dim: int, hid_dim: int, n_layers: int, dropout: float):
        super().__init__()
        # Hyper-parameters are kept as attributes for later inspection
        self.input_dim, self.emb_dim, self.hid_dim = input_dim, emb_dim, hid_dim
        self.n_layers, self.dropout = n_layers, dropout

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )

    def forward(self, enc_input: torch.LongTensor):
        """Encode a sequence-first batch of token ids.

        Args:
            enc_input: token ids of shape [feature length, batch size].

        Returns:
            outputs: [feature length, batch size, hidden dim]
            hidden:  [n layers, batch size, hidden dim] (final hidden state)
            cell:    [n layers, batch size, hidden dim] (final cell state)
        """
        embedded = self.embedding(enc_input)  # [feature length, batch size, emb dim]
        outputs, final_state = self.lstm(embedded)
        hidden, cell = final_state
        return outputs, hidden, cell

In [10]:
# Smoke test for the encoder: run a single test batch through a fresh Encoder
# and print the shapes that come out.
#
# NOTE: the names bound in this cell (`device`, `input_dim`, `output_dim`,
# `emb_dim`, `hidden_dim`, `num_layers`, `dropout_ratio`, `input`, `query`,
# `encoder`, `hidden`, `cell`) are notebook globals that later cells read,
# so they must not be renamed here. `input` shadows the Python builtin.

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pre-define parameters
# Vocabulary sizes include the special <UNK>/<PAD> entries added during pre-processing
input_dim, output_dim = len(input_vocab), len(query_vocab)
emb_dim = 128
hidden_dim = 256
num_layers = 2
dropout_ratio = 0.1

# Extract one batch from the dataloader_test (built with batch_size=1)
input, query = next(iter(dataloader_test))
print("input's shape = {}, query's shape = {}".format(input.shape, query.shape))

# Check the dimension of encoder's output
encoder = Encoder(input_dim, emb_dim, hidden_dim, num_layers, dropout_ratio).to(device)
# The per-step outputs are discarded; only the final hidden/cell states are inspected
_, hidden, cell = encoder(input.to(device))
print(encoder)
print("hidden's shape = {}, cell's shape = {}".format(hidden.shape, cell.shape))

input's shape = torch.Size([22, 1]), query's shape = torch.Size([166, 1])
Encoder(
  (embedding): Embedding(229, 128)
  (lstm): LSTM(128, 256, num_layers=2, dropout=0.1)
)
hidden's shape = torch.Size([2, 1, 256]), cell's shape = torch.Size([2, 1, 256])


In [11]:
class Decoder(nn.Module):
    """Embedding -> multi-layer LSTM -> linear projection onto the query vocabulary.

    Args:
        output_dim: size of the query vocabulary.
        emb_dim: dimension of the token embeddings.
        hid_dim: dimension of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout passed to the LSTM.
    """

    def __init__(self, output_dim: int, emb_dim: int, hid_dim: int, n_layers: int, dropout: float):
        super().__init__()
        # Hyper-parameters are kept as attributes; Seq2Seq reads `output_dim`
        self.output_dim, self.emb_dim, self.hid_dim = output_dim, emb_dim, hid_dim
        self.n_layers, self.dropout = n_layers, dropout

        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.out = nn.Linear(in_features=hid_dim, out_features=output_dim)

    def forward(self, dec_input: torch.LongTensor, hidden: torch.FloatTensor, cell: torch.FloatTensor):
        """Run the decoder from the given LSTM state.

        Args:
            dec_input: token ids of shape [seq len, batch size]; seq len is 1
                when decoding one step at a time.
            hidden: [n layers, batch size, hidden dim]
            cell:   [n layers, batch size, hidden dim]

        Returns:
            prediction: vocabulary scores. The leading dimension is squeezed
                away only when seq len is 1, giving [batch size, output dim];
                otherwise the shape is [seq len, batch size, output dim].
            hidden, cell: updated LSTM state, same shapes as the inputs.
        """
        embedded = self.embedding(dec_input)  # [seq len, batch size, emb dim]
        lstm_out, next_state = self.lstm(embedded, (hidden, cell))
        hidden, cell = next_state
        prediction = self.out(torch.squeeze(lstm_out, 0))
        return prediction, hidden, cell


In [12]:
# Smoke-test the decoder: feed it the whole test query in one call, starting
# from the encoder state, then print the module and the resulting shapes.
decoder = Decoder(
    output_dim, emb_dim, hidden_dim, num_layers, dropout_ratio
).to(device)
decoder_result = decoder(query.to(device), hidden, cell)
prediction, hidden, cell = decoder_result  # `hidden`/`cell` are rebound to the decoder's state
print(decoder)
shape_report = "prediction's shape = {}, hidden's shape = {}, cell's shape = {}"
print(shape_report.format(prediction.shape, hidden.shape, cell.shape))

Decoder(
  (embedding): Embedding(503, 128)
  (lstm): LSTM(128, 256, num_layers=2, dropout=0.1)
  (out): Linear(in_features=256, out_features=503, bias=True)
)
prediction's shape = torch.Size([166, 1, 503]), hidden's shape = torch.Size([2, 1, 256]), cell's shape = torch.Size([2, 1, 256])


In [13]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one token at a time."""

    def __init__(self, encoder: Encoder, decoder: Decoder, device: torch.device):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder
        self.device = device

    def forward(self, input_batch: torch.LongTensor, query_batch: torch.LongTensor, teacher_forcing_ratio: float=1.0):
        """Produce vocabulary scores for every target position.

        Args:
            input_batch: source token ids, passed to the encoder as-is.
            query_batch: target token ids of shape [query len, batch size];
                row 0 holds the start-of-sentence tokens.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the gold token instead of the model's
                own argmax prediction as the next decoder input.

        Returns:
            Tensor of shape [query len, batch size, query vocab size].
            Position 0 is never predicted and stays all zeros.
        """
        n_steps, batch_size = query_batch.shape

        # Buffer for the per-step decoder scores
        outputs = torch.zeros(n_steps, batch_size, self.decoder.output_dim, device=self.device)

        # The encoder's final hidden/cell state seeds the decoder
        _, hidden, cell = self.encoder(input_batch)

        # First decoder input: the <s> row of the target
        dec_input = query_batch[0].unsqueeze(0)
        for step in range(1, n_steps):
            scores, hidden, cell = self.decoder(dec_input, hidden, cell)
            outputs[step] = scores

            # Teacher forcing: gold token with the given probability, else the greedy prediction
            greedy_token = scores.argmax(1)
            use_gold = random.random() < teacher_forcing_ratio
            next_token = query_batch[step] if use_gold else greedy_token
            dec_input = next_token.unsqueeze(0)

        return outputs


In [14]:
# Smoke-test the full model on the single test batch fetched earlier
seq2seq = Seq2Seq(encoder=encoder, decoder=decoder, device=device)
seq2seq = seq2seq.to(device)
outputs = seq2seq(input.to(device), query.to(device))
print(seq2seq, outputs.shape, sep='\n')

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(229, 128)
    (lstm): LSTM(128, 256, num_layers=2, dropout=0.1)
  )
  (decoder): Decoder(
    (embedding): Embedding(503, 128)
    (lstm): LSTM(128, 256, num_layers=2, dropout=0.1)
    (out): Linear(in_features=256, out_features=503, bias=True)
  )
)
torch.Size([166, 1, 503])


In [15]:
def create_model(input_dim, output_dim, device, emb_dim=128, hidden_dim=256, num_layers=2, dropout_ratio=0.1):
  """Build an Encoder/Decoder pair, wrap them in a Seq2Seq and move it to `device`.

  Args:
    input_dim: size of the input (question) vocabulary.
    output_dim: size of the query vocabulary.
    device: device the model is moved to (also stored on the Seq2Seq module).
    emb_dim: embedding dimension shared by encoder and decoder.
    hidden_dim: LSTM hidden/cell dimension shared by encoder and decoder.
    num_layers: number of LSTM layers in both encoder and decoder.
    dropout_ratio: dropout passed to both LSTMs.

  The keyword defaults are the values that used to be hard-coded in the body,
  so `create_model(input_dim, output_dim, device)` builds the same model as before.

  Returns:
    The Seq2Seq model on `device`.
  """
  # Encoder and decoder share hidden_dim/num_layers because the decoder is
  # initialised directly from the encoder's final LSTM state.
  encoder = Encoder(input_dim, emb_dim, hidden_dim, num_layers, dropout_ratio)
  decoder = Decoder(output_dim, emb_dim, hidden_dim, num_layers, dropout_ratio)
  seq2seq = Seq2Seq(encoder, decoder, device).to(device)

  return seq2seq

# Todo: Training and testing loops

In [16]:
# Indices of the special query tokens: start-of-sentence, end-of-sentence, padding
QUERY_SOS_INDEX, QUERY_EOS_INDEX, QUERY_PAD_INDEX = [
    query_word2idx[token] for token in ('<s>', '</s>', '<PAD>')
]

print(QUERY_SOS_INDEX, QUERY_EOS_INDEX, QUERY_PAD_INDEX)

0 14 502


In [26]:
from timeit import default_timer as timer
def train(model, train_dataloader, num_epochs, device="cuda"):
  """Re-initialise `model` and train it with Adam on padded (input, query) batches.

  Args:
    model: seq2seq module called as `model(input, query)`; it must return
      scores of shape [query len, batch size, vocab size].
    train_dataloader: yields (input, query) tensor pairs.
    num_epochs: number of passes over `train_dataloader`.
    device: device the batches are moved to (the model must already be there).

  Returns:
    The trained model (the same object that was passed in).

  The loss is cross-entropy over all target positions after the first one,
  ignoring positions whose gold token is QUERY_PAD_INDEX. The model is called
  without a teacher-forcing argument, so its own default applies.
  """
  loss_fn = torch.nn.CrossEntropyLoss(ignore_index=QUERY_PAD_INDEX)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

  # Initialise every parameter exactly once from N(0, 0.01^2).
  # The previous `model.apply(init_weights)` visited every sub-module and, in
  # each, iterated the recursive `named_parameters()`, so each parameter was
  # redundantly re-drawn once per ancestor module. The final distribution is
  # the same; only the wasted work is removed.
  for param in model.parameters():
    nn.init.normal_(param, mean=0, std=0.01)

  num_batches = len(train_dataloader)

  # Training loop
  for epoch in range(num_epochs):
    start_time = timer()

    model.train()
    epoch_loss = 0.0

    # `src`/`tgt` instead of `input`/`query` so the builtin `input` is not shadowed
    for src, tgt in train_dataloader:
      src, tgt = src.to(device), tgt.to(device)

      optimizer.zero_grad()

      # [query len, batch size, vocab size]
      logits = model(src, tgt)

      # Position 0 is the <s> token and is never predicted, so it is skipped.
      # `reshape` (rather than `view`) also works if a slice is non-contiguous.
      loss = loss_fn(logits[1:].reshape(-1, logits.shape[-1]), tgt[1:].reshape(-1))

      loss.backward()
      optimizer.step()

      epoch_loss += loss.item()

    end_time = timer()

    # Mean loss per batch; NaN (instead of ZeroDivisionError) for an empty dataloader
    train_loss = epoch_loss / num_batches if num_batches else float("nan")
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

  return model


In [24]:
def evaluate(model, dataloader, device="cuda"):
  """Compute token-level accuracy of greedy decoding and print a few examples.

  Args:
    model: seq2seq module called as `model(input, query, teacher_forcing_ratio=0)`;
      it must return scores of shape [query len, batch size, vocab size].
    dataloader: yields (input, query) tensor pairs, sequence-first.
    device: device the batches are moved to.

  Returns:
    Fraction of gold tokens (after <s>, before </s>) whose prediction at the
    same position matches; 0.0 if no token was scored.

  For the first 10 examples the predicted and gold token strings are printed
  (prediction first). The gold query is still passed to the model, but with
  teacher forcing disabled only its length and first row are used for decoding.
  """
  model.eval()
  cnt, total = 0, 0
  shown = 0  # number of examples visited so far; the first 10 are printed
  with torch.no_grad():
    for src, tgt in dataloader:
      src, tgt = src.to(device), tgt.to(device)
      logits = model(src, tgt, teacher_forcing_ratio=0)
      pred = torch.argmax(logits, dim=-1)  # [query len, batch size]

      # Walk the batch example by example (one column each). The previous
      # unconditional `.squeeze()` was only correct for batch size 1 and
      # collapsed to a 0-d tensor for a one-token target.
      pred_columns = pred[1:].t().tolist()
      gold_columns = tgt[1:].t().tolist()
      for pred_ids, gold_ids in zip(pred_columns, gold_columns):
        predw = []
        querw = []
        for p, q in zip(pred_ids, gold_ids):
          if q == QUERY_EOS_INDEX:  # no need to count </s> or the paddings after it
            break

          if p == q:
            cnt += 1

          total += 1

          predw.append(query_idx2word[p])
          querw.append(query_idx2word[q])

        if shown < 10:
          print(" ".join(predw))
          print(" ".join(querw))
        shown += 1

  # Guard against an empty dataloader / empty targets
  acc = cnt / total if total else 0.0
  print("cnt={}, total={}".format(cnt, total))
  return acc

# Run this!

Your outputs should look something like this (not exactly the same numbers, just in a similar ballpark and format).

```
Epoch: 1, Train loss: 4.590
Epoch: 2, Train loss: 1.871
Epoch: 3, Train loss: 1.424
...
Test Accuracy: 0.5195115804672241
```



In [27]:
def main():
    """Build the data pipeline and the model, train for 30 epochs, return the model.

    Evaluation is not run here; the returned model is scored in a later cell.
    """
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"

    train_set, test_set = build_datasets()
    # Only the training loader is needed inside this function
    train_loader, _ = build_dataloaders(train_set, test_set, train_batch_size=20)

    network = create_model(
        input_dim=len(input_vocab),
        output_dim=len(query_vocab),
        device=device,
    )
    return train(network, train_loader, num_epochs=30, device=device)

m = main()


Epoch: 0, Train loss: 3.435, Epoch time = 4.739s
Epoch: 1, Train loss: 2.778, Epoch time = 4.666s
Epoch: 2, Train loss: 2.674, Epoch time = 4.668s
Epoch: 3, Train loss: 2.613, Epoch time = 4.714s
Epoch: 4, Train loss: 2.568, Epoch time = 4.731s
Epoch: 5, Train loss: 2.558, Epoch time = 4.759s
Epoch: 6, Train loss: 2.379, Epoch time = 5.282s
Epoch: 7, Train loss: 2.105, Epoch time = 4.714s
Epoch: 8, Train loss: 1.871, Epoch time = 4.797s
Epoch: 9, Train loss: 1.657, Epoch time = 5.157s
Epoch: 10, Train loss: 1.518, Epoch time = 4.654s
Epoch: 11, Train loss: 1.371, Epoch time = 4.678s
Epoch: 12, Train loss: 1.269, Epoch time = 4.709s
Epoch: 13, Train loss: 1.195, Epoch time = 4.586s
Epoch: 14, Train loss: 1.126, Epoch time = 4.569s
Epoch: 15, Train loss: 1.084, Epoch time = 4.652s
Epoch: 16, Train loss: 1.038, Epoch time = 4.660s
Epoch: 17, Train loss: 0.973, Epoch time = 4.659s
Epoch: 18, Train loss: 0.934, Epoch time = 4.681s
Epoch: 19, Train loss: 0.872, Epoch time = 4.643s
Epoch: 20,

In [28]:
# Score the trained model `m` on the held-out split.
# `dataloader_test` and `device` are notebook globals defined in earlier cells.
test_accuracy = evaluate(
    m,
    dataloader_test,
    device=device,
)
print(f'Test Accuracy: {test_accuracy}')

answer ( a , ( job ( a ) , loc ( a , l ) , const ( l , ' austin ' ) , language ( a ,
answer ( <UNK> , ( job ( <UNK> ) , area ( <UNK> , <UNK> ) , const ( <UNK> , ' tcp / ip ' ) ) ) )
answer ( a , ( job ( a ) , loc ( a , b ) , const ( b , ' austin ' ) , language ( a ,
answer ( <UNK> , ( job ( <UNK> ) , language ( <UNK> , <UNK> ) , const ( <UNK> , ' c + + ' ) ) ) )
answer ( a , ( job ( a ) , loc ( a , l ) , const ( l , ' austin ' ) , language ( a , l ) , const ( c , ' austin ' ) , platform ( a , p ) , const
answer ( a , ( job ( a ) , loc ( a , b ) , const ( b , ' seattle ' ) , \ + ( ( company ( a , n ) , const ( n , ' microsoft ' ) ) ) ) ) )
answer ( a , ( job ( a ) , loc ( a , l ) , const ( l , ' austin ' ) , language ( a , l ) , const ( c , ' austin ' ) , platform ( a , p ) , const ( p , ' unix ' ) ,
answer ( a , ( job ( a ) , loc ( a , h ) , const ( h , ' houston ' ) , req_deg ( a , d ) , const ( d , ' bscs ' ) , req_exp ( a , e ) , const ( e , 1 ) ) ) )
answer ( a , ( job ( a ) , loc 