In [1]:
import torch
print(torch.__version__)
!nvidia-smi -L  

1.11.0+cu113
GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-475a1ac1-caa8-dfdc-a3c5-d2e6ae21d7a9)


In [2]:
!pip install transformers
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
file_path = "/content/gdrive/MyDrive/Colab Notebooks/563Lab4/"

In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

In [5]:
from collections import defaultdict
intent_dic = {'find_restaurant':defaultdict(set), 'find_hotel':defaultdict(set)}
intent_dic

{'find_hotel': defaultdict(set, {}), 'find_restaurant': defaultdict(set, {})}

In [6]:
res_names_context = []
res_names_answers = []
hotel_names_context = []
hotel_names_answers = []
food_names_context = []
food_names_answer = []

with open(file_path + "WOZ_train_utt.txt") as f1:
  with open(file_path + "WOZ_train_ans.txt") as f2:
    utterances = f1.readlines()
    for i,line in enumerate(f2.readlines()):
        a_line = line.strip()
        ans = a_line.split('|')
        intent = ans[0]
        for a in ans[1:]:
            if 'restaurant-name' == a.split('=')[0]:
                res_names_context.append(utterances[i].strip())
                res_names_answers.append(a.split('=')[1])
            if 'restaurant-food' == a.split('=')[0]:
                food_names_context.append(utterances[i].strip())
                food_names_answer.append(a.split('=')[1])
            if 'hotel-name' == a.split('=')[0]:
                hotel_names_context.append(utterances[i].strip())
                hotel_names_answers.append(a.split('=')[1])
            intent_dic[intent][a.split('=')[0]].add(a.split('=')[1])
            
intent_dic

{'find_hotel': defaultdict(set,
             {'hotel-area': {'centre',
               'dontcare',
               'east',
               'north',
               'south',
               'west'},
              'hotel-internet': {'dontcare', 'no', 'yes'},
              'hotel-name': {'a and b guest house',
               'a and b quest house',
               'acorn guest house',
               'alexander bed and breakfast',
               'alexeander b&b',
               'allenbell',
               'alpha milton guest house',
               'alpha-milton guest house',
               'alyesbray lodge guest house',
               'arbury guesthouse and lodge',
               'arbury lodge',
               'arbury lodge guesthouse',
               'archway house',
               'ashley hotel',
               'autumn house',
               'avalon',
               'aylesbray lodge guest house',
               'b guesthouse',
               'bridge guest house',
               'cambridge belfr

**REST. NAME, FOOD NAME, HOTEL NAME classification**

In [7]:
food_names_context[:4]


['I am looking for a restaurant. I would like something cheap that has Chinese food.',
 'Yeah, could you recommend a good gastropub?',
 'I want to find an expensive restaurant and serves european food. Can i also have the address, phone number and its area. ?',
 "Where's a good place to eat crossover food in Cambridge?"]

In [8]:
food_names_answer[:4]

['chinese', 'gastropub', 'european', 'crossover']

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [10]:
def get_answer_span_tensor(question,context,answer):
    ques = (tokenizer.tokenize(question))
    # print(ques)
    context = (tokenizer.tokenize(context))
    # print(context)
    answer = (tokenizer.tokenize(answer))
    # print(answer)
    s = ["[CLS]"] + ques + ["[SEP]"] + context + ["[SEP]"]
    # print(s)
    start, end = 0, 0
    for id_a, token_a in enumerate(answer):
      for id_s, token_s in enumerate(s):
        if token_a == token_s:
          if answer == s[id_s:id_s+len(answer)]:
            start = id_s
            end = id_s + len(answer) - 1
            break
          elif len(s) - id_s + 1 <= len(answer):
            break
    return torch.tensor([start, end])

In [11]:
test_question = "what food?"
test_context = "I am looking for a restaurant. I would like something cheap that has Chinese food."
test_answer = "chinese"
bad_answer  = "indian"
span = get_answer_span_tensor(test_question,test_context,test_answer)
# print(span)
assert span.shape == (2,)
assert list(span) == [19,19]
span = get_answer_span_tensor(test_question,test_context,bad_answer)
assert list(span) == [0,0]
print('Success!')

Success!


In [12]:
def convert_to_BERT_tensors(questions, contexts):
    '''takes a parallel list of question strings and answer strings'''
    #your code here
    result = tokenizer(questions,contexts,  return_tensors="pt", padding='max_length', truncation=True,  max_length=512)
    return result['input_ids'], result['attention_mask']

In [13]:
test_questions = ["Why?", "How?"]
test_contexts = ["I think it is because we can bluminate", "It was done"" ".join(["very"]*1000) + " well"]

ids, mask = convert_to_BERT_tensors(test_questions,test_contexts)
assert ids.shape == (2,512) # 512 because that's the max allowed
assert ids[0][3] == 102 # fourth token is separator
assert list(ids[0][-100:]) == [0]*100 # first row is mostly padding
assert list(ids[1][-100:]) != [0]*100 # second row is not
assert list(mask[0][-100:]) == [0]*100 # first row padding is masked
assert list(mask[1][-100:]) != [0]*100 # second row is not padding, no mask
print("Success!")

Success!


In [14]:
a = [1,2]
b = [3,4]
a.extend(b)
a.extend([3]*4)
a

[1, 2, 3, 4, 3, 3, 3, 3]

In [15]:
food_question = ["What food?"]*len(food_names_context)
res_name_question = ["Which restaurant"]*len(res_names_context)
hotel_name_question = ["Which hotel"]*len(hotel_names_context)

len(food_names_context)

1103

In [16]:
#provided code
batch_size = 16

class QAdataset(Dataset):
    '''A dataset for housing QA data, including input_data, output_data, and padding mask'''
    def __init__(self, input_data, output_data,mask):
        self.input_data = input_data
        self.output_data = output_data
        self.mask = mask
        
    def __len__(self):
        return len(self.input_data)
    
    def __getitem__(self, index):
        target = self.output_data[index]
        data_val = self.input_data[index]
        mask = self.mask[index]
        return data_val,target,mask 

In [17]:

##BUILDING TRAIN


res_names_context = []
res_names_answers = []
hotel_names_context = []
hotel_names_answers = []
food_names_context = []
food_names_answer = []
all_context = []
with open(file_path + "WOZ_train_utt.txt") as f1:
  with open(file_path + "WOZ_train_ans.txt") as f2:
    utterances = f1.readlines()
    print(len(utterances))
    for i,line in enumerate(f2.readlines()):
        a_line = line.strip()
        ans = a_line.split('|')
        intent = ans[0]
        # if("hotel" not in intent):
        #     continue
        all_context.append(utterances[i].strip())
        resName = 'XXNOANSWERXX'
        foodName = 'XXNOANSWERXX'
        hotelName = 'XXNOANSWERXX'

        for a in ans[1:]:
            if 'restaurant-name' == a.split('=')[0]:
                resName = a.split('=')[1]
                
            if 'restaurant-food' == a.split('=')[0]:
                foodName = a.split('=')[1]
               
            if 'hotel-name' == a.split('=')[0]:
                hotelName = a.split('=')[1]
                
        res_names_answers.append(resName)
        food_names_answer.append(foodName)
        hotel_names_answers.append(hotelName)


print(len(res_names_answers))
print(len(food_names_answer))
print(len(hotel_names_answers))
questions = []
contexts = []

food_question = ["What kind of food?"]*len(food_names_answer)
res_name_question = ["Which restaurant"]*len(res_names_answers)
hotel_name_question = ["Which hotel"]*len(hotel_names_answers)

contexts = all_context
questions = food_question
QA_input, masks = convert_to_BERT_tensors(questions, contexts)
answers = food_names_answer
spans = [get_answer_span_tensor(questions[i], contexts[i], answers[i]) for i in range(len(questions))]


food_train_dataloader = DataLoader(QAdataset(QA_input, spans, masks), batch_size=batch_size, num_workers=4, shuffle=False)

questions = res_name_question
QA_input, masks = convert_to_BERT_tensors(questions, contexts)
answers = res_names_answers
spans = [get_answer_span_tensor(questions[i], contexts[i], answers[i]) for i in range(len(questions))]

rest_train_dataloader = DataLoader(QAdataset(QA_input, spans, masks), batch_size=batch_size, num_workers=4, shuffle=False)

questions = hotel_name_question
QA_input, masks = convert_to_BERT_tensors(questions, contexts)
answers = hotel_names_answers
spans = [get_answer_span_tensor(questions[i], contexts[i], answers[i]) for i in range(len(questions))]
hotel_train_dataloader = DataLoader(QAdataset(QA_input, spans, masks), batch_size=batch_size, num_workers=4, shuffle=False)

3760
3760
3760
3760


In [18]:
##BUILDING DEV


res_names_context = []
res_names_answers = []
hotel_names_context = []
hotel_names_answers = []
food_names_context = []
food_names_answer = []
all_context = []
with open(file_path + "WOZ_dev_utt.txt") as f1:
  with open(file_path + "WOZ_dev_ans.txt") as f2:
    utterances = f1.readlines()
    print(len(utterances))
    for i,line in enumerate(f2.readlines()):
        a_line = line.strip()
        ans = a_line.split('|')

        intent = ans[0]
        
        all_context.append(utterances[i].strip())
        resName = 'XXNOANSWERXX'
        foodName = 'XXNOANSWERXX'
        hotelName = 'XXNOANSWERXX'

        for a in ans[1:]:
            if 'restaurant-name' == a.split('=')[0]:
                resName = a.split('=')[1]
                
            if 'restaurant-food' == a.split('=')[0]:
                foodName = a.split('=')[1]
               
            if 'hotel-name' == a.split('=')[0]:
                hotelName = a.split('=')[1]
                
        res_names_answers.append(resName)
        food_names_answer.append(foodName)
        hotel_names_answers.append(hotelName)


print(len(res_names_answers))
print(len(food_names_answer))
print(len(hotel_names_answers))
questions = []
contexts = []

food_question = ["What kind of food?"]*len(food_names_answer)
res_name_question = ["Which restaurant"]*len(res_names_answers)
hotel_name_question = ["Which hotel"]*len(hotel_names_answers)

questions.extend(food_question)
questions.extend(res_name_question)
questions.extend(hotel_name_question)

contexts = all_context
questions = food_question
QA_input, masks = convert_to_BERT_tensors(questions, contexts)
answers = food_names_answer
spans = [get_answer_span_tensor(questions[i], contexts[i], answers[i]) for i in range(len(questions))]


food_dev_dataloader = DataLoader(QAdataset(QA_input, spans, masks), batch_size=batch_size, num_workers=4, shuffle=False)

questions = res_name_question
QA_input, masks = convert_to_BERT_tensors(questions, contexts)
answers = res_names_answers
spans = [get_answer_span_tensor(questions[i], contexts[i], answers[i]) for i in range(len(questions))]

rest_dev_dataloader = DataLoader(QAdataset(QA_input, spans, masks), batch_size=batch_size, num_workers=4, shuffle=False)

questions = hotel_name_question
QA_input, masks = convert_to_BERT_tensors(questions, contexts)
answers = hotel_names_answers
spans = [get_answer_span_tensor(questions[i], contexts[i], answers[i]) for i in range(len(questions))]
hotel_dev_dataloader = DataLoader(QAdataset(QA_input, spans, masks), batch_size=batch_size, num_workers=4, shuffle=False)

413
413
413
413


In [19]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
criterion = nn.CrossEntropyLoss()
# create checkpoint directory
import os
save_path = '/content/gdrive/MyDrive/Colab Notebooks/563Lab4/'
if os.path.exists(save_path) == False:
    os.makedirs(save_path)
from tqdm import tqdm, trange
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix

In [20]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

In [21]:
model_food = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
model_rest = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
model_hotel = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [22]:
# Parameters:
lr = 0.00003
max_grad_norm = 1.0
epochs = 1
# warmup_proportion = 0.1
# num_training_steps  = 77558 * epochs
# num_warmup_steps = num_training_steps * warmup_proportion


model_food.to(device)
model_rest.to(device)
model_hotel.to(device)
optimizer_food = optim.Adam(model_food.parameters(), lr=lr)
optimizer_rest = optim.Adam(model_rest.parameters(), lr=lr)
optimizer_hotel = optim.Adam(model_hotel.parameters(), lr=lr)

In [23]:
from tqdm import tqdm
predicted_starts = []
gold_starts = []
predicted_ends = []
gold_ends = []
def train_n_evaluate(model, optimizer, train_itr, dev_itr):
    
    epoch_loss = 0
    l = 0
    best_start_accuracy = 0
    # Uncomment while training

    for epoch in range(epochs):
      model.train()
      loop = tqdm(train_itr, leave=True)
      for train_text_batch, train_span_batch, masks in loop:
          train_text_batch = train_text_batch.to(device)
          train_span_batch = train_span_batch.to(device)
          masks = masks.to(device)
          outputs = model(train_text_batch, 
                          attention_mask=masks,
                          start_positions=train_span_batch[:, 0],
                          end_positions=train_span_batch[:, 1])
          
              
          loss = outputs.loss 
          
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          
          loop.set_description(f'Epoch {epoch}')
          l += 1
          epoch_loss += loss.cpu().item()
          overall_loss = epoch_loss/l
          loop.set_postfix(loss=loss.item())
          loop.set_postfix(overall_loss=overall_loss)
          # delete used variables to free GPU memory
          del train_text_batch, masks, train_span_batch

      model.eval()
      loop = tqdm(dev_itr)

      
      model.eval()

      for dev_text_batch, dev_span_batch, masks in loop:
          with torch.no_grad():
              dev_text_batch = dev_text_batch.to(device)
              dev_span_batch = dev_span_batch.to(device)
              masks = masks.to(device)
              outputs = model(dev_text_batch, attention_mask=masks,
                              start_positions = dev_span_batch[:, 0],
                              end_positions= dev_span_batch[:, 1])
              start_pred = torch.argmax(outputs['start_logits'], dim=1)
              end_pred = torch.argmax(outputs['end_logits'], dim=1)
              predicted_starts.extend(start_pred.tolist())
              gold_starts.extend(dev_span_batch[:, 0].tolist())
              predicted_ends.extend(end_pred.tolist())
              gold_ends.extend(dev_span_batch[:, 1].tolist())  
      
      a1 = accuracy_score(gold_starts,predicted_starts)
      a2 = accuracy_score(gold_ends,predicted_ends)
      # if(a1 + a2 > best_start_accuracy):
      #   best_start_accuracy = a1 + a2
      torch.save(model.state_dict(),'563.pt')
      print(f"Starts accuracy: {accuracy_score(gold_starts,predicted_starts)*100:0.2f}%")
      print(f"Ends accuracy: {accuracy_score(gold_ends,predicted_ends)*100:0.2f}%")

In [24]:
print(f"FOOD MODEL")
train_n_evaluate(model_food, optimizer_food, food_train_dataloader, food_dev_dataloader)

print(f"RESTAURANT MODEL")
train_n_evaluate(model_rest, optimizer_rest, rest_train_dataloader, rest_dev_dataloader)

print(f"HOTEL MODEL")
train_n_evaluate(model_hotel, optimizer_hotel, hotel_train_dataloader, hotel_dev_dataloader)

FOOD MODEL


Epoch 0: 100%|██████████| 235/235 [01:41<00:00,  2.32it/s, overall_loss=0.445]
100%|██████████| 26/26 [00:03<00:00,  6.66it/s]


Starts accuracy: 99.27%
Ends accuracy: 98.79%
RESTAURANT MODEL


Epoch 0: 100%|██████████| 235/235 [01:41<00:00,  2.32it/s, overall_loss=0.457]
100%|██████████| 26/26 [00:03<00:00,  6.64it/s]


Starts accuracy: 98.55%
Ends accuracy: 97.82%
HOTEL MODEL


Epoch 0: 100%|██████████| 235/235 [01:41<00:00,  2.32it/s, overall_loss=0.407]
100%|██████████| 26/26 [00:03<00:00,  6.67it/s]


Starts accuracy: 98.79%
Ends accuracy: 97.98%


In [25]:

from sklearn.feature_extraction.text import CountVectorizer
import numpy as np 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [26]:
woz_directory = file_path
with open(woz_directory + "WOZ_train_utt.txt") as f:
    train =f.read().splitlines()

with open(woz_directory + "WOZ_dev_utt.txt") as f:
    dev = f.read().splitlines()

with open(woz_directory + "WOZ_test_utt.txt") as f:
    test = f.read().splitlines()

train_category = []
train_facility = []

with open(woz_directory + "WOZ_train_ans.txt") as f:
    for line in f:
        splitting = line.strip().split('|')
        train_category.append(splitting[0])
        train_facility.append(splitting[1:])
        
dev_category = []
dev_facility = []
        
with open(woz_directory + "WOZ_dev_ans.txt") as f:
    for line in f:
        splitting = line.strip().split('|')
        dev_category.append(splitting[0])
        dev_facility.append(splitting[1:])



lr = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('lr', LogisticRegression(max_iter=1000)),])


lr.fit(train, train_category)
print("train_binary_score: ",lr.score(train, train_category))
print("dev_binary_score: ",lr.score(dev, dev_category))

test_binary_predictions = lr.predict(test)

train_binary_score:  0.9976063829787234
dev_binary_score:  0.9975786924939467


In [27]:
test[:5]

['Hello, I am looking for a restaurant in Cambridge. I believe it is called Golden Wok.',
 "Hi, I'm looking for a hotel to stay in that includes free wifi. I'm looking to stay in a hotel, not a guesthouse.",
 'I am looking for a place to stay in the north of the city. I would prefer a 4 star hotel please.',
 "I need a place to dine, and I'd like to know what's available so far as cheap Asian Oriental food.",
 'I need a five starts hotel close to a mall and main restaurants. The hotel should include free wifi in the room.']

In [28]:
test_binary_predictions[:5]

array(['find_restaurant', 'find_hotel', 'find_hotel', 'find_restaurant',
       'find_hotel'], dtype='<U15')

In [29]:
##BUILDING TEST


res_names_context = []
res_names_answers = []
hotel_names_context = []
hotel_names_answers = []
food_names_context = []
food_names_answer = []
all_context = []
with open(file_path + "WOZ_test_utt.txt") as f1:
  
  utterances = f1.readlines()
  total = len(utterances)
  print(total)
  for i,line in enumerate(utterances):
      
      all_context.append(utterances[i].strip())
      resName = 'XXNOANSWERXX'
      foodName = 'XXNOANSWERXX'
      hotelName = 'XXNOANSWERXX'
       
      res_names_answers.append(resName)
      food_names_answer.append(foodName)
      hotel_names_answers.append(hotelName)



questions = []
contexts = []

food_question = ["What kind of food?"]*len(food_names_answer)
res_name_question = ["Which restaurant"]*len(res_names_answers)
hotel_name_question = ["Which hotel"]*len(hotel_names_answers)

contexts = all_context
questions = food_question
QA_input, masks = convert_to_BERT_tensors(questions, contexts)
spans = [torch.tensor([0, 0])for i in range(len(questions))]

food_test_dataloader = DataLoader(QAdataset(QA_input, spans, masks), batch_size=batch_size, num_workers=4, shuffle=False)

questions = res_name_question
QA_input, masks = convert_to_BERT_tensors(questions, contexts)
rest_test_dataloader = DataLoader(QAdataset(QA_input, spans, masks), batch_size=batch_size, num_workers=4, shuffle=False)

questions = hotel_name_question
QA_input, masks = convert_to_BERT_tensors(questions, contexts)
hotel_test_dataloader = DataLoader(QAdataset(QA_input, spans, masks), batch_size=batch_size, num_workers=4, shuffle=False)


400


In [30]:
!pip install pulp



In [31]:
import pulp
# your code here
def select_best_answer_span(start_probs, end_probs, distance):
    
    if start_probs.shape[0] != end_probs.shape[0]:
        return
    
    startings = np.arange(start_probs.shape[1])
    endings = np.arange(end_probs.shape[1])
    result = []
    for i, start in enumerate(start_probs):
        
        log_prob_start = start
        log_prob_end = end_probs[i]
        start_or_end = np.arange(2)
        all_options = startings


        prob = pulp.LpProblem("QA_Problem", pulp.LpMaximize)
        x = pulp.LpVariable.dicts("x", (start_or_end, all_options), 0, 1, pulp.LpInteger)
        prob += pulp.lpSum(((x[0][i]*log_prob_start[i]) + ((x[1][i]*log_prob_end[i])) for i in startings))

        #only one possible ans
        prob += pulp.lpSum(x[0][i] for i in all_options) == 1
        prob += pulp.lpSum(x[1][i] for i in all_options) == 1

        for i in startings:
            prob += pulp.lpSum([[x[0][i]]] + [-x[1][dis] for dis in range(i, min(i+distance+1, len(startings)))]) <= 0
            
                
        prob.solve()
        # print(prob)
        # print(pulp.LpStatus[prob.status])
        # print(pulp.value(prob.objective))
        start = 0
        end = 0
        for index in startings:
            if x[0][index].value() == 1 :
                start = index
            if x[1][index].value() == 1 :
                end = index
        # if(start == 0):
        #     start = log_prob_start.argmax()
        # if(end == 0):
        #     end = log_prob_end.argmax()
        result.append((start, end))
    
    return result
test_starts = np.array([[0.1,0.5,0.2,0.1,0.1], [0.3,0.2,0.2,0.1,0.1]])
test_ends = np.array([[0.4,0.1,0.3,0.1,0.1], [0.1,0.1,0.1,0.1,0.6]])
select_best_answer_span(test_starts,test_ends,2)

[(1, 2), (2, 4)]

In [32]:
food_predictions= []
rest_predictions = []
hotel_predictions = []
loop = tqdm(food_test_dataloader)
for text_batch, span_batch, masks in loop:
  text_batch = text_batch.to(device)
  masks = masks.to(device)
  output = model_food(text_batch, masks)
  result = select_best_answer_span(output.start_logits,output.end_logits, 10)
  for i, (start_index, end_index) in enumerate (result):
    food_predictions.append(tokenizer.decode(text_batch[i][start_index:end_index+1]))
loop = tqdm(rest_test_dataloader)
for text_batch, span_batch, masks in loop:
  text_batch = text_batch.to(device)
  masks = masks.to(device)
  output = model_rest(text_batch, masks)
  result = select_best_answer_span(output.start_logits,output.end_logits, 10)
  for i, (start_index, end_index) in enumerate (result):
    rest_predictions.append(tokenizer.decode(text_batch[i][start_index:end_index+1]))
loop = tqdm(hotel_test_dataloader)
for text_batch, span_batch, masks in loop:
  text_batch = text_batch.to(device)
  masks = masks.to(device)
  output = model_hotel(text_batch, masks)
  result = select_best_answer_span(output.start_logits,output.end_logits, 10)
  for i, (start_index, end_index) in enumerate (result):
    hotel_predictions.append(tokenizer.decode(text_batch[i][start_index:end_index+1]))

100%|██████████| 25/25 [02:49<00:00,  6.78s/it]
100%|██████████| 25/25 [02:51<00:00,  6.84s/it]
100%|██████████| 25/25 [02:47<00:00,  6.71s/it]


In [33]:
questions[399]

'Which hotel'

In [34]:
list(zip(questions, hotel_predictions))[:5]

[('Which hotel', '[CLS]'),
 ('Which hotel', '[CLS]'),
 ('Which hotel', '[CLS]'),
 ('Which hotel', '[CLS]'),
 ('Which hotel', '[CLS]')]

In [35]:
hotel_slot_order = ["hotel-area","hotel-internet", "hotel-name", "hotel-parking","hotel-pricerange", "hotel-stars", "hotel-type"]
restaurant_slot_order = ["restaurant-area","restaurant-food","restaurant-name","restaurant-pricerange"]

In [36]:
import editdistance

def predict_area(utterance, establishment, filter=[]):
    
    utterance = utterance.lower()
    for f in filter:
        utterance = utterance.replace(f, "XXXX")
    dont_care = ["any area", "anywhere in town", "any part"]
    
    if "north" in utterance:
        return f"{establishment}-area=north"
    
    if "south" in utterance:
        return f"{establishment}-area=south"
    
    if "east" in utterance:
        return f"{establishment}-area=east"
    
    if "west" in utterance:
        return f"{establishment}-area=west"
    
    if "center" in utterance or "centr" in utterance or "downtown" in utterance:
        return f"{establishment}-area=centre"
    
    for word in utterance.split():
        if editdistance.eval("centre", word) == 1:
            return f"{establishment}-area=centre"
    
    for phrase in dont_care:
        if phrase in utterance:
            return f"{establishment}-area=dontcare"
        
    if "don't care" in utterance and ("what part" in utterance or "what area" in utterance.lower()):
        return f"{establishment}-area=dontcare"
    
    return ""

In [37]:
correct = 0
incorrect = 0

for utt, ans, cat in list(zip(dev, dev_facility, dev_category)):
    # print(ans)
    if ("-area" in " ".join(ans) and predict_area(utt, cat.split("_")[1]) in " ".join(ans)) or ("-area" not in " ".join(ans) and predict_area(utt, cat.split("_")[1]) == ""):
        correct += 1
    else:
        print(utt)
        print(ans)
        print(f'DETECTION {predict_area(utt, cat.split("_")[1])}')
        incorrect += 1
        
        
print(correct)
print(incorrect)

I'm looking for a expensive restaurant that serves north american food.
['restaurant-food=north american', 'restaurant-pricerange=expensive']
DETECTION restaurant-area=north
I need to find a barbeque restaurant in the centre of town.
['restaurant-food=barbeque']
DETECTION restaurant-area=centre
Please find a place to eat that serves South Indian food in the center.
['restaurant-area=centre', 'restaurant-food=south indian']
DETECTION restaurant-area=south
Can you help me locate an expensive restaurant offering Northern European cuisine?
['restaurant-food=northern european', 'restaurant-pricerange=expensive']
DETECTION restaurant-area=north
I want to find an expensive restaurant that has North American food.
['restaurant-food=north american', 'restaurant-pricerange=expensive']
DETECTION restaurant-area=north
I'd like to find a place that I can get some Northern European food.
['restaurant-food=northern european']
DETECTION restaurant-area=north
A friend recommended the City Centre North 

In [38]:
def predict_pricerange(utterance, establishment, filter=[]):
    
    utterance = utterance.lower()
    for f in filter:
        utterance = utterance.replace(f, "XXXX")
    dont_care = ["cost", "pricerange", "price range"]
    
    moderate = ["moderate", "medium", "mid range", "mid-range", "mid price",
                "mid-price", "middle price", "middle-price", "middle range",
                "middle-range", "too cheap", "too expensive", "decent"]
    
    single_word_moderate = [word for word in moderate if " " not in word]
    
    cheap = ["cheap", "inexpensive", "low price", "low-price", "lower price", "lower-price"]
    
    expensive = ["expensive", "pricy", "prici", "upscale", "high price", "high-price",
                 "an object", "an issue", "posh", "show off", "high scale", "high-scale",
                 "fancy", "high end", "high-end"]
    
    single_word_expensive = [word for word in expensive if " " not in word]
    
    negation = ["n't", "not"]
    
    if "don't care" in utterance:
        for phrase in dont_care:
            if phrase in utterance:
                distance = utterance.index(phrase) - utterance.index("don't care")
                if distance > 0 and distance < 30:
                    return f"{establishment}-pricerange=dontcare"
        if "how expensive" in utterance or "how much" in utterance:
            return f"{establishment}-pricerange=dontcare"
        
    for phrase in moderate:
        if phrase in utterance:
            return f"{establishment}-pricerange=moderate"
        elif "expensive" in utterance and "cheap" in utterance:
            return f"{establishment}-pricerange=moderate"

    for word1 in single_word_moderate:
        for word2 in utterance.split():
            if editdistance.eval(word1, word2) == 1:
                return f"{establishment}-pricerange=moderate"
        
    for phrase in cheap:
        if phrase in utterance:
            return f"{establishment}-pricerange=cheap"
    
    for phrase in expensive:
        if phrase in utterance:
            for neg in negation:
                if neg in utterance:
                    distance = utterance.index(phrase) - utterance.index(neg)
                    if distance > 0 and distance < 20:
                        return f"{establishment}-pricerange=cheap"
                    else:
                        return f"{establishment}-pricerange=expensive"
            return f"{establishment}-pricerange=expensive"
        
    for word1 in single_word_expensive:
        for word2 in utterance.split():
            if editdistance.eval(word1, word2) == 1 and word2 != "price" and word2 != "post":
                print(word1, word2)
                return f"{establishment}-pricerange=expensive"
            
    return ""

In [39]:
correct = 0
incorrect = 0

for utt, ans, cat in list(zip(dev, dev_facility, dev_category)):
    # print(ans)
    if ("-price" in " ".join(ans) and predict_pricerange(utt, cat.split("_")[1]) in " ".join(ans)) or ("-price" not in " ".join(ans) and predict_pricerange(utt, cat.split("_")[1]) == ""):
        correct += 1
    else:
        print(utt)
        print(ans)
        print(f'DETECTION {predict_pricerange(utt, cat.split("_")[1])}')
        incorrect += 1
        
        
print(correct)
print(incorrect)

Where are some restaurants that serve mediterranean dishes that are in the low to moderate price range?
['restaurant-food=mediterranean', 'restaurant-pricerange=cheap']
DETECTION restaurant-pricerange=moderate
Please find a cheap place to eat that serves light bites food.
['restaurant-food=light bites']
DETECTION restaurant-pricerange=cheap
411
2


In [40]:
def predict_stars(utterance, establishment, filter=[]):
    
    utterance = utterance.lower()
    for f in filter:
        utterance = utterance.replace(f, "XXXX")
    contains_star = False
    
    one = ["1", "one"]
    
    two = ["2", "two"]
    
    three = ["3", "three"]
    
    four = ["4", "four"]
    
    five = ["5", "five"]
    
    ratings = [one, two, three, four, five]
    
    for word in utterance.split():
        if "star" in word:
            contains_star = True
            break
    
    if contains_star == False:
        return ""
    
    if "star " in utterance:
        index = utterance.index("star ")
        
    else:
        index = utterance.index("star")
        
    for i, rating in enumerate(ratings):
        for rep in rating:
            if rep in utterance[index-7:index]:
                return f"{establishment}-stars={i+1}"
            
    return ""

In [41]:
correct = 0
incorrect = 0

for utt, ans, cat in list(zip(dev, dev_facility, dev_category)):
    # print(ans)
    if ("-star" in " ".join(ans) and predict_stars(utt, cat.split("_")[1]) in " ".join(ans)) or ("-star" not in " ".join(ans) and predict_stars(utt, cat.split("_")[1]) == ""):
        correct += 1
    else:
        print(utt)
        print(ans)
        print(f'DETECTION {predict_stars(utt, cat.split("_")[1])}')
        incorrect += 1
        
        
print(correct)
print(incorrect)

413
0


In [42]:
for utt, ans in list(zip(train, train_facility)):
    utt = utt.lower()
    if "internet=dont" in " ".join(ans):
        print(utt)

i'm looking for a place to stay. i'd like a hotel, not a guesthouse, and i don't care about internet.
i am looking for a hotel near the mall. the hotel should has free indoor parking, even it doesn't include internet.
i'm looking for a hotel with a 4 star rating to stay at and i don't need to have internet. can you help?
i'm looking for a place to stay in the centre. it doesn't need to include internet.
can you help me find a place to stay that has free parking? it doesn't need to include internet.
howdy, i'm looking for a place to stay. i don't care about internet.
i need a place to stay in the east. i don't need to have access to the internet.
can you give me info on four start places, it doesn;t matter if they have free wifi or not.
thank you. i just happen to be looking for a place to stay. the hotel should be located west and it doesn't need to have free parking or anything like that.
hello i am looking for a hotel on the west side of town. i'm not picky about wi-fi.
i am looking 

In [43]:
import re
s = "yes, i'm searching for a 4 star rated places to stay in town. it does not need to have any internet connection but parking "
"internet" in re.split(',|\.| but | although ', s)[2]

True

In [44]:
import string
def predict_internet(utterance, establishment, filter=[]):
    
    # utterance = utterance.lower().translate(str.maketrans("","",string.punctuation))
    
    for f in filter:
        utterance = utterance.replace(f, "XXXX")

    utterance = utterance.lower()
    utt = re.split(',|\.| but | although ', utterance)#utterance.strip().split(" ")
    
    
    internets = ["internet", "wifi", "wi-fi", "wi fi"]
    negation = ["not", "isn't", "is not", "don't", "do not", "hate", " no ", "without", "dont", "doesn't", "doesnt"]
    dont_care = ["care", "need", "necessary", "necessity"]
    
    for i in internets:
      for u in utt:
        if i in u:
          for r in negation:
            if r in u:
              for c in dont_care:
                if c in u:
                  return f"{establishment}-internet=dontcare"
              return f"{establishment}-internet=no"
          return f"{establishment}-internet=yes"
    
    
    return ""

In [45]:
correct = 0
incorrect = 0

for utt, ans, cat in list(zip(train, train_facility, train_category)):
    # print(ans)
    if ("-internet" in " ".join(ans) and predict_internet(utt, cat.split("_")[1]) in " ".join(ans)) or ("-internet" not in " ".join(ans) and predict_internet(utt, cat.split("_")[1]) == ""):
        correct += 1
    else:
        print(utt)
        print(ans)
        print(f'DETECTION {predict_internet(utt, cat.split("_")[1])}')
        incorrect += 1
        
        
print(correct)
print(incorrect)

I am looking for a place to stay. The hotel should be in the type of guesthouse and doesn't need to include internet
['hotel-type=guesthouse']
DETECTION hotel-internet=dontcare
I need a place to stay that doesn't have to have internet and is in the expensive price range please.
['hotel-pricerange=expensive']
DETECTION hotel-internet=dontcare
Yes, I'm looking to stay at a guesthouse while I'm in town. I don't need internet access, so don't worry about that.
['hotel-type=guesthouse']
DETECTION hotel-internet=dontcare
Hi. I'm looking for a place to spend the night, and I want somewhere without any hidden fees for parking or wifi.
['hotel-internet=yes', 'hotel-parking=yes']
DETECTION hotel-internet=no
I am looking for a hotel near the mall. The hotel should has free indoor parking, even it doesn't include internet.
['hotel-internet=dontcare', 'hotel-parking=yes']
DETECTION hotel-internet=no
Are there any hotels where the wifi doesn't cost extra?
['hotel-internet=yes']
DETECTION hotel-inter

In [46]:
def predict_parking(utterance, establishment, filter=[]):
    
    # utterance = utterance.lower().translate(str.maketrans("","",string.punctuation))
    
    for f in filter:
        utterance = utterance.replace(f, "XXXX")
    
    # utt = utterance
    # parkings = ["parking", "parkin", "pakring", "park"]
    # dont_care = ["doesnt need", "dont care", "not necessary", "does not need", "do not care"]
    # refuses = ["dont", "don't","dont need", "doesn't", "doesnt", "dont", "not"]
    
    # for p in parkings:
    #   if p in utt.strip().split(" "):
    #     for r in dont_care:
    #       if r in utt and ((utt.index(p) - utt.index(r)) > 0 and len(utt[utt.index(r) : utt.index(p)].split(' ')) <8 or (utt.index(p) - utt.index(r)) <= 0 and len(utt[utt.index(p) : utt.index(r)].split(' ')) <8) :
    #         return f"{establishment}-parking=dontcare"
    #     for r in refuses:
    #       if r in utt and ((utt.index(p) - utt.index(r)) > 0 and len(utt[utt.index(r) : utt.index(p)].split(' ')) <8 or (utt.index(p) - utt.index(r)) <= 0 and len(utt[utt.index(p) : utt.index(r)].split(' ')) <8) :
    #         return f"{establishment}-parking=no"
    #     return f"{establishment}-parking=yes"
    utterance = utterance.lower()
    utt = re.split(',|\.| but | although ', utterance)#utterance.strip().split(" ")
    
    
    parkings = ["parking", "parkin", "pakring", "park"]
    negation = ["not", "don't", "do not", " no ", "without", "dont", "doesn't", "doesnt"]
    dont_care = ["care", "necessary", "necessity", "need"]
    
    for i in parkings:
      for u in utt:
        if i in u:
          for r in negation:
            if r in u:
              for c in dont_care:
                if c in u:
                  return f"{establishment}-parking=dontcare"
              return f"{establishment}-parking=no"
          return f"{establishment}-parking=yes"
    
    
    return ""
    


In [47]:
correct = 0
incorrect = 0
x=0
for utt, ans, cat in list(zip(train, train_facility, train_category)):

    # if ("-parking=dont" in " ".join(ans)):
        # x += 1
        # print(x)
        # print(utt)
        # print(ans)
    if ("-parking" in " ".join(ans) and predict_parking(utt, cat.split("_")[1]) in " ".join(ans)) or ("-parking" not in " ".join(ans) and predict_parking(utt, cat.split("_")[1]) == ""):
        correct += 1
    else:
        print(utt)
        print(ans)
        print(f'DETECTION {predict_parking(utt, cat.split("_")[1])}')
        incorrect += 1
        
        
print(correct)
print(incorrect)

Hi. I'm looking for a place to spend the night, and I want somewhere without any hidden fees for parking or wifi.
['hotel-internet=yes', 'hotel-parking=yes']
DETECTION hotel-parking=no
Hi I need a hotel that is in the east. I don't need free parking.
['hotel-area=east', 'hotel-parking=no']
DETECTION hotel-parking=dontcare
Hello, I'm looking for a hotel that is cheap. It doesn't need to have free parking.
['hotel-parking=no', 'hotel-pricerange=cheap']
DETECTION hotel-parking=dontcare
Can you help me find a place to stay that has free parking? It doesn't need to include internet.
['hotel-area=dontcare', 'hotel-internet=dontcare', 'hotel-name=dontcare', 'hotel-parking=yes', 'hotel-type=dontcare']
DETECTION hotel-parking=dontcare
I'm looking for a place to stay. I need to find free parking and I only want to stay in a hotel and not any thing else other than that.
['hotel-parking=yes']
DETECTION hotel-parking=dontcare
Can you help me find a place to stay? I am looking for an expensive hotel

In [48]:

def predict_type(utterance, establishment, filter=[]):
    
    utterance = utterance.lower().translate(str.maketrans("","",string.punctuation))
    
    for f in filter:
        utterance = utterance.replace(f, "XXXX")

    utt = utterance.strip().split(" ")

    dont_care = ["doesn't matter", "don't care", "dont care"]
    types = ["guesthouse", "guest", "Guest", "guesthouses"]
    refuses = ["dont", "don't", "doesn't", "doesnt", "not"]
    maybes = ["maybe", "care"]


    for t in types:
      if t in utt:
        for r in dont_care:
          if r in utt and utt.index(t) - utt.index(r) <= 3:
            return f"{establishment}-type=dontcare"
        for r in refuses:
          if r in utt and utt.index(t) - utt.index(r) <= 3:
            return ""
        return f"{establishment}-type=guesthouse"
    
    
    return ""

In [49]:
correct = 0
incorrect = 0

for utt, ans, cat in list(zip(train, train_facility, train_category)):
  
  
    if ("-type" in " ".join(ans) and predict_type(utt, cat.split("_")[1]) in " ".join(ans)) or ("-type" not in " ".join(ans) and predict_type(utt, cat.split("_")[1]) == ""):
        correct += 1
    else:
        print(utt)
        print(ans)
        print(f'DETECTION {predict_type(utt, cat.split("_")[1])}')
        incorrect += 1
        
        
print(correct)
print(incorrect)

I am looking for a hotel named alyesbray lodge guest house.
['hotel-name=alyesbray lodge guest house']
DETECTION hotel-type=guesthouse
I am looking for a guesthouse that has free parking.
['hotel-parking=yes']
DETECTION hotel-type=guesthouse
I'm looking for a hotel called the a and b guest house. Can you help me out?
['hotel-name=a and b guest house']
DETECTION hotel-type=guesthouse
I am looking for a hotel call arbury lodge guesthouse.
['hotel-name=arbury lodge guesthouse']
DETECTION hotel-type=guesthouse
Yes, could you please tell me about the Alpha-Milton guest house?
['hotel-name=alpha-milton guest house']
DETECTION hotel-type=guesthouse
I am looking to get some information on the acorn guest house.
['hotel-name=acorn guest house']
DETECTION hotel-type=guesthouse
I need a place to stay in Cambridge. It should be either a hotel or guesthouse with 1 star and moderate price range. Can you help me with that?
['hotel-stars=1']
DETECTION hotel-type=guesthouse
Can you help me find the a a

In [50]:
kaggle_ans = []
for i,utt in enumerate(test):
  line = ''
  # print(utt)
  intent = test_binary_predictions[i]
  line += intent
  if("restaurant" in intent):
      filter = []
      filter.append(food_predictions[i])
      filter.append(rest_predictions[i])
      if(predict_area(utt,"restaurant", filter) != ""):
          line += "|"+predict_area(utt,"restaurant")
      if(food_predictions[i] != "[CLS]"):
          line += f"|restaurant-food={food_predictions[i]}"
      if(rest_predictions[i] != "[CLS]"):
          line += f"|restaurant-name={rest_predictions[i]}"
      if(predict_pricerange(utt,"restaurant", filter) != ""):
          line += "|"+predict_pricerange(utt,"restaurant")
  else:
      filter = []
      filter.append(hotel_predictions[i])
      if(predict_area(utt,"hotel", filter) != ""):
          line += "|"+predict_area(utt,"hotel")
      if(predict_internet(utt,"hotel", filter) != ""):
          line += "|"+predict_internet(utt,"hotel")
      if(hotel_predictions[i] != "[CLS]"):
          line += f"|hotel-name={hotel_predictions[i]}"
      if(predict_parking(utt,"hotel", filter) != ""):
          line += "|"+predict_parking(utt,"hotel")
      if(predict_pricerange(utt,"hotel", filter) != ""):
          line += "|"+predict_pricerange(utt,"hotel")
      if(predict_stars(utt,"hotel", filter) != ""):
          line += "|"+predict_stars(utt,"hotel")
      if(hotel_predictions[i] == "[CLS]" and predict_type(utt,"hotel", filter) != ""):
          line += "|"+predict_type(utt,"hotel")
  kaggle_ans.append(line)

In [51]:
print(len(kaggle_ans))
import csv
rows = zip(range(len(kaggle_ans)), kaggle_ans)
with open(file_path+'kaggle_tags.csv', "w") as f:
    writer = csv.writer(f)
    writer.writerow(('Id','Expected'))
    for row in rows:
        writer.writerow(row)

400
