# Preparation
Install necessary packages and import them

In [None]:
!pip install pandas numpy keras datasets transformers torch



In [None]:
# !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

In [None]:
import pandas as pd
import numpy as np
import random

import keras
from datasets import Dataset
from datasets import load_metric
import transformers
from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
import torch
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator
from torch.nn.functional import softmax
from transformers import BertForNextSentencePrediction, BertTokenizer

# # imports the torch_xla package
# import torch_xla
# import torch_xla.core.xla_model as xm

# Load pretrained Model
Check for a GPU and move the model to the detected device.

In [None]:
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
has_gpu = torch.cuda.is_available()
print("GPU" if has_gpu else "CPU")
dev = "cuda:0" if has_gpu else "cpu"
device = torch.device(dev)

# device = xm.xla_device()

GPU


In [None]:
from transformers import pipeline

# Quick sanity check of the GPT-Neo text-generation pipeline.
# The pipeline takes a device index: 0 is the first GPU, -1 is the CPU. Deriving it
# from the runtime keeps this cell usable on CPU-only machines.
generator = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B',
                     device=0 if torch.cuda.is_available() else -1)
generator("EleutherAI has", do_sample=True, min_length=50)

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

In [None]:
# Load a pretrained BERT model together with its matching tokenizer.
# BertForNextSentencePrediction / BertTokenizer need a BERT checkpoint: pointing them
# at 'EleutherAI/gpt-neo-1.3B' (a gpt_neo model) makes this cell fail with a ValueError.
# NOTE(review): this checkpoint was fine-tuned for SQuAD, so its next-sentence head is
# presumably newly initialised until the fine-tuning section has been run - confirm,
# or switch to a checkpoint that ships a pretrained next-sentence head.
MODEL_NAME = 'bert-large-cased-whole-word-masking-finetuned-squad'
model = BertForNextSentencePrediction.from_pretrained(MODEL_NAME)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# move the weights to the device selected above
model.to(device)

You are using a model of type gpt_neo to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


ValueError: ignored

# Load Data

In [None]:
# Training data: read the tab-separated file and wrap it in a datasets.Dataset
df_train = pd.read_csv("training.tsv", sep="\t")
dataset_train = Dataset.from_pandas(df_train)
# show the dataset summary followed by its first row
print(dataset_train, dataset_train[0])

In [None]:
# Test data: the file has no header row, so name its first column 'query'
df_test = pd.read_csv('test_set.tsv', sep='\t', header=None)
df_test.rename(columns={0: 'query'}, inplace=True)
dataset_test = Dataset.from_pandas(df_test)
# show the dataset summary followed by its first row
print(dataset_test, dataset_test[0])

In [None]:
# Question bank: read the tab-separated file and wrap it in a datasets.Dataset
df_quest = pd.read_csv("question_bank.tsv", sep="\t")
dataset_quest = Dataset.from_pandas(df_quest)
# show the dataset summary followed by its first row
print(dataset_quest, dataset_quest[0])

In [None]:
# Load previously saved answers, if there are any.
# answer.txt is only written by saveAnswer() at the end of this notebook, so on a
# fresh run the file does not exist yet; a missing file must not abort "Run All".
try:
    df_ans = pd.read_csv("answer.txt", sep='\t', header=None)
except FileNotFoundError:
    df_ans = None
    dataset_ans = None
    print("answer.txt not found - run the prediction section first")
else:
    # the file has no header row: column 0 is the query, column 1 the question ids
    df_ans.rename(columns={0: 'query', 1: 'questions'}, inplace=True)
    dataset_ans = Dataset.from_pandas(df_ans)
    print(dataset_ans, dataset_ans[0])

# Fine tuning the Model
Skip this entire section to use the pretrained model without fine-tuning.

## Create and preprocess Inputs for Fine Tuning


In [None]:
# number of rows (query / clarifying-question pairs) in the training set
num_querys = len(dataset_train)

### Creating positive samples
To create positive samples, use the complete training set: every query is paired with its own clarifying question.



In [None]:
# Input lists and labels, starting with the positive pairs:
# every training query (sentence A) with its own clarifying question (sentence B).
sentence_a = [row["query"] for row in dataset_train]
sentence_b = [row["clarifying_question"] for row in dataset_train]
# label 0 marks a pair that belongs together
label = [0] * len(sentence_a)


### Creating negative samples
To create the negative samples, pair each query in the training set with a random clarifying question that does not belong to that query.

In [None]:
# Negative pairs: every query together with the clarifying question of a *different* query
NEGATIVE_SAMPLING_SEED = 42  # fixed seed so the sampled negatives are reproducible
negative_rng = random.Random(NEGATIVE_SAMPLING_SEED)

# The rejection sampling below only terminates if some row has a different query
if len({row["query"] for row in dataset_train}) == 1:
    raise ValueError("need at least two distinct queries to draw negative samples")

for x in dataset_train:
    # redraw until the sampled row belongs to another query
    tmp = dataset_train[negative_rng.randrange(num_querys)]
    while tmp["query"] == x["query"]:
        tmp = dataset_train[negative_rng.randrange(num_querys)]
    sentence_a.append(x["query"])
    sentence_b.append(tmp["clarifying_question"])
    # label 1 marks a pair that does not belong together
    label.append(1)

### Tokenize created inputs

In [None]:
# Tokenize the sentence pairs; each pair is padded or truncated to 256 tokens
inputs = tokenizer(
    sentence_a,
    sentence_b,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

### provide Labels in the inputs

In [None]:
# one label per sample, stored as a column tensor of shape (num_samples, 1)
inputs['next_sentence_label'] = torch.tensor(label, dtype=torch.long).unsqueeze(1)

### Masked-Language Modeling(MLM)

In [None]:
# Keep an unmasked copy of the token ids as MLM labels. The copy is taken only once,
# so re-running this cell never stores already-masked ids as labels.
# NOTE(review): BertForNextSentencePrediction presumably has no masked-LM head, so
# these labels only matter if the model class is switched (e.g. to BertForPreTraining)
# - confirm they are still needed.
if 'labels' not in inputs:
    inputs['labels'] = inputs.input_ids.detach().clone()
original_ids = inputs['labels']

# Special tokens must never be masked. Their ids come from the tokenizer instead of
# the hard-coded values 101, 102 and 0.
maskable = (original_ids != tokenizer.cls_token_id) & \
           (original_ids != tokenizer.sep_token_id) & \
           (original_ids != tokenizer.pad_token_id)

#create mask array: roughly 15% of the maskable positions, chosen at random
rand = torch.rand(original_ids.shape)
mask_arr = (rand < 0.15) & maskable

#apply the selection in one vectorised step, always starting from the unmasked ids
inputs['input_ids'] = original_ids.masked_fill(mask_arr, tokenizer.mask_token_id)

### Initialize DataLoader

In [None]:
#create own dataset Class
class OurDataset(torch.utils.data.Dataset):
    """Expose a mapping of per-sample features as a map-style torch dataset.

    Args:
        encodings: mapping from feature name to a tensor (or sequence) that is
            indexed by sample; it must contain the key 'input_ids'.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return sample `idx` as a dict holding one independent tensor per feature."""
        return {key: self._copy(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of samples, taken from the length of 'input_ids'."""
        # key access works for plain dicts as well as for tokenizer outputs
        return len(self.encodings['input_ids'])

    @staticmethod
    def _copy(value):
        """Copy `value` into a new tensor that shares no memory with its source."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning; clone instead
            return value.clone().detach()
        return torch.tensor(value)

In [None]:
# Wrap the tokenized (and masked) inputs in our dataset class
dataset = OurDataset(encodings=inputs)

In [None]:
# DataLoader serving shuffled mini-batches of 32 samples
loader = torch.utils.data.DataLoader(
    dataset,
    shuffle=True,
    batch_size=32,
)

## Train Model

In [None]:
#Setup training
# activate training mode
model.train()
# initialize optimizer
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are pinned to the defaults of the transformers implementation
# (1e-6 and 0.0) because torch's own defaults differ.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
#Training Loop
from tqdm.notebook import tqdm  # for our progress bar

epochs = 3

for epoch in range(epochs):
    # setup loop with TQDM and dataloader; the epoch label is set once per epoch
    loop = tqdm(loader, leave=True, desc=f'Epoch {epoch}')
    for batch in loop:
        # reset the gradients accumulated in the previous step
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        # BertForNextSentencePrediction takes the next-sentence targets through
        # `labels`; `next_sentence_label` is only a deprecated alias for it. The MLM
        # token ids in batch['labels'] are not valid targets for this model and are
        # therefore no longer passed.
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        labels=next_sentence_label)
        # extract loss
        loss = outputs.loss
        # backpropagate to get the gradient of every trainable parameter
        loss.backward()
        # update parameters
        optim.step()
        # show the current loss in the progress bar
        loop.set_postfix(loss=loss.item())

# Predictions


### Single prediction function
Takes a query and a question and returns a score for how well the question fits the query.

In [None]:
#get the probability of how well the question fits the query
def getPrediction(query, question):
  """Score how well `question` follows `query` using the next-sentence model.

  Args:
    query: the user query (sequence A).
    question: the candidate clarifying question (sequence B).

  Returns:
    float: softmax probability of class 0 (sequence B continues sequence A).
  """
  # the fine-tuning section leaves the model in training mode; switch to
  # evaluation mode so the score is not perturbed by dropout
  if model.training:
    model.eval()

  # encode the two sequences as ONE paired input (question is the text pair);
  # over-long pairs are truncated instead of failing inside the model
  encoded = tokenizer(query, question, return_tensors='pt', truncation=True)
  encoded = encoded.to(device)

  # inference only: do not record operations for autograd
  with torch.no_grad():
    # the first item of the model output holds the sequence-relationship logits
    seq_relationship_logits = model(**encoded)[0]

  # we still need softmax to convert the logits into probabilities
  # index 0: sequence B is a continuation of sequence A
  # index 1: sequence B is a random sequence
  probs = softmax(seq_relationship_logits, dim=1)
  return float(probs[0][0])

In [None]:
# Print every query / clarifying-question pair that scores below 0.6
# Note: this function is not used
def getPredictions(data):
  """Print each sample of `data` whose score is under 0.6, followed by that score."""
  scored = ((sample, getPrediction(sample["query"], sample["clarifying_question"]))
            for sample in data)
  for sample, score in scored:
    if score < 0.6:
      print(sample, score)


### Get Top 50 questions
Get the top 50 clarification questions from the question bank for the given query<br />
returns a list of Strings. For example `['Q01015','Q02316','Q00654','Q02389','Q02205', ... ]`

In [None]:
#Get top 50 suggested questions to a query
def getTop50Questions(query, top_k=50):
    """Rank the whole question bank against `query` and return the best question ids.

    Args:
        query: the query text every question of `dataset_quest` is scored against.
        top_k: number of question ids to return (defaults to 50).

    Returns:
        list[str]: at most `top_k` question ids, best score first.
    """
    # one score per question id; a repeated id keeps the score of its last occurrence
    predictions = {
        quest["question_id"]: getPrediction(query, quest["question"])
        for quest in dataset_quest
    }
    ranked = sorted(predictions.items(), key=lambda item: item[1], reverse=True)
    return [question_id for question_id, _score in ranked[:top_k]]

    

### Function to predict a list of querys
Takes a dataset in the format of `dataset_test` as input. <br />
Returns a list of lists containing the top 50 questions for every query.


In [None]:
#predictAll for test set
def predictAll(querys):
  """Return the top-50 question ids for every query, printing progress per query.

  `querys` must provide `num_rows` and yield rows that have a "query" entry.
  """
  total = querys.num_rows
  results = []
  for position, entry in enumerate(querys, start=1):
    print(f"{position}/{total} Next")
    results.append(getTop50Questions(entry["query"]))
  return results


### The actual prediction
calling the above function

In [None]:
# Rank the question bank for every test query; res[i] holds the ids for test row i
res = predictAll(querys=dataset_test)

### Saving the results
Write the predictions together with their corresponding query to `answer.txt`.

In [None]:
#Saving predicted clarification questions with their corresponding query
def saveAnswer(res, path='answer.txt'):
  """Write one line per test query: the query, a tab, then its comma-joined ids.

  Args:
    res: list of question-id lists; res[i] belongs to dataset_test[i].
    path: output file, defaults to 'answer.txt'.
  """
  # explicit UTF-8 so non-ASCII queries are written the same way on every platform
  with open(path, 'w', encoding='utf-8') as f:
    for i, question_ids in enumerate(res):
      f.write(dataset_test[i]["query"] + "\t%s\n" % ",".join(question_ids))

In [None]:
# persist the predictions computed for the test set
saveAnswer(res=res)

# Extras
Loop to keep Colab busy, which delays the runtime shutdown.

In [None]:
# Keep-alive loop: repeats the full prediction 1000 times so the Colab runtime stays
# "active" and is shut down later than it would be when idle.
keepalive_round = 0
while keepalive_round < 1000:
  res1 = predictAll(dataset_test)
  keepalive_round += 1

In [None]:
# Idle function, to be "active" on colab. Prevents runtime shutdown(at least for a longer period as normal)
# for i in range(1000):
#   res1 = predictAll(dataset_test)