In [24]:

## How tagger input is built?

"""
    This notebook loads the squad_v2 dataset and converts it to 
    CoNLL-U format. The first token of each answer phrase is tagged 
    B-ANS and the remaining answer tokens are tagged I-ANS. All 
    other tokens are tagged O.
"""

## How Parser input is built?

"""
    For parser input, the start of our answer has the end of our 
    answer as its head, i.e. the B-ANS token points to the last 
    token of the answer phrase. Every other token, including the 
    I-ANS tokens, keeps head 0. So we are predicting the answer as 
    a phrase in tagging, and also predicting it in the parser. 
    Though in theory, the tagger alone should suffice for this. 
"""



'\n    For parser input, the end of out answer will have head \n    as start of our answer, that means last I-ANS will have\n    B-ANS as head of it, and the entire phrase will be connected\n    in that way. So we are predicting answer as a phrase in \n    tagging, and also predicting it in parser. \n    Though in theory, only tagger should work for the same. \n'

In [25]:
## imports
from datasets import load_dataset
from mtrfg.utils import write_text, make_dir, load_json
from tqdm import tqdm
import spacy, re, string, random, os

In [26]:
## spaCy pipeline used for tokenisation; its default tokenizer is swapped for a bare
## Tokenizer that only knows the suffix rule below (no prefix/infix/exception rules)
## NOTE(review): the pattern is not anchored to the end of the token, while spaCy's own
## suffix patterns are -- confirm that matching anywhere in the token is intended
suffix_pattern = re.compile(r'''\.|\,|\;|\(|\)|\$''')
nlp = spacy.load('en_core_web_md')
nlp.tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab, suffix_search = suffix_pattern.search)

In [27]:
## fetch SQuAD v2 (all available splits) from the huggingface hub
dataset_name = 'squad_v2'
dataset = load_dataset(path = dataset_name)

Reusing dataset squad_v2 (/user/d.bhatt/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)
100%|██████████| 2/2 [00:00<00:00, 1107.11it/s]


In [28]:
## write a function to turn a question, context and answer into a CoNLLU format. 
def get_que_ans_in_conllu(data_point, tokenizer=None):
    """
        Turn one SQuAD-style datapoint into a CoNLL-U formatted string.

        The emitted token sequence is:
        1. the tokenized question
        2. a literal [SEP] token
        3. the tokenized context, with the answer span tokenized separately
        (no [CLS] token is emitted).

        Every token becomes one tab-separated line: 1-based id, token, then the
        tag in column 5, the head in column 7 and the edge label in column 8.
        The first answer token is tagged B-ANS, has edge 'e' and has the 1-based
        id of the last answer token as its head. The remaining answer tokens are
        tagged I-ANS. Every other token is tagged O with head 0 and edge 'root'.

        Args:
            data_point: mapping with 'question', 'context' and 'answers' keys,
                where 'answers' holds parallel 'text' and 'answer_start' lists.
                Only the first answer is used. The mapping is NOT modified.
            tokenizer: optional callable taking a string and returning an iterable
                of objects with a `.text` attribute. Defaults to the notebook-level
                spaCy pipeline `nlp`.

        Returns:
            The CoNLL-U lines joined by newlines, or None when the datapoint
            cannot be converted (no answer, empty answer, answer span glued to
            neighbouring characters so the mask is not a token of its own, ...).
            Such datapoints are meant to be filtered out by the caller.
    """
    if tokenizer is None:
        tokenizer = nlp  ## notebook-level spaCy pipeline

    printable = set(string.printable)

    def keep_printable(text):
        ## drop every character that is not in string.printable
        return ''.join(char for char in text if char in printable)

    def tokenize(text):
        ## whitespace-only tokens are never written out, so drop them right away;
        ## otherwise they would shift the head index of the answer
        return [word.text for word in tokenizer(text) if word.text.strip()]

    try:
        raw_answer = data_point['answers']['text'][0]
        ans_start = data_point['answers']['answer_start'][0]
        raw_context = data_point['context']

        ## a negative offset would silently slice from the end of the context
        if ans_start < 0:
            return None
        ans_end = ans_start + len(raw_answer)

        ## mask the answer on the RAW context: answer_start is an offset into the
        ## unfiltered text, so unprintable characters must be removed only after
        ## the span has been located
        masked_context = keep_printable(raw_context[:ans_start]) + "ANSWERMASK" + keep_printable(raw_context[ans_end:])
        question = keep_printable(data_point['question'])
        answer = keep_printable(raw_answer)

        ## let's space out all the punctuations
        for punc in string.punctuation:
            masked_context = masked_context.replace(punc, f' {punc} ')
            question = question.replace(punc, f' {punc} ')

        ## tabs/newlines etc. become a plain space; deleting them outright would
        ## glue the neighbouring words (or the answer mask) together
        for ws in string.whitespace[1:]:
            masked_context = masked_context.replace(ws, ' ')
            question = question.replace(ws, ' ')

        ## tokenize question, masked context and answer
        que_tokens, context_tokens, ans_tokens = tokenize(question), tokenize(masked_context), tokenize(answer)

        ## nothing to tag, e.g. the answer was empty or purely unprintable
        if not ans_tokens:
            return None

        ## locate the mask inside the context only; raises ValueError (-> None)
        ## when the mask did not survive as a token of its own
        mask_pos = context_tokens.index('ANSWERMASK')

        ## 0-based position of the first answer token in the final sequence
        ans_index = len(que_tokens) + 1 + mask_pos
        ## heads are 1 indexed, so this is the id of the last answer token
        ans_head = ans_index + len(ans_tokens)

        ## final token sequence with the mask replaced by the real answer tokens
        tokens = que_tokens + ['[SEP]'] + context_tokens[:mask_pos] + ans_tokens + context_tokens[mask_pos + 1:]

        ## defaults for every token, then overwrite the answer span
        tags, edges, heads = ['O'] * len(tokens), ['root'] * len(tokens), [0] * len(tokens)
        tags[ans_index:ans_head] = ['B-ANS'] + ['I-ANS'] * (len(ans_tokens) - 1)
        edges[ans_index] = 'e'
        heads[ans_index] = ans_head

        return '\n'.join(
            f'{i}\t{token}\t_\t_\t{tag}\t_\t{head}\t{edge}\t_\t_'
            for i, (token, tag, edge, head) in enumerate(zip(tokens, tags, edges, heads), start=1)
        )

    except Exception:
        ## malformed datapoints are dropped on purpose; unlike a bare except this
        ## still lets KeyboardInterrupt / SystemExit through
        return None


In [7]:
## get all squad question/answers in CoNLLU format (None marks a datapoint that could not be converted)
squad_train_conllu = [get_que_ans_in_conllu(datapoint) for datapoint in tqdm(dataset['train'])]
squad_validation_conllu = [get_que_ans_in_conllu(datapoint) for datapoint in tqdm(dataset['validation'])]

## let's filter out None
squad_train_conllu = [train_point for train_point in squad_train_conllu if train_point is not None]
squad_validation_conllu = [val_point for val_point in squad_validation_conllu if val_point is not None]

## squad has no test split here, so carve one of the same size as validation out of train.
## A dedicated seeded RNG keeps the split reproducible across kernel restarts
## without touching the global random state.
SQUAD_SPLIT_SEED = 42
random.Random(SQUAD_SPLIT_SEED).shuffle(squad_train_conllu)

## explicit split index: slicing with -len(...) inverts when validation is empty,
## because [-0:] is the whole list and [:-0] is empty
squad_split_index = max(len(squad_train_conllu) - len(squad_validation_conllu), 0)

## test dataset
squad_test_conllu = squad_train_conllu[squad_split_index:]
squad_train_conllu = squad_train_conllu[:squad_split_index]


  1%|          | 707/130319 [00:17<49:34, 43.58it/s]  

In [26]:
## save the dataset
# squad_data_dir = '/data/Multitask_RFG/squad_conllu_dataset_without_CLS/' ## data directory
# make_dir(squad_data_dir) 
# train_file, test_file, val_file = os.path.join(squad_data_dir, 'train.conllu'), os.path.join(squad_data_dir, 'test.conllu'), os.path.join(squad_data_dir, 'dev.conllu')

# ## saving
# write_text(train_file, '\n\n'.join(squad_train_conllu))
# write_text(test_file, '\n\n'.join(squad_test_conllu))
# write_text(val_file, '\n\n'.join(squad_validation_conllu))

In [29]:
## time to build recipe QA dataset
def buildDataPoint(question, recipe, answer):
    """
        Pack a (question, recipe, answer) triple into the SQuAD-like dict layout
        consumed by the CoNLL-U conversion.

        All three strings are lowercased first. The answer offset is the first
        occurrence of the lowercased answer inside the lowercased recipe; an
        AssertionError is raised when the answer does not occur in the recipe.

        Returns a dict with keys 'question', 'answers' (holding single-element
        'text' and 'answer_start' lists) and 'context'.
    """
    ## everything lowercase must
    q_lower, ctx_lower, ans_lower = (text.lower() for text in (question, recipe, answer))

    ## character span of the answer inside the recipe
    start = ctx_lower.find(ans_lower)
    stop = start + len(ans_lower)
    assert ctx_lower[start: stop] == ans_lower

    return {
        'question': q_lower,
        'answers': {
            'text': [ans_lower],
            'answer_start': [start],
        },
        'context': ctx_lower,
    }

In [30]:
## read the annotation file; everything of interest sits under its 'database' key
filePath = '/user/d.bhatt/video_summarization_in_text/youcook2/youcookii_annotations_trainval.json'
recipeQA_data = load_json(filePath)['database']

## parallel lists: one entry per (question, answer, context) triple
questions, answers, context = [], [], []

for videoID, video_entry in recipeQA_data.items():
    video_segments = video_entry['segments']
    for segment in video_segments:
        ## the segment key is the answer, the values stored under it are its questions
        for question in video_segments[segment]:
            recipe = video_entry['context']
            ## skip triples where any of the three fields is not a string
            if not all(isinstance(field, str) for field in (question, recipe, segment)):
                continue
            questions.append(question)
            answers.append(segment)
            context.append(recipe)

In [31]:
## sanity check: print every collected triple that contains a non-string field
for question, recipe, answer in zip(questions, context, answers):
    fields_are_text = all(isinstance(field, str) for field in (question, recipe, answer))
    if not fields_are_text:
        print(question, recipe, answer)

In [32]:
## wrap every (question, recipe, answer) triple into a SQuAD-like datapoint
qa_triples = zip(questions, context, answers)
dataDict = [buildDataPoint(*triple) for triple in tqdm(qa_triples)]

## convert each datapoint to CoNLLU; datapoints that fail come back as None and are dropped
converted_points = (get_que_ans_in_conllu(dataPoint) for dataPoint in tqdm(dataDict))
dataConLLU = [conllu_text for conllu_text in converted_points if conllu_text is not None]

53207it [00:00, 171132.61it/s]
100%|██████████| 53207/53207 [13:29<00:00, 65.72it/s]


In [33]:
### train, validation, test splits! 70% train, 15% validation, 15% test
trainFrac, testFrac, valFrac = 70, 15, 15

## shuffle with a dedicated seeded RNG so the split is reproducible on a fresh
## kernel and the global random state is left alone
RECIPE_SPLIT_SEED = 42
random.Random(RECIPE_SPLIT_SEED).shuffle(dataConLLU)

## indices to split the data: [0, train_index) is train, [train_index, test_index)
## is validation and [test_index, end) is test
train_index, test_index = len(dataConLLU) * trainFrac // 100, len(dataConLLU) * (trainFrac + valFrac) // 100 

## train, validation, test splits!
trainData, valData, testData = dataConLLU[:train_index], dataConLLU[train_index:test_index], dataConLLU[test_index:]

In [34]:
## output directory for the recipe QA dataset
recipe_data_dir = '/data/Multitask_RFG/recipeQAGPTdataset/' ## data directory
make_dir(recipe_data_dir) 

## one .conllu file per split (the validation split is stored as dev.conllu)
train_file, test_file, val_file = (os.path.join(recipe_data_dir, file_name) for file_name in ('train.conllu', 'test.conllu', 'dev.conllu'))

## saving: sentences are separated by a blank line
for out_path, sentences in ((train_file, trainData), (test_file, testData), (val_file, valData)):
    write_text(out_path, '\n\n'.join(sentences))

In [35]:
## number of sentences in the train, test and validation splits (in that order)
print(*map(len, (trainData, testData, valData)))

37244 7982 7981
