# Made in collaboration with Chu Fang, Xiangdi Lin, Rachel Finley, and Dawid Cichoki

In [1]:
from transformers import BertConfig, BertTokenizer, BertForMaskedLM
import json
import os
import torch
from tqdm import tqdm
import random
import numpy as np
from tokenizers import Tokenizer
from torch.optim.lr_scheduler import LinearLR

In [11]:
def save_json(json_file, dst, sub_folder = None):
    """Serialize ``json_file`` to JSON and write it to ``dst`` as UTF-8 text.

    Args:
        json_file: Any object that ``json.dumps`` can serialize.
        dst: Name (or path) of the file to write.
        sub_folder: Optional directory to place ``dst`` in. It is created
            if it does not exist yet. When falsy, ``dst`` is used as given.
    """
    if sub_folder:
        # The parameter is ``sub_folder``; the previous body referenced an
        # undefined ``subfolder`` and raised NameError on this branch.
        os.makedirs(sub_folder, exist_ok=True)
        # os.path.join inserts the separator only when it is missing, so
        # both "out" and "out/" end up as "out/<dst>".
        dst = os.path.join(sub_folder, dst)
    with open(dst, 'w', encoding='utf-8') as file:
        file.write(json.dumps(json_file))

def load_json(src):
    """Parse the UTF-8 encoded JSON file at ``src`` and return its content."""
    with open(src, encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def _find_entity_start(words, entity_chunks):
    """Return the index in ``words`` where the entity begins, or ``None``.

    Words are compared with ``in`` (substring containment) so that an entity
    word still matches a question word carrying punctuation, e.g. "debat"
    matches "debat?".
    """
    span = len(entity_chunks)
    # Preferred: every entity word lines up with consecutive question words.
    for idx in range(len(words) - span + 1):
        if all(chunk in words[idx + k] for k, chunk in enumerate(entity_chunks)):
            return idx
    # Best-effort fallback (the original heuristic): anchor on the first
    # question word that contains the entity's first word.
    for idx, word in enumerate(words):
        if entity_chunks[0] in word:
            return idx
    return None


def entities_to_bio2_tags(questions, bio_tag_mapping = None):
    """Add a word-level ``'bio_tags'`` list to every question dict, in place.

    Each question is lower-cased and split on whitespace; every word gets tag
    0 unless it belongs to one of the question's entities, in which case the
    first entity word gets the 'Beginning' tag and the following ones the
    'Inside' tag.

    Args:
        questions: List of dicts with a ``'question'`` string and an
            ``'entities'`` dict whose values are entity strings (empty or
            whitespace-only values are ignored).
        bio_tag_mapping: Dict with the keys ``'Beginning'`` and ``'Inside'``.
            Defaults to ``{'Inside': 2, 'Beginning': 1}``.

    Returns:
        None. The result is stored under ``'bio_tags'`` in each question dict.
    """
    if bio_tag_mapping is None:
        bio_tag_mapping = {'Inside': 2, 'Beginning': 1}

    for question_dict in questions:
        split_question = question_dict['question'].lower().split()
        bio_tags = [0] * len(split_question)
        for entity in question_dict['entities'].values():
            entity_chunks = entity.lower().split()
            if not entity_chunks:
                # '' and whitespace-only placeholders carry no entity; the
                # latter used to crash on entity_chunks[0].
                continue
            start = _find_entity_start(split_question, entity_chunks)
            if start is None:
                continue
            bio_tags[start] = bio_tag_mapping['Beginning']
            for offset in range(1, len(entity_chunks)):
                # Bounds check is only needed for the fallback anchor, where
                # the entity may run past the end of the question.
                if start + offset < len(bio_tags):
                    bio_tags[start + offset] = bio_tag_mapping['Inside']
        question_dict['bio_tags'] = bio_tags

In [12]:
from transformers import BertConfig

In [13]:
# Load the tokenizer for the "bert-base-uncased" checkpoint.
# NOTE(review): not referenced by any of the cells shown here.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [14]:
# Load the question records (dicts with id / uid / question / entities).
raw_data = load_json("new_3000_edit.json")

In [15]:
# Display the first record to inspect its structure.
raw_data[0]

{'id': 1,
 'uid': 'q-dev-s3q44053',
 'question': 'Over How Many Laps Is The Indianapolis 500 Contested',
 'entities': {'entity_1': 'The Indianapolis 500',
  'entity_2': '',
  'entity_3': ''}}

In [16]:
# Keep the first record as a sample for the exploratory cells that follow.
question = raw_data[0]

In [17]:
# Preview the lower-cased, non-empty entity strings of the sample record.
[x.lower() for x in question['entities'].values() if x !='']

['the indianapolis 500']

In [18]:
# Preview the lower-cased, whitespace-split words of the sample question.
question['question'].lower().split()

['over',
 'how',
 'many',
 'laps',
 'is',
 'the',
 'indianapolis',
 '500',
 'contested']

In [19]:
# Tag every record in place: adds a 'bio_tags' list to each dict in raw_data.
entities_to_bio2_tags(raw_data)

In [20]:
# Spot-check the tagging result on one record (index 2995).
raw_data[2995]

{'id': 2996,
 'uid': 's-train-56dceb899a695914005b9476',
 'question': 'Who took control of the company during the transition from Youlou to Debat?',
 'entities': {'entity_1': 'Youlou', 'entity_2': 'Debat', 'entity_3': ''},
 'bio_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]}

In [35]:
# Persist the tagged records; no sub_folder is given, so the relative path is used as-is.
save_json(raw_data, 'questions_filled_filtered_tagged_2.json')

In [36]:
import pandas as pd


In [37]:
# Reload the tagged records written by save_json as a DataFrame.
df = pd.read_json("questions_filled_filtered_tagged_2.json", encoding = 'utf-8')

In [38]:
# Show the first rows before the ids are renumbered.
df.head()

Unnamed: 0,id,uid,question,entities,bio_tags
0,1,q-dev-s3q44053,Over How Many Laps Is The Indianapolis 500 Con...,"{'entity_1': 'The Indianapolis 500', 'entity_2...","[0, 0, 0, 0, 0, 1, 2, 2, 0]"
1,2,s-train-57324e68b9d445190005ea13,When did women join the Protestant ministry?,"{'entity_1': 'Protestant', 'entity_2': '', 'en...","[0, 0, 0, 0, 0, 1, 0]"
2,3,s-dev-572a1ba46aef051400155292,What can it sometimes take up to 14 years to g...,"{'entity_1': '14 years', 'entity_2': '', 'enti...","[0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0]"
3,4,s-train-56f85519a6d7ea1400e17594,What was also pushed for by result of sarmatism?,"{'entity_1': 'sarmatism', 'entity_2': '', 'ent...","[0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,5,q-train-s3q25163,Who invented dynamite,"{'entity_1': 'dynamite', 'entity_2': '', 'enti...","[0, 0, 1]"


In [39]:

# Shift the index up by one and overwrite 'id' with it.
# Presumably this makes ids run 1..N without gaps; that holds only if the
# index is the default 0-based range, as the head() output suggests.
df.index = df.index + 1
df['id'] = df.index


In [40]:
# Show the first rows again to check the renumbered index and 'id' column.
df.head()

Unnamed: 0,id,uid,question,entities,bio_tags
1,1,q-dev-s3q44053,Over How Many Laps Is The Indianapolis 500 Con...,"{'entity_1': 'The Indianapolis 500', 'entity_2...","[0, 0, 0, 0, 0, 1, 2, 2, 0]"
2,2,s-train-57324e68b9d445190005ea13,When did women join the Protestant ministry?,"{'entity_1': 'Protestant', 'entity_2': '', 'en...","[0, 0, 0, 0, 0, 1, 0]"
3,3,s-dev-572a1ba46aef051400155292,What can it sometimes take up to 14 years to g...,"{'entity_1': '14 years', 'entity_2': '', 'enti...","[0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0]"
4,4,s-train-56f85519a6d7ea1400e17594,What was also pushed for by result of sarmatism?,"{'entity_1': 'sarmatism', 'entity_2': '', 'ent...","[0, 0, 0, 0, 0, 0, 0, 0, 1]"
5,5,q-train-s3q25163,Who invented dynamite,"{'entity_1': 'dynamite', 'entity_2': '', 'enti...","[0, 0, 1]"


In [41]:
# Write the rows out as a JSON array of objects (orient='records').
# NOTE(review): this is the same file name that raw_data was loaded from, so
# the input file is overwritten and now also carries the 'bio_tags' column.
# Confirm this is intended.
df.to_json("new_3000_edit.json", orient='records')