In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [29]:
import os
from pathlib import Path
import transformers
from torch.utils.data import DataLoader, RandomSampler, DistributedSampler, SequentialSampler
import repo_training_codellm.FiD.src.data as src_data

In [4]:
# PYTHONPATH must already be set to the root of all code repos, e.g. '/home/toolkit/code/'
os.environ['PYTHONPATH']

'/home/toolkit/code/'

In [53]:
# Ad-hoc options object: an instance of an empty class named "opt" whose
# attributes are read by the later cells (collator, dataset, sampler, loader).
opt = type("opt", (object, ), {})()
vars(opt).update(
    passage_maxlength=200,
    question_maxlength=40,
    train_data='/repo_data/open_domain_data/NQ/test.json',
    n_context=3,
    per_gpu_batch_size=2,
    is_distributed=False,
)

In [9]:
# Pretrained 'bert-base-uncased' fast tokenizer; handed to the RetrieverCollator built in a later cell.
tokenizer = transformers.BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [33]:
# Batch collator built from the tokenizer and the length limits configured on `opt`.
collator_function = src_data.RetrieverCollator(
    tokenizer,
    question_maxlength=opt.question_maxlength,
    passage_maxlength=opt.passage_maxlength,
)

In [21]:
# Load the examples from the file configured in opt.train_data.
train_examples = src_data.load_data(opt.train_data)

In [61]:
# Number of loaded examples.
len(train_examples)

3610

In [54]:
# Wrap the loaded examples in the dataset class, passing opt.n_context as the second argument.
# The module is imported only under the alias `src_data`; the bare name `src` is not
# imported in the visible cells, so `src.data.Dataset` fails with a NameError on a fresh kernel.
train_dataset = src_data.Dataset(train_examples, opt.n_context)

In [55]:
# Inspect the first dataset item.
train_dataset[0]

{'index': 0,
 'question': 'question: who got the first nobel prize in physics',
 'target': 'Wilhelm Conrad Röntgen </s>',
 'passages': ['title: Nobel Prize in Physics context: Nobel Prize in Physics The Nobel Prize in Physics () is a yearly award given by the Royal Swedish Academy of Sciences for those who have made the most outstanding contributions for mankind in the field of physics. It is one of the five Nobel Prizes established by the will of Alfred Nobel in 1895 and awarded since 1901; the others being the Nobel Prize in Chemistry, Nobel Prize in Literature, Nobel Peace Prize, and Nobel Prize in Physiology or Medicine. The first Nobel Prize in Physics was awarded to physicist Wilhelm Röntgen in recognition of the extraordinary services he',
  'title: Nobel Prize context: His son, George Paget Thomson, received the same prize in 1937 for showing that they also have the properties of waves. William Henry Bragg and his son, William Lawrence Bragg, shared the Physics Prize in 1915 fo

In [35]:
# Choose the sampler according to opt.is_distributed.
if opt.is_distributed:
    train_sampler = DistributedSampler(train_dataset)
else:
    train_sampler = RandomSampler(train_dataset)

# Loader over the training dataset; incomplete final batches are dropped and
# loading runs in the main process (num_workers=0).
train_dataloader = DataLoader(
    train_dataset,
    collate_fn=collator_function,
    batch_size=opt.per_gpu_batch_size,
    sampler=train_sampler,
    num_workers=0,
    drop_last=True,
)

In [36]:
# Pull one collated batch from the loader. Per the original note, a batch is the tuple
# (index, question_ids, question_mask, passage_ids, passage_masks, scores)
batch = next(iter(train_dataloader))



In [65]:
# Collate the first three dataset items by hand (rebinds `batch`).
batch = collator_function([train_dataset[idx] for idx in range(3)])

In [59]:
# Shape of element 5 of the batch tuple.
# NOTE(review): the saved output below ([2, 3, 200]) does not match the 3x3 tensor shown
# for batch[5] in the next cell; it looks like a stale result from an earlier run, so re-run to confirm.
batch[5].shape

torch.Size([2, 3, 200])

In [66]:
# Display element 5 of the batch tuple (`scores`, according to the tuple layout noted in the DataLoader batch cell).
batch[5]

tensor([[1.0000, 1.0000, 1.0000],
        [0.5000, 0.5000, 0.5000],
        [0.3333, 0.3333, 0.3333]])

In [67]:
# Question text of dataset item 0.
train_dataset[0]['question']

'question: who got the first nobel prize in physics'

In [68]:
# Question text of dataset item 1.
train_dataset[1]['question']

'question: when is the next deadpool movie being released'

In [69]:
# Question text of dataset item 2.
train_dataset[2]['question']

'question: which mode is used for short wave broadcast service'

In [9]:
import json
import pickle
from generate_hole_and_rule_contexts_mod import get_hole_context, get_default_prompt
from transformers import GPT2TokenizerFast
from context import *

# GPT-2 tokenizer passed to getContext in the per-repo loop below.
# NOTE(review): this rebinds `tokenizer`, which an earlier cell set to the BERT tokenizer;
# re-running the collator cell after this one would pick up the GPT-2 tokenizer instead.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


# Root directory of the preprocessed repository data, and the split to process.
base_dir = '/repo_data/repo_preprocessed_data/'
split = 'train'

def create_file_line_char_mapping(repo):
    """Build a lookup from ``"<key>_<line>"`` to the value stored for that line.

    Reads the pickled ``hole_data`` file under ``base_dir/split/repo``. The
    unpickled object is iterated as a mapping whose values are sequences of
    2-tuples ``(l, c)``; every pair becomes one entry ``key + '_' + str(l) -> c``.
    When the same key/line combination occurs more than once, the last pair wins.

    Args:
        repo: Name of the repository directory under ``base_dir/split``.

    Returns:
        dict mapping ``"<key>_<l>"`` strings to the corresponding ``c`` values.
    """
    # NOTE(review): this reads from `split`, while the main loop lists repos under
    # 'medium_' + split; confirm that the two directory trees are meant to differ.
    hole_data_path = os.path.join(base_dir, split, repo, 'hole_data')
    # Close the file deterministically instead of leaking the handle.
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    with open(hole_data_path, 'rb') as hole_data_file:
        data = pickle.load(hole_data_file)
    return {
        k + '_' + str(l): c
        for k, v in data.items()
        for (l, c) in v
    }

def get_file_and_hole_pos(hole):
    """Split a hole identifier into its file path and its position.

    The last path component of ``hole`` is split on ``'_'``; its final two
    pieces are parsed as integers and returned as the position. The remaining
    pieces form the file name: re-joined with ``'_'`` when there are more than
    three pieces in total, otherwise just the first piece.

    Args:
        hole: Identifier string of the form ``<dir>/.../<file>_<int>_<int>``.

    Returns:
        Tuple ``(file, (first_int, second_int))`` where ``file`` is the
        directory part of ``hole`` joined with the recovered file name.
    """
    *dir_parts, leaf = hole.split('/')
    tokens = leaf.split('_')
    # File names may themselves contain underscores; only the last two tokens are position data.
    stem = '_'.join(tokens[:-2]) if len(tokens) > 3 else tokens[0]
    file = '/'.join(dir_parts + [stem])
    pos = (int(tokens[-2]), int(tokens[-1]))
    return file, pos
    

# For every repo under base_dir/'medium_' + split, read hole_and_rule_contexts.json
# (one JSON object per line) and write char_hole_and_rule_contexts.json, in which each
# entry's id, question, target, answers and ctxs[16]['text'] are recomputed.
repos = os.listdir(os.path.join(base_dir, 'medium_' + split))
for repo in repos:
    print(repo)
    file_line_char_mapping = create_file_line_char_mapping(repo)
    repo_dir = os.path.join(base_dir, 'medium_' + split, repo)
    # Context managers close both files even if an entry raises midway; the output is
    # opened first, as before, so it is truncated before the input is read.
    with open(os.path.join(repo_dir, "char_hole_and_rule_contexts.json"), "w") as f_out, \
            open(os.path.join(repo_dir, 'hole_and_rule_contexts.json')) as f_in:
        lines = f_in.readlines()
        assert len(lines) <= 10000
        for line in lines:
            entry = json.loads(line)
            hole_id = entry['id']
            file_name, hole_pos = get_file_and_hole_pos(hole_id)
            # Keep the line from the id, but replace its second component with the
            # value recorded for that file/line in the repo's hole_data.
            hole_line = hole_pos[0]
            line_char = file_line_char_mapping[file_name + '_' + str(hole_line)]
            new_hole_pos = (hole_line, line_char)
            entry['id'] = file_name + '_' + str(new_hole_pos[0]) + '_' + str(new_hole_pos[1])
            hole_context, target = get_hole_context(file_name, new_hole_pos)
            entry['question'] = hole_context
            entry['target'] = target
            entry['answers'] = [target]
            # NOTE(review): getContext is presumably provided by `from context import *`; confirm.
            default_context_obj = getContext(
                context_location='in_file',
                tokenizer=tokenizer,
                file=file_name,
                context_len=4072,
                context_scope='pre',
                context_type='lines',
                top_k=-1,
            )
            # NOTE(review): slot 16 of entry['ctxs'] is hard-coded; confirm it is the default-prompt slot.
            entry['ctxs'][16]['text'] = get_default_prompt(new_hole_pos, default_context_obj)
            f_out.write(json.dumps(entry))
            f_out.write("\n")
            # Flush per entry so partial progress is on disk if a later entry fails.
            f_out.flush()

SeanDecker1


Token indices sequence length is longer than the specified maximum sequence length for this model (1027 > 1024). Running this sequence through the model will result in indexing errors


ValeriyKnyazhev
My-DIGI-ID
CDZR0
TreeZhiyuan
slickqa
VladRomanchuk
baishuo
david2999999
eddiewgj
JDode
android-little-boy
leshiv
zli78122
kenichi-ando
sistcoop
DieguinhoHR
smallxiongxiong
DeyanZhelyazkov
wuxinlingluan
java-ea
opentok
MFunction96
MfromAzeroth
jannal
google
pengcash
FLxmw
akash-coded
tacticalrce
ot-maksim
DwArFeng
ToreAad
fengpod
dongjihui666
Manolomon
mariodavid
AvaN0x
HRI-EU
