In [1]:
# construct training data for reranker

In [2]:
# 模拟alpaca训练数据格式

In [3]:
import json
import datasets

In [4]:
data_path = '/home1/cxy/tart/rankgpt/data/marco-train-100k.jsonl'
data = [json.loads(line) for line in open(data_path)]
print('Num of training samples', len(data))

Num of training samples 100000


In [5]:
permutation = '/home1/cxy/tart/rankgpt/data/marco-train-100k-gpt3.5.json'
response = json.load(open(permutation))
print('Num of permutation samples', len(response))

Num of permutation samples 100000


In [42]:
data[28961]

{'query': 'define:semordnilap',
 'query_id': '129844',
 'positive_passages': [{'docid': '7817031',
   'title': '-',
   'text': ' desserts  /  stressed .  diaper  /  repaid 1  . A word, phrase, or sentence that has the property of forming another word, phrase, or sentence when its letters are reversed. 2  A semordnilap differs from a palindrome in that the word or phrase resulting from the reversal is different from the original word or phrase.'}],
 'negative_passages': [],
 'retrieved_passages': []}

In [43]:
response[28961]

"I'm sorry, but you have not provided any passages for me to rank. Please provide the passages so I can rank them based on their relevance to the search query."

In [8]:
# 1.  设计
# 	1. 总体思路：沿用RankGPT设置
# 	2. 模型输入：instruction+query+candidate passage
# 	3. 模型输出：序号列表
# 	4. 监督信号：GPT3.5
# 	5. 输入passage数量：10
# 	6. eval_size：4000
# 2. 细节实现
# 	1. 加入positive passage后如何安排序号？随机插入序列
# 	2. 输入的passage序列是否包含顺序信息？数据包含两部分信息：1）从20个retrieved_passages里随机挑选9个与positive不同的文档 2）更新gpt3.5排序信息，从rank映射到id（或者其他更容易查询的形式） 3）对挑选出的样本构成两类数据（1. 随机顺序，2.BM25顺序）先构造BM25顺序
# 	3. 输入10个文档后长度超过限制怎么办？1）计算构造好数据的token数量 2）如果未超过，则跳过 3）如果超过最大长度（2048），则计算文档去除instruction后的最大平均长度，对超过平均长度的文档截取到最大平均长度
# 3. 输出格式
# 	1. 保存为json格式
# 	2. 每个样本包含：1）instruction (query) 2）input (candidate passage list) 3）num (passage number) 4）label list (like [5] > [3] > [8] > [7] > [2] > [1] > [6] > [9] > [10] > [11])

In [9]:
# 构造BM25和GPT3.5结果的id list
def construct_order_id_list(data, i, label=False):
    id_list = []
    positive_passage = data[i]['positive_passages'][0]
    query = data[i]['query']
    query_id = data[i]['query_id']
    retrieved_passages = data[i]['retrieved_passages']
    if label:
        id_list.append(positive_passage['docid'])
    for item in retrieved_passages:
        if label and item['docid'] == positive_passage['docid']:
            continue
        id_list.append(item['docid'])
    passage_dict = {}
    for passage in retrieved_passages:
        passage_dict[passage['docid']] = passage['text']
    # add positive passage
    passage_dict[positive_passage['docid']] = positive_passage['text']
    return id_list, query_id, query, positive_passage, passage_dict

In [10]:
# from original rankgpt code
def receive_response(data, responses):
    def clean_response(response: str):
        new_response = ''
        for c in response:
            if not c.isdigit():
                new_response += ' '
            else:
                new_response += c
        new_response = new_response.strip()
        return new_response

    def remove_duplicate(response):
        new_response = []
        for c in response:
            if c not in new_response:
                new_response.append(c)
        return new_response

    new_data = []
    for item, response in zip(data, responses):
        response = clean_response(response)
        response = [int(x) - 1 for x in response.split()]
        response = remove_duplicate(response)
        passages = item['retrieved_passages']
        original_rank = [tt for tt in range(len(passages))]
        response = [ss for ss in response if ss in original_rank]
        response = response + [tt for tt in original_rank if tt not in response]
        new_passages = [passages[ii] for ii in response]
        new_data.append({'query': item['query'],
                         'query_id': item['query_id'],
                         'positive_passages': item['positive_passages'],
                         'retrieved_passages': new_passages})
    return new_data

gpt_data = receive_response(data, response)

In [11]:
gpt_data[1]

{'query': 'how was google founded',
 'query_id': '387441',
 'positive_passages': [{'docid': '5210298',
   'title': 'What year was Google founded?',
   'text': "Google was co-founded in by Larry Page and Sergey Brin while they were students at Stanford University. Google was first incorporated as a privately held company on September â\x80¦4th, 1998. The company name Google has its origin in the word Googol, which is the term for the number that represents 'one followed by hundred zeros'."}],
 'retrieved_passages': [{'docid': '1571948',
   'rank': 5,
   'text': "The name came from the search engine's use of back-links to determine page relevance. This is a patented algorithm known as PageRank. Brin and Page left Stanford and founded Google, Inc in September of 1998. Google was an instant hit, and by the year 2000, Google was the world's largest search engine. By 2001 it did something that eluded most of the dot.com business startups of the time. Google became profitable. How Google Make

In [44]:
import random
from tqdm import tqdm
def construct_data(tokenizer, save_path, data, response, num=10):
    instruction = "Rank the {num} passages based on their relevance to the search query. The passages will be listed in descending order using identifiers, and the most relevant passages should be listed first, and the output format should be [] > [] > etc\n\n### Query:\n{instruction}\n\n### Candidates:\n{input}\n\n### Response:\n"
    #for each sample in data and response, construct the order list
    assert len(data) == len(response)
    truncated_query_num = 0
    save_lst = []
    for i in tqdm(range(len(data))):
        bm25_id_list, query_id, query, positive_passage, passage_dict = construct_order_id_list(data, i, label=False)
        gpt_id_list, gpt_query_id, _, _, _ = construct_order_id_list(gpt_data, i, label=True)
        assert query_id == gpt_query_id
        # random select num-1 passages from bm25_id_list
        # if bbm25_id_list contain less than num-1 passages, print warning and skip
        if len(bm25_id_list) < num-1:
            print(f"Warning: {query_id} contain less than {num-1} passages")
            continue
        ordered_random_id_list = random.sample(range(len(bm25_id_list)), num)
        ordered_random_id_list.sort()
        bm25_selected_id_list = [bm25_id_list[i] for i in ordered_random_id_list]
        # if positive id not in bm25_selected_id_list , randomly delete one id and then insert positive passage randomly into bm25_selected_id_list
        if positive_passage['docid'] not in bm25_selected_id_list:
            del_idx = random.randint(0, num-1)
            del bm25_selected_id_list[del_idx]
            insert_idx = random.randint(0, num-1)
            bm25_selected_id_list.insert(insert_idx, positive_passage['docid'])
        label_idx_lst = [bm25_selected_id_list.index(item) if item in bm25_selected_id_list else -1 for item in gpt_id_list]
        # delete -1 in label_idx_lst
        label_idx_lst = [item+1 for item in label_idx_lst if item != -1]
        #3. 输出格式
        # 	1. 保存为json格式
        # 	2. 每个样本包含：1）instruction (query) 2）input (candidate passage list) 3）num (passage number) 4）label list (like [5] > [3] > [8] > [7] > [2] > [1] > [6] > [9] > [10] > [11])
        label_text = ' > '.join(['['+ str(item)+']' for item in label_idx_lst])
        input_text = '\n'.join([f'[{rank+1}] '+passage_dict[item] for rank, item in enumerate(bm25_selected_id_list)])
        instruction_text = query
        full_input_text = instruction.format(num=num, instruction=instruction_text, input=input_text)
        input_text_without_input_text = instruction.format(num=num, instruction=instruction_text, input='')
        # compute if the tokenized full_input_text is longer than 2048
        tokenized_full_input_text = tokenizer(
                                        full_input_text,
                                        truncation=False,
                                        max_length=2048,
                                        padding=False,
                                        return_tensors=None,
                                    )
        if len(tokenized_full_input_text['input_ids']) > 2047:
            # count the number of truncated query
            truncated_query_num += 1
            # compute the average length of the passage, which = (2048 - len(input_text_without_input_text)) / num
            average_passage_length = int((2047 - len(input_text_without_input_text)) / num)
            # for each passage in bm25_selected_id_list, truncate the passage to average_passage_length
            for idx, item in enumerate(bm25_selected_id_list):
                passage = passage_dict[item]
                passage_tokenized = tokenizer(
                                        passage,
                                        truncation=False,
                                        max_length=average_passage_length,
                                        padding=False,
                                        return_tensors=None,
                                    )['input_ids']
                if len(passage_tokenized) > average_passage_length:
                    passage_tokenized = passage_tokenized[:average_passage_length]
                    passage_dict[item] = tokenizer.decode(passage_tokenized)
                    # print query_id, item, len(passage_tokenized), len(tokenizer(passage)["input_ids"])
                    # print the length of the truncated passage and the original passage
                    print(f'query_id is {query_id}, passage_id is {item}')
                    print(f'the length of the truncated passage is {len(passage_tokenized)}')
                    print(f'the length of the original passage is {len(tokenizer(passage)["input_ids"])}')
            # recompute the input_text
            input_text = '\n'.join([f'[{rank+1}] '+passage_dict[item] for rank, item in enumerate(bm25_selected_id_list)])
            full_input_text = instruction.format(num=num, instruction=instruction_text, input=input_text)
            tokenized_full_input_text_truncated = tokenizer(
                                        full_input_text,
                                        truncation=False,
                                        max_length=2048,
                                        padding=False,
                                        return_tensors=None,
                                    )
            assert len(tokenized_full_input_text_truncated['input_ids']) <= 2047
        # print the important information of the first and second sample
        if i == 0 or i == 1:
            print(f'query_id is {query_id}, positive_passage_id is {positive_passage["docid"]}')
            # print id_list
            print(f'bm25_id_list is {bm25_selected_id_list}')
            print(f'gpt_id_list is {gpt_id_list}')
            # print label_idx_lst
            print(f'label_idx_lst is {label_idx_lst}')
            # print input_text
            print(f'input_text is {full_input_text}')
            # print label_text
            print(f'label_text is {label_text}')
            # print length of tokenized_full_input_text
            print(f'the length of tokenized_full_input_text is {len(tokenized_full_input_text["input_ids"])}')
            # if the length of tokenized_full_input_text is longer than 2048, print the length of tokenized_full_input_text_truncated
            if len(tokenized_full_input_text['input_ids']) > 2047:
                print(f'the length of tokenized_full_input_text_truncated is {len(tokenized_full_input_text_truncated["input_ids"])}')



        # add sample to save_lst
        sample = {'instruction': instruction_text, 'input': input_text, 'num': num, 'output': label_text}
        save_lst.append(sample)
    print(f'the number of truncated query is {truncated_query_num}')
    # save save_lst to save_path
    with open(save_path, 'w') as f:
        json.dump(save_lst, f)
    return save_lst

In [45]:
# 定义llama tokenizer
from transformers import LlamaForCausalLM, LlamaTokenizer
random.seed(42)
tokenizer = LlamaTokenizer.from_pretrained('/home1/cxy/alpaca-lora/llama-7b-hf')
save_path = '/home1/cxy/alpaca-lora/process_data/reranker_100k_bm25_2_gpt.json'
save_lst = construct_data(tokenizer, save_path, data, response, num=10)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.
  0%|                                                                                                                                                                                                                               | 35/100000 [00:00<09:26, 176.61it/s]

query_id is 653897, positive_passage_id is 3430420
bm25_id_list is ['2028777', '1881698', '1307105', '7167426', '1480292', '1095947', '4914824', '8583068', '4008463', '3430420']
gpt_id_list is ['3430420', '8600899', '4008463', '2234858', '2028777', '8722220', '1881698', '1307105', '7134835', '5463894', '2322981', '4914824', '1480292', '1095946', '1095947', '7610677', '7140436', '7367129', '1515988', '7167426', '8583068']
label_idx_lst is [10, 9, 1, 2, 3, 7, 5, 6, 4, 8]
input_text is Rank the 10 passages based on their relevance to the search query. The passages will be listed in descending order using identifiers, and the most relevant passages should be listed first, and the output format should be [] > [] > etc

### Query:
what does time and a half mean

### Candidates:
[1] What does a drug's half-life mean? Half-life is the period of time it takes for a substance undergoing decay to decrease by half. A drug's shelf-life is determined by finding out how long it takes a medication or 

  4%|█████████▋                                                                                                                                                                                                                   | 4365/100000 [00:21<08:45, 181.98it/s]

query_id is 19578, passage_id is 6043396
the length of the truncated passage is 172
the length of the original passage is 251
query_id is 19578, passage_id is 1308917
the length of the truncated passage is 172
the length of the original passage is 279
query_id is 19578, passage_id is 4269955
the length of the truncated passage is 172
the length of the original passage is 263
query_id is 19578, passage_id is 1458028
the length of the truncated passage is 172
the length of the original passage is 291


  5%|███████████▊                                                                                                                                                                                                                 | 5342/100000 [00:26<08:38, 182.58it/s]

query_id is 1056864, passage_id is 8226301
the length of the truncated passage is 172
the length of the original passage is 200
query_id is 1056864, passage_id is 5209526
the length of the truncated passage is 172
the length of the original passage is 362
query_id is 1056864, passage_id is 5846509
the length of the truncated passage is 172
the length of the original passage is 298
query_id is 1056864, passage_id is 6243312
the length of the truncated passage is 172
the length of the original passage is 236
query_id is 1056864, passage_id is 6869799
the length of the truncated passage is 172
the length of the original passage is 194
query_id is 1056864, passage_id is 4913721
the length of the truncated passage is 172
the length of the original passage is 173


  7%|███████████████▍                                                                                                                                                                                                             | 6976/100000 [00:35<08:27, 183.40it/s]

query_id is 278965, passage_id is 6517753
the length of the truncated passage is 174
the length of the original passage is 206
query_id is 278965, passage_id is 6517757
the length of the truncated passage is 174
the length of the original passage is 205
query_id is 278965, passage_id is 6517754
the length of the truncated passage is 174
the length of the original passage is 292
query_id is 278965, passage_id is 6455675
the length of the truncated passage is 174
the length of the original passage is 195
query_id is 278965, passage_id is 2705398
the length of the truncated passage is 174
the length of the original passage is 178
query_id is 278965, passage_id is 1520902
the length of the truncated passage is 174
the length of the original passage is 499


  7%|████████████████▎                                                                                                                                                                                                            | 7377/100000 [00:37<08:35, 179.50it/s]

query_id is 317299, passage_id is 6240184
the length of the truncated passage is 174
the length of the original passage is 236
query_id is 317299, passage_id is 3888784
the length of the truncated passage is 174
the length of the original passage is 208
query_id is 317299, passage_id is 4740796
the length of the truncated passage is 174
the length of the original passage is 307
query_id is 317299, passage_id is 4740793
the length of the truncated passage is 174
the length of the original passage is 229
query_id is 317299, passage_id is 4740791
the length of the truncated passage is 174
the length of the original passage is 201
query_id is 317299, passage_id is 4740788
the length of the truncated passage is 174
the length of the original passage is 208
query_id is 317299, passage_id is 4740795
the length of the truncated passage is 174
the length of the original passage is 199


  9%|██████████████████▉                                                                                                                                                                                                          | 8542/100000 [00:43<08:31, 178.93it/s]

query_id is 98009, passage_id is 1987766
the length of the truncated passage is 174
the length of the original passage is 195
query_id is 98009, passage_id is 5510739
the length of the truncated passage is 174
the length of the original passage is 242
query_id is 98009, passage_id is 5702399
the length of the truncated passage is 174
the length of the original passage is 219
query_id is 98009, passage_id is 7673375
the length of the truncated passage is 174
the length of the original passage is 230
query_id is 98009, passage_id is 1987768
the length of the truncated passage is 174
the length of the original passage is 181
query_id is 98009, passage_id is 2805633
the length of the truncated passage is 174
the length of the original passage is 189
query_id is 98009, passage_id is 794927
the length of the truncated passage is 174
the length of the original passage is 183
query_id is 98009, passage_id is 4829369
the length of the truncated passage is 174
the length of the original passage 

 10%|█████████████████████▎                                                                                                                                                                                                       | 9641/100000 [00:49<08:37, 174.58it/s]

query_id is 269392, passage_id is 6407644
the length of the truncated passage is 173
the length of the original passage is 216
query_id is 269392, passage_id is 6407645
the length of the truncated passage is 173
the length of the original passage is 197
query_id is 269392, passage_id is 6190451
the length of the truncated passage is 173
the length of the original passage is 180
query_id is 269392, passage_id is 1106692
the length of the truncated passage is 173
the length of the original passage is 192
query_id is 269392, passage_id is 7051026
the length of the truncated passage is 173
the length of the original passage is 199
query_id is 269392, passage_id is 1956852
the length of the truncated passage is 173
the length of the original passage is 320
query_id is 269392, passage_id is 6190450
the length of the truncated passage is 173
the length of the original passage is 196
query_id is 269392, passage_id is 3741430
the length of the truncated passage is 173
the length of the original

 11%|████████████████████████▊                                                                                                                                                                                                   | 11299/100000 [00:58<08:26, 175.11it/s]

query_id is 777576, passage_id is 6868743
the length of the truncated passage is 172
the length of the original passage is 244
query_id is 777576, passage_id is 5059087
the length of the truncated passage is 172
the length of the original passage is 297
query_id is 777576, passage_id is 6356313
the length of the truncated passage is 172
the length of the original passage is 249
query_id is 777576, passage_id is 5718239
the length of the truncated passage is 172
the length of the original passage is 233
query_id is 777576, passage_id is 7974754
the length of the truncated passage is 172
the length of the original passage is 227
query_id is 777576, passage_id is 1948386
the length of the truncated passage is 172
the length of the original passage is 192


 13%|███████████████████████████▊                                                                                                                                                                                                | 12617/100000 [01:05<08:14, 176.74it/s]

query_id is 500236, passage_id is 8539072
the length of the truncated passage is 174
the length of the original passage is 189
query_id is 500236, passage_id is 1446602
the length of the truncated passage is 174
the length of the original passage is 232
query_id is 500236, passage_id is 4895651
the length of the truncated passage is 174
the length of the original passage is 233
query_id is 500236, passage_id is 6977044
the length of the truncated passage is 174
the length of the original passage is 280
query_id is 500236, passage_id is 897311
the length of the truncated passage is 174
the length of the original passage is 197
query_id is 500236, passage_id is 6785243
the length of the truncated passage is 174
the length of the original passage is 270
query_id is 500236, passage_id is 1901468
the length of the truncated passage is 174
the length of the original passage is 242


 13%|█████████████████████████████▎                                                                                                                                                                                              | 13345/100000 [01:08<08:23, 172.05it/s]

query_id is 41860, passage_id is 534913
the length of the truncated passage is 173
the length of the original passage is 174
query_id is 41860, passage_id is 534910
the length of the truncated passage is 173
the length of the original passage is 282
query_id is 41860, passage_id is 5276340
the length of the truncated passage is 173
the length of the original passage is 247
query_id is 41860, passage_id is 6326611
the length of the truncated passage is 173
the length of the original passage is 261
query_id is 41860, passage_id is 3370682
the length of the truncated passage is 173
the length of the original passage is 190
query_id is 41860, passage_id is 8390710
the length of the truncated passage is 173
the length of the original passage is 261
query_id is 41860, passage_id is 6608091
the length of the truncated passage is 173
the length of the original passage is 188


 16%|███████████████████████████████████▌                                                                                                                                                                                        | 16152/100000 [01:23<07:50, 178.18it/s]

query_id is 525599, passage_id is 594977
the length of the truncated passage is 173
the length of the original passage is 204
query_id is 525599, passage_id is 4772173
the length of the truncated passage is 173
the length of the original passage is 218
query_id is 525599, passage_id is 6168041
the length of the truncated passage is 173
the length of the original passage is 259
query_id is 525599, passage_id is 6414815
the length of the truncated passage is 173
the length of the original passage is 244
query_id is 525599, passage_id is 978801
the length of the truncated passage is 173
the length of the original passage is 287
query_id is 525599, passage_id is 8401904
the length of the truncated passage is 173
the length of the original passage is 214


 24%|████████████████████████████████████████████████████▌                                                                                                                                                                       | 23870/100000 [02:05<07:23, 171.56it/s]

query_id is 101630, passage_id is 1538261
the length of the truncated passage is 172
the length of the original passage is 253
query_id is 101630, passage_id is 2973586
the length of the truncated passage is 172
the length of the original passage is 290
query_id is 101630, passage_id is 7423546
the length of the truncated passage is 172
the length of the original passage is 194
query_id is 101630, passage_id is 2040016
the length of the truncated passage is 172
the length of the original passage is 271
query_id is 101630, passage_id is 6645879
the length of the truncated passage is 172
the length of the original passage is 237
query_id is 101630, passage_id is 5469190
the length of the truncated passage is 172
the length of the original passage is 188


 25%|███████████████████████████████████████████████████████▎                                                                                                                                                                    | 25131/100000 [02:12<07:13, 172.81it/s]

query_id is 32864, passage_id is 6729517
the length of the truncated passage is 173
the length of the original passage is 188
query_id is 32864, passage_id is 6367685
the length of the truncated passage is 173
the length of the original passage is 183
query_id is 32864, passage_id is 5844571
the length of the truncated passage is 173
the length of the original passage is 219
query_id is 32864, passage_id is 6729519
the length of the truncated passage is 173
the length of the original passage is 226
query_id is 32864, passage_id is 5732269
the length of the truncated passage is 173
the length of the original passage is 183
query_id is 32864, passage_id is 8007135
the length of the truncated passage is 173
the length of the original passage is 312
query_id is 32864, passage_id is 5844568
the length of the truncated passage is 173
the length of the original passage is 328


 29%|██████████████████████████████████████████████████████████████▋                                                                                                                                                             | 28504/100000 [02:31<07:03, 168.78it/s]

query_id is 511684, passage_id is 6170619
the length of the truncated passage is 173
the length of the original passage is 191
query_id is 511684, passage_id is 6170616
the length of the truncated passage is 173
the length of the original passage is 180
query_id is 511684, passage_id is 153237
the length of the truncated passage is 173
the length of the original passage is 179
query_id is 511684, passage_id is 6101838
the length of the truncated passage is 173
the length of the original passage is 209
query_id is 511684, passage_id is 5656649
the length of the truncated passage is 173
the length of the original passage is 324
query_id is 511684, passage_id is 3000555
the length of the truncated passage is 173
the length of the original passage is 316


 29%|███████████████████████████████████████████████████████████████▊                                                                                                                                                            | 28986/100000 [02:33<06:27, 183.48it/s]



 30%|██████████████████████████████████████████████████████████████████▋                                                                                                                                                         | 30308/100000 [02:40<06:55, 167.67it/s]

query_id is 99396, passage_id is 843197
the length of the truncated passage is 172
the length of the original passage is 251
query_id is 99396, passage_id is 1463493
the length of the truncated passage is 172
the length of the original passage is 236
query_id is 99396, passage_id is 1162590
the length of the truncated passage is 172
the length of the original passage is 253
query_id is 99396, passage_id is 6152254
the length of the truncated passage is 172
the length of the original passage is 253
query_id is 99396, passage_id is 498238
the length of the truncated passage is 172
the length of the original passage is 252
query_id is 99396, passage_id is 6980156
the length of the truncated passage is 172
the length of the original passage is 230
query_id is 99396, passage_id is 1720419
the length of the truncated passage is 172
the length of the original passage is 177


 38%|███████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                        | 37840/100000 [03:22<06:16, 164.94it/s]

query_id is 201010, passage_id is 1739377
the length of the truncated passage is 174
the length of the original passage is 358
query_id is 201010, passage_id is 1896728
the length of the truncated passage is 174
the length of the original passage is 182
query_id is 201010, passage_id is 6037861
the length of the truncated passage is 174
the length of the original passage is 218
query_id is 201010, passage_id is 1642334
the length of the truncated passage is 174
the length of the original passage is 184
query_id is 201010, passage_id is 6037860
the length of the truncated passage is 174
the length of the original passage is 244
query_id is 201010, passage_id is 1395083
the length of the truncated passage is 174
the length of the original passage is 185
query_id is 201010, passage_id is 6294944
the length of the truncated passage is 174
the length of the original passage is 250


 38%|███████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                        | 38086/100000 [03:24<05:57, 173.33it/s]

query_id is 2719, passage_id is 6500114
the length of the truncated passage is 157
the length of the original passage is 289
query_id is 2719, passage_id is 6500115
the length of the truncated passage is 157
the length of the original passage is 368
query_id is 2719, passage_id is 6500118
the length of the truncated passage is 157
the length of the original passage is 184
query_id is 2719, passage_id is 6500116
the length of the truncated passage is 157
the length of the original passage is 158
query_id is 2719, passage_id is 6059815
the length of the truncated passage is 157
the length of the original passage is 158
query_id is 2719, passage_id is 206837
the length of the truncated passage is 157
the length of the original passage is 220


 42%|████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                               | 42083/100000 [03:46<05:57, 162.10it/s]

query_id is 627681, passage_id is 847403
the length of the truncated passage is 174
the length of the original passage is 242
query_id is 627681, passage_id is 3396594
the length of the truncated passage is 174
the length of the original passage is 259
query_id is 627681, passage_id is 3795182
the length of the truncated passage is 174
the length of the original passage is 327
query_id is 627681, passage_id is 6486968
the length of the truncated passage is 174
the length of the original passage is 327
query_id is 627681, passage_id is 1233703
the length of the truncated passage is 174
the length of the original passage is 267
query_id is 627681, passage_id is 574337
the length of the truncated passage is 174
the length of the original passage is 242
query_id is 627681, passage_id is 6058374
the length of the truncated passage is 174
the length of the original passage is 199
query_id is 627681, passage_id is 1751486
the length of the truncated passage is 174
the length of the original p

 46%|██████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                      | 46400/100000 [04:10<05:19, 167.69it/s]

query_id is 764661, passage_id is 3019478
the length of the truncated passage is 174
the length of the original passage is 206
query_id is 764661, passage_id is 4027096
the length of the truncated passage is 174
the length of the original passage is 250
query_id is 764661, passage_id is 6589508
the length of the truncated passage is 174
the length of the original passage is 177
query_id is 764661, passage_id is 6009290
the length of the truncated passage is 174
the length of the original passage is 179
query_id is 764661, passage_id is 3019476
the length of the truncated passage is 174
the length of the original passage is 186
query_id is 764661, passage_id is 4027095
the length of the truncated passage is 174
the length of the original passage is 247
query_id is 764661, passage_id is 4336792
the length of the truncated passage is 174
the length of the original passage is 209
query_id is 764661, passage_id is 1845481
the length of the truncated passage is 174
the length of the original

 49%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                 | 48624/100000 [04:22<05:06, 167.59it/s]

query_id is 524941, passage_id is 8000481
the length of the truncated passage is 173
the length of the original passage is 322
query_id is 524941, passage_id is 1874798
the length of the truncated passage is 173
the length of the original passage is 229
query_id is 524941, passage_id is 794928
the length of the truncated passage is 173
the length of the original passage is 229
query_id is 524941, passage_id is 943696
the length of the truncated passage is 173
the length of the original passage is 281


 51%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                          | 51424/100000 [04:38<04:41, 172.49it/s]

query_id is 106104, passage_id is 4610085
the length of the truncated passage is 173
the length of the original passage is 204
query_id is 106104, passage_id is 1371072
the length of the truncated passage is 173
the length of the original passage is 196
query_id is 106104, passage_id is 6814246
the length of the truncated passage is 173
the length of the original passage is 200
query_id is 106104, passage_id is 1371071
the length of the truncated passage is 173
the length of the original passage is 273
query_id is 106104, passage_id is 594413
the length of the truncated passage is 173
the length of the original passage is 277
query_id is 106104, passage_id is 6327947
the length of the truncated passage is 173
the length of the original passage is 294


 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                      | 53267/100000 [04:48<04:26, 175.19it/s]

query_id is 110333, passage_id is 5664413
the length of the truncated passage is 173
the length of the original passage is 391
query_id is 110333, passage_id is 5664417
the length of the truncated passage is 173
the length of the original passage is 200
query_id is 110333, passage_id is 3486516
the length of the truncated passage is 173
the length of the original passage is 292
query_id is 110333, passage_id is 8294456
the length of the truncated passage is 173
the length of the original passage is 179
query_id is 110333, passage_id is 624952
the length of the truncated passage is 173
the length of the original passage is 193
query_id is 110333, passage_id is 3486525
the length of the truncated passage is 173
the length of the original passage is 180
query_id is 110333, passage_id is 3486518
the length of the truncated passage is 173
the length of the original passage is 236


 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                 | 55677/100000 [05:01<04:13, 174.76it/s]

query_id is 40065, passage_id is 1030212
the length of the truncated passage is 173
the length of the original passage is 178
query_id is 40065, passage_id is 6980088
the length of the truncated passage is 173
the length of the original passage is 260
query_id is 40065, passage_id is 6012283
the length of the truncated passage is 173
the length of the original passage is 223
query_id is 40065, passage_id is 1030214
the length of the truncated passage is 173
the length of the original passage is 184
query_id is 40065, passage_id is 4163846
the length of the truncated passage is 173
the length of the original passage is 258
query_id is 40065, passage_id is 1074784
the length of the truncated passage is 173
the length of the original passage is 189
query_id is 40065, passage_id is 6012285
the length of the truncated passage is 173
the length of the original passage is 229


 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                        | 59634/100000 [05:23<04:00, 167.59it/s]

query_id is 797854, passage_id is 4891788
the length of the truncated passage is 174
the length of the original passage is 175
query_id is 797854, passage_id is 4891790
the length of the truncated passage is 174
the length of the original passage is 210
query_id is 797854, passage_id is 4891791
the length of the truncated passage is 174
the length of the original passage is 184
query_id is 797854, passage_id is 4891793
the length of the truncated passage is 174
the length of the original passage is 222
query_id is 797854, passage_id is 4891787
the length of the truncated passage is 174
the length of the original passage is 253
query_id is 797854, passage_id is 7207486
the length of the truncated passage is 174
the length of the original passage is 251
query_id is 797854, passage_id is 180377
the length of the truncated passage is 174
the length of the original passage is 182
query_id is 797854, passage_id is 4278508
the length of the truncated passage is 174
the length of the original 

 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                  | 62486/100000 [05:39<03:48, 164.34it/s]

query_id is 829947, passage_id is 2241535
the length of the truncated passage is 173
the length of the original passage is 263
query_id is 829947, passage_id is 6095964
the length of the truncated passage is 173
the length of the original passage is 195
query_id is 829947, passage_id is 4085751
the length of the truncated passage is 173
the length of the original passage is 211
query_id is 829947, passage_id is 5092971
the length of the truncated passage is 173
the length of the original passage is 197
query_id is 829947, passage_id is 858695
the length of the truncated passage is 173
the length of the original passage is 235
query_id is 829947, passage_id is 5755601
the length of the truncated passage is 173
the length of the original passage is 188
query_id is 829947, passage_id is 3794772
the length of the truncated passage is 173
the length of the original passage is 272


 63%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 62814/100000 [05:40<03:35, 172.90it/s]



 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 63550/100000 [05:45<03:35, 169.46it/s]

query_id is 1013137, passage_id is 4778078
the length of the truncated passage is 171
the length of the original passage is 279
query_id is 1013137, passage_id is 6999371
the length of the truncated passage is 171
the length of the original passage is 265
query_id is 1013137, passage_id is 3097059
the length of the truncated passage is 171
the length of the original passage is 306
query_id is 1013137, passage_id is 5917324
the length of the truncated passage is 171
the length of the original passage is 249
query_id is 1013137, passage_id is 701221
the length of the truncated passage is 171
the length of the original passage is 185
query_id is 1013137, passage_id is 926863
the length of the truncated passage is 171
the length of the original passage is 279
query_id is 1013137, passage_id is 7300573
the length of the truncated passage is 171
the length of the original passage is 185
query_id is 1013137, passage_id is 4477949
the length of the truncated passage is 171
the length of the or

 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                         | 66548/100000 [06:01<03:25, 163.08it/s]

query_id is 247146, passage_id is 416233
the length of the truncated passage is 171
the length of the original passage is 201
query_id is 247146, passage_id is 1617072
the length of the truncated passage is 171
the length of the original passage is 218
query_id is 247146, passage_id is 8270998
the length of the truncated passage is 171
the length of the original passage is 198
query_id is 247146, passage_id is 6115879
the length of the truncated passage is 171
the length of the original passage is 235
query_id is 247146, passage_id is 2724942
the length of the truncated passage is 171
the length of the original passage is 258
query_id is 247146, passage_id is 2494189
the length of the truncated passage is 171
the length of the original passage is 222
query_id is 247146, passage_id is 7808547
the length of the truncated passage is 171
the length of the original passage is 190
query_id is 247146, passage_id is 1656207
the length of the truncated passage is 171
the length of the original 

 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 66867/100000 [06:03<03:17, 167.71it/s]

query_id is 492169, passage_id is 3560678
the length of the truncated passage is 171
the length of the original passage is 248
query_id is 492169, passage_id is 652152
the length of the truncated passage is 171
the length of the original passage is 200
query_id is 492169, passage_id is 1587315
the length of the truncated passage is 171
the length of the original passage is 237
query_id is 492169, passage_id is 6168697
the length of the truncated passage is 171
the length of the original passage is 194
query_id is 492169, passage_id is 6505289
the length of the truncated passage is 171
the length of the original passage is 199
query_id is 492169, passage_id is 6318485
the length of the truncated passage is 171
the length of the original passage is 210
query_id is 492169, passage_id is 1587316
the length of the truncated passage is 171
the length of the original passage is 189
query_id is 492169, passage_id is 1553457
the length of the truncated passage is 171
the length of the original 

 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                     | 75651/100000 [06:52<02:25, 167.17it/s]

query_id is 692864, passage_id is 6294945
the length of the truncated passage is 172
the length of the original passage is 250
query_id is 692864, passage_id is 6868743
the length of the truncated passage is 172
the length of the original passage is 244
query_id is 692864, passage_id is 5059087
the length of the truncated passage is 172
the length of the original passage is 297
query_id is 692864, passage_id is 7974754
the length of the truncated passage is 172
the length of the original passage is 227
query_id is 692864, passage_id is 2975218
the length of the truncated passage is 172
the length of the original passage is 212
query_id is 692864, passage_id is 1723291
the length of the truncated passage is 172
the length of the original passage is 230
query_id is 692864, passage_id is 3025777
the length of the truncated passage is 172
the length of the original passage is 201
query_id is 692864, passage_id is 2176636
the length of the truncated passage is 172
the length of the original

 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 84027/100000 [07:39<01:32, 172.91it/s]

query_id is 846456, passage_id is 8252106
the length of the truncated passage is 172
the length of the original passage is 242
query_id is 846456, passage_id is 6453416
the length of the truncated passage is 172
the length of the original passage is 295
query_id is 846456, passage_id is 8252108
the length of the truncated passage is 172
the length of the original passage is 202
query_id is 846456, passage_id is 8252107
the length of the truncated passage is 172
the length of the original passage is 267
query_id is 846456, passage_id is 8252105
the length of the truncated passage is 172
the length of the original passage is 215
query_id is 846456, passage_id is 6453415
the length of the truncated passage is 172
the length of the original passage is 193
query_id is 846456, passage_id is 5894122
the length of the truncated passage is 172
the length of the original passage is 193
query_id is 846456, passage_id is 7579340
the length of the truncated passage is 172
the length of the original

 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                 | 84790/100000 [07:43<01:39, 152.46it/s]

query_id is 189015, passage_id is 5413291
the length of the truncated passage is 173
the length of the original passage is 188
query_id is 189015, passage_id is 6089467
the length of the truncated passage is 173
the length of the original passage is 209
query_id is 189015, passage_id is 603302
the length of the truncated passage is 173
the length of the original passage is 187
query_id is 189015, passage_id is 52946
the length of the truncated passage is 173
the length of the original passage is 206
query_id is 189015, passage_id is 1901493
the length of the truncated passage is 173
the length of the original passage is 189
query_id is 189015, passage_id is 6239268
the length of the truncated passage is 173
the length of the original passage is 225
query_id is 189015, passage_id is 726275
the length of the truncated passage is 173
the length of the original passage is 306
query_id is 189015, passage_id is 3387588
the length of the truncated passage is 173
the length of the original pas

 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                            | 87142/100000 [07:56<01:16, 167.89it/s]

query_id is 805164, passage_id is 2178432
the length of the truncated passage is 172
the length of the original passage is 178
query_id is 805164, passage_id is 6814249
the length of the truncated passage is 172
the length of the original passage is 179
query_id is 805164, passage_id is 594416
the length of the truncated passage is 172
the length of the original passage is 212
query_id is 805164, passage_id is 1371071
the length of the truncated passage is 172
the length of the original passage is 273
query_id is 805164, passage_id is 594413
the length of the truncated passage is 172
the length of the original passage is 277
query_id is 805164, passage_id is 6327947
the length of the truncated passage is 172
the length of the original passage is 294
query_id is 805164, passage_id is 2226482
the length of the truncated passage is 172
the length of the original passage is 271


 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 91750/100000 [08:22<00:46, 177.26it/s]



 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 92767/100000 [08:28<00:42, 168.67it/s]

query_id is 661497, passage_id is 1752072
the length of the truncated passage is 173
the length of the original passage is 258
query_id is 661497, passage_id is 3795182
the length of the truncated passage is 173
the length of the original passage is 327
query_id is 661497, passage_id is 5419009
the length of the truncated passage is 173
the length of the original passage is 266
query_id is 661497, passage_id is 1233703
the length of the truncated passage is 173
the length of the original passage is 267
query_id is 661497, passage_id is 574337
the length of the truncated passage is 173
the length of the original passage is 242


 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 93439/100000 [08:32<00:38, 168.29it/s]

query_id is 465425, passage_id is 1896727
the length of the truncated passage is 174
the length of the original passage is 252
query_id is 465425, passage_id is 700658
the length of the truncated passage is 174
the length of the original passage is 226
query_id is 465425, passage_id is 6037861
the length of the truncated passage is 174
the length of the original passage is 218
query_id is 465425, passage_id is 700659
the length of the truncated passage is 174
the length of the original passage is 217
query_id is 465425, passage_id is 630894
the length of the truncated passage is 174
the length of the original passage is 184
query_id is 465425, passage_id is 1723291
the length of the truncated passage is 174
the length of the original passage is 230
query_id is 465425, passage_id is 6294944
the length of the truncated passage is 174
the length of the original passage is 250
query_id is 465425, passage_id is 6294945
the length of the truncated passage is 174
the length of the original pa

 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 99341/100000 [09:05<00:03, 164.98it/s]

query_id is 55368, passage_id is 6339868
the length of the truncated passage is 171
the length of the original passage is 320
query_id is 55368, passage_id is 2174215
the length of the truncated passage is 171
the length of the original passage is 238
query_id is 55368, passage_id is 498238
the length of the truncated passage is 171
the length of the original passage is 252
query_id is 55368, passage_id is 1668603
the length of the truncated passage is 171
the length of the original passage is 245
query_id is 55368, passage_id is 7048782
the length of the truncated passage is 171
the length of the original passage is 284


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [09:09<00:00, 182.14it/s]


the number of truncated query is 35


In [46]:
!wc -l /home1/cxy/alpaca-lora/process_data/reranker_100k_bm25_2_gpt.json

0 /home1/cxy/alpaca-lora/process_data/reranker_100k_bm25_2_gpt.json
