In [None]:
import json
import os
import random

import pandas as pd

In [None]:
# Input file and output directory, both built by string concatenation onto `usr_path`.
# NOTE(review): `usr_path` is not assigned in any cell visible here -- it has to
# exist in the kernel already, otherwise this cell raises NameError. TODO confirm
# where it is supposed to come from.
# Path of the JSON file that the next cell opens.
data_path = usr_path+ '/data/Eli5/Eli5_reranked/eli5_reranked.json'
# Directory prefix (note the trailing slash) that the CSV file names are appended to when saving.
output_folder = usr_path+ '/cross-encoder/eli5/splits/' 

In [None]:
# read dataset
# Decode the JSON file and build a DataFrame with one row per record.
with open(data_path, 'r') as f:
    records = json.load(f)

# `pd.read_json` expects a path, a file-like object or JSON text -- not an object
# that `json.load` has already decoded -- so the frame is built directly from the
# decoded payload instead. If the file is double-encoded (its top-level value is
# a JSON *string* that itself contains the records), decode that string once more.
if isinstance(records, str):
    records = json.loads(records)

# Column values are kept exactly as the JSON decoder produced them.
data = pd.DataFrame(records)

In [None]:
# randomly shuffle dataset
# A fixed `random_state` makes the shuffled row order reproducible after a kernel
# restart; without it every run would produce a different ordering of the rows.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Number of groups of similar (positive) passages used for training.
num_positive_groups = 200_000
# Number of negative pairs generated for every group of positive passages.
pos_neg_ratio = 4

In [None]:
# positive train & test pairs

# The top 3 ranked passages of every input form its group of positive passages.
# Iterating the column directly avoids a chained `data['passages'][i][j]` lookup
# for every element. The explicit indices keep the strict behaviour: an input
# with fewer than 3 passages raises IndexError instead of yielding a short group.
positive_passages = [
    [ranked[j]['text'] for j in range(3)]
    for ranked in data['passages']
]

data['positive_passages'] = positive_passages
positive_pairs = data[['input', 'positive_passages']]

# `.copy()` makes the training frame independent of `positive_pairs`, so adding
# the `label` column below does not assign into a slice (SettingWithCopyWarning).
positive_pairs_train = positive_pairs[:num_positive_groups].copy()
positive_pairs_train['label'] = 1.0

# The test set holds 0.1% as many inputs as the training set. They are taken from
# the end of the rows that come AFTER the training slice, so a dataset shorter
# than expected cannot put the same input into both the train and the test split.
num_positive_test = int(len(positive_pairs_train)*0.001)
positive_pairs_test = positive_pairs[num_positive_groups:].tail(num_positive_test).reset_index(drop=True)

In [None]:
# negative train & test passages

max_neg_test = 200 # number of negative passages in each test group (chunk size used below)
max_test_samples = 200 # maximum number of negative test groups that are kept

# Passages at rank positions 3..6 (the 4th to 7th entry) of every input make up
# the pool of negatives. Iterating the column directly avoids a chained
# `data['passages'][i][j]` lookup for every element.
negative_passages = [
    ranked[j]['text']
    for ranked in data['passages']
    for j in range(3, 7)
]

# Shuffle with a dedicated, seeded generator: the result is reproducible and the
# state of the global `random` module is left untouched.
random.Random(42).shuffle(negative_passages)

# The first num_positive_groups * pos_neg_ratio passages go to training, the rest to test.
negative_passages_train = negative_passages[:num_positive_groups*pos_neg_ratio]
negative_passages_test = negative_passages[len(negative_passages_train):]

# Chunk the flat lists into groups: `pos_neg_ratio` passages per training group,
# `max_neg_test` passages per test group (the last group of each may be shorter).
negative_passages_train = [negative_passages_train[x:x+pos_neg_ratio] for x in range(0, len(negative_passages_train), pos_neg_ratio)]
negative_passages_test = [negative_passages_test[x:x+max_neg_test] for x in range(0, len(negative_passages_test), max_neg_test)]
negative_passages_test = negative_passages_test[:max_test_samples]

In [None]:
# negative train pairs

# Every training input is paired with each of the `pos_neg_ratio` passages of the
# negative group stored at the same position; all of these rows get label 0.0.
train_queries = list(positive_pairs_train['input'])

negative_pairs_train = pd.DataFrame(
    [
        [query, negative_passages_train[group_idx][slot]]
        for group_idx, query in enumerate(train_queries)
        for slot in range(pos_neg_ratio)
    ],
    columns=['input', 'passage'],
).assign(label=0.0)

In [None]:
# format train and test samples

# Train: expand each list of positive passages into one row per (input, passage),
# stack those rows on top of the negative pairs and shuffle. The fixed
# `random_state` makes the resulting row order reproducible.
positive_pairs_train = positive_pairs_train.explode('positive_passages')
positive_pairs_train = positive_pairs_train.rename(columns = {'positive_passages':'passage'})
train_pairs = pd.concat([positive_pairs_train, negative_pairs_train])
train_pairs = train_pairs.sample(frac=1, random_state=42).reset_index(drop=True)

# Test: attach one group of negative passages to every test row. The column
# assignment needs exactly one group per row; check that explicitly so a mismatch
# is reported with a clear message (and an empty test frame is not silently grown).
if len(negative_passages_test) != len(positive_pairs_test):
    raise ValueError(
        'expected one negative group per test row, got {} groups for {} rows'.format(
            len(negative_passages_test), len(positive_pairs_test)
        )
    )
positive_pairs_test['negative'] = negative_passages_test
test_samples = positive_pairs_test.rename(columns={'input':'query', 'positive_passages': 'positive'})
test_samples = test_samples.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# save splits
# `to_csv` does not create missing parent directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
os.makedirs(output_folder, exist_ok=True)
train_pairs.to_csv(os.path.join(output_folder, 'train_pairs.csv'), index=False)
test_samples.to_csv(os.path.join(output_folder, 'test_samples.csv'), index=False)