In [14]:
import json
import random
import uuid
import numpy as np

In [2]:
""" Loading all important files: The individual documents, the pairs and the golden data (with paraphrases)"""
# JSON text is UTF-8; pass the encoding explicitly so the files are decoded
# the same way on every platform instead of using the locale's default.
with open('./data/documents_en_nometadata.json', 'r', encoding='utf-8') as f:
    documents_all = json.load(f)  # individual documents (looked up by 'id', read via 'text')
with open('./data/pairs_en.json', 'r', encoding='utf-8') as f:
    pairs_all = json.load(f)  # document pairs (each entry has 'id1' and 'id2')
with open('./data/golden-data-new.json', 'r', encoding='utf-8') as f:
    golden_data = json.load(f)  # golden sets: a 'reference' plus one document per relation level

In [4]:
""" Check that no golden-data pair appears in the pair set """
# Pairs are unordered, so each one is represented as a frozenset of its two
# document ids. frozensets are hashable, which lets the overlap test below use
# a set lookup instead of scanning the whole golden list for every pair.
gpairs_set = [
    frozenset((g['reference']['id'], g[relation]['id']))
    for g in golden_data
    for relation in ('paraphrase', 'high', 'medium', 'low', 'none')
]

pairs_set = [frozenset((p['id1'], p['id2'])) for p in pairs_all]

# Collect the offending pairs so a failure reports what overlaps.
golden_lookup = set(gpairs_set)
overlap = [tuple(p) for p in pairs_set if p in golden_lookup]
assert not overlap, "golden-data pairs also present in pairs_all: {}".format(overlap[:5])

In [21]:
def getRandomPairs(counter, n):
    """Draw ``n`` distinct regular pairs, favouring pairs that were used less often.

    Args:
        counter: usage count per pair, aligned index-by-index with ``pairs_all``.
        n: number of distinct pairs to draw.

    Returns:
        A list of ``(pair, index)`` tuples. ``pair`` is a dict with the keys
        ``document_1`` and ``document_2`` (each ``{'id': ..., 'body': ...}``)
        and ``g_id`` set to 0; ``index`` is the pair's position in ``pairs_all``.

    Raises:
        ValueError: raised by ``np.random.choice`` when ``n`` is larger than
            ``len(pairs_all)`` or ``counter`` has a different length.
        IndexError: when a pair references an id that is not in ``documents_all``.
    """
    def body_of(document_id):
        # First document whose id matches; IndexError if there is none.
        return [d['text'] for d in documents_all if d['id'] == document_id][0]

    # Weight every pair by how far it is below 6 uses. The weight is floored at
    # 0.01 (the same floor getRandomRanking applies) so saturated pairs stay
    # drawable: with zero weights, sampling without replacement raises as soon
    # as fewer than n pairs have a non-zero probability.
    weights = [max((6 - c), 0.01) for c in counter]
    total = sum(weights)
    probabilities = [w / total for w in weights]

    # size=n honours the requested amount (it used to be hard-coded to 3).
    indexes = np.random.choice(
        a=list(range(len(pairs_all))), size=n, replace=False, p=probabilities
    )

    pairs = []
    for i in indexes:
        pair = pairs_all[i]
        document1 = {"id": pair['id1'], 'body': body_of(pair['id1'])}
        document2 = {"id": pair['id2'], 'body': body_of(pair['id2'])}
        pair_p = {'document_1': document1, 'document_2': document2, 'g_id': 0}
        pairs.append((pair_p, i))
    return pairs

def getRandomGoldenPairs():
    """Build two golden pairs whose relative relatedness is known.

    Two different golden sets and two different relation levels (out of
    'none', 'low', 'medium', 'high') are drawn at random. Each returned pair
    couples a set's reference document with that set's document at the drawn
    level. The pair built from the higher level gets ``g_id`` 2, the other 1.

    Returns:
        ``(pair_1, pair_2)``, each a dict with the keys ``document_1``,
        ``document_2`` and ``g_id``.
    """
    relation_levels = ['none', 'low', 'medium', 'high']
    first_set, second_set = random.sample(golden_data, 2)
    first_level, second_level = random.sample(list(range(len(relation_levels))), 2)

    def as_document(entry):
        # Golden entries store their text under 'text'; pairs expose it as 'body'.
        return {"id": entry['id'], 'body': entry['text']}

    def build_pair(golden_set, level, other_level):
        reference = as_document(golden_set['reference'])
        related = as_document(golden_set[relation_levels[level]])
        if level > other_level:
            g_id = 2
        else:
            g_id = 1
        return {'document_1': reference, 'document_2': related, 'g_id': g_id}

    pair_1 = build_pair(first_set, first_level, second_level)
    pair_2 = build_pair(second_set, second_level, first_level)
    return (pair_1, pair_2)

def generateDocumentPairSet(counter, n=3):
    """Assemble one set of pairs: regular pairs framed by two golden pairs.

    ``counter`` and ``n`` are forwarded to ``getRandomPairs``.

    Returns:
        ``(pair_set, indexes)``. ``pair_set`` is the first golden pair, then the
        regular pairs, then the second golden pair. ``indexes`` holds the
        ``pairs_all`` positions of the regular pairs, in the same order.
    """
    golden_first, golden_last = getRandomGoldenPairs()
    regular_pairs = []
    pair_indexes = []
    for (pair, index) in getRandomPairs(counter, n):
        regular_pairs.append(pair)
        pair_indexes.append(index)
    pair_set = [golden_first]
    pair_set.extend(regular_pairs)
    pair_set.append(golden_last)
    return pair_set, pair_indexes

def generateDocumentPairSets():
    """Keep generating pair sets until every pair of ``pairs_all`` was used at least 5 times.

    Returns:
        ``(documentSets, counter)``: the generated sets in creation order, and
        how often each ``pairs_all`` entry was placed in a set.
    """
    counter = [0 for _ in range(len(pairs_all))]
    documentSets = []
    # Continue while at least one pair is still below 5 appearances.
    while not all(uses >= 5 for uses in counter):
        documentSet, indexes = generateDocumentPairSet(counter)
        documentSets.append(documentSet)
        for pair_index in indexes:
            counter[pair_index] += 1
    return documentSets, counter

In [22]:
""" Generates the sets of pairs of documents. Each pair will go on one HIT. We make sure each pair appears in at least 5 hits."""
# Generate the document pair sets, then print how many sets were produced
# followed by the per-pair appearance counters.
(ds, c) = generateDocumentPairSets()
print("{} {}".format(len(ds), c))

# Uncomment this to overwrite
#with open('./data/DocumentPairSetsForHITS.json','w+') as f:
#    json.dump(ds,f,indent=4)

203 [6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]


In [19]:
def getRandomRanking(counter, target):
    """Draw one ranking task: a main document plus three documents paired with it.

    The main document is the ``id1`` of a pair drawn from ``pairs_all``, where
    pairs further below ``target`` uses are more likely. Three of the pairs
    whose ``id1`` is that main document are then drawn without replacement,
    weighted the same way.

    Args:
        counter: usage count per pair, aligned index-by-index with ``pairs_all``.
        target: desired number of uses per pair.

    Returns:
        ``(ranking, indexes)``. ``ranking`` is
        ``{'main_document': {...}, 'documents': [...]}`` where every listed
        document has ``g_id`` 0; ``indexes`` holds the ``pairs_all`` positions
        of the three drawn pairs, in the same order as ``documents``.
    """
    def need(uses):
        # Pairs that already reached the target keep a small non-zero weight.
        return max((target - uses), 0.01)

    def body_of(document_id):
        return [d['text'] for d in documents_all if d['id'] == document_id][0]

    def normalised(weights):
        total = sum(weights)
        return [w / total for w in weights]

    # Step 1: pick the main document through a weighted draw over all pairs.
    pair_probabilities = normalised([need(uses) for uses in counter])
    drawn_pair = np.random.choice(a=pairs_all, p=pair_probabilities)
    main_document = {"id": drawn_pair['id1']}
    main_document['body'] = body_of(main_document['id'])

    # Step 2: among the pairs starting at the main document, draw three.
    candidate_indexes = [
        position
        for position in range(len(pairs_all))
        if pairs_all[position]['id1'] == main_document['id']
    ]
    candidate_probabilities = normalised([need(counter[position]) for position in candidate_indexes])
    drawn_indexes = np.random.choice(
        a=candidate_indexes, size=(1, 3), replace=False, p=candidate_probabilities
    )[0]

    documents = []
    indexes = []
    for position in drawn_indexes:
        other_id = pairs_all[position]['id2']
        documents.append({'id': other_id, 'body': body_of(other_id), 'g_id': 0})
        indexes.append(position)

    return {'main_document': main_document, 'documents': documents}, indexes

def getRandomGoldenRanking():
    """Build a ranking task from one random golden set, with a known best and worst answer.

    The reference document of the set is the main document. It is ranked
    against three documents of the same set: the 'none' one, either the 'low'
    or the 'medium' one (chosen at random), and the 'paraphrase' one. The
    'high' document is never used here.

    Returns:
        ``{'main_document': {...}, 'documents': [...]}``. In ``documents`` the
        lowest level carries ``g_id`` 1, the highest ``g_id`` 2 and the one in
        between ``g_id`` 0.
    """
    relation_levels = ['none', 'low', 'medium', 'high', 'paraphrase']
    gset = random.choice(golden_data)
    levels = [0, random.choice([1, 2]), 4]

    reference = gset['reference']
    main_document = {'id': reference['id'], 'body': reference['text']}

    lowest = min(levels)
    highest = max(levels)

    def marker_for(level):
        # 1 flags the least related document, 2 the most related, 0 the rest.
        if level == lowest:
            return 1
        if level == highest:
            return 2
        return 0

    documents = [
        {
            'id': gset[relation_levels[level]]['id'],
            'body': gset[relation_levels[level]]['text'],
            'g_id': marker_for(level),
        }
        for level in levels
    ]

    return {'main_document': main_document, 'documents': documents}

def generateDocumentRankingSet(counter, target, n=4):
    """Assemble one set of rankings: a golden ranking followed by ``n`` regular ones.

    Regular rankings are drawn with ``getRandomRanking`` until ``n`` of them
    with pairwise different main documents have been collected; a draw whose
    main document is already in the set is discarded.

    Returns:
        ``(ranking_set, indexes)``. ``ranking_set`` starts with the golden
        ranking, followed by the regular rankings. ``indexes`` has one entry
        per regular ranking: the index list ``getRandomRanking`` returned with it.
    """
    granking = getRandomGoldenRanking()

    def main_id_of(drawn):
        return drawn[0]['main_document']['id']

    accepted = []
    while True:
        candidate = getRandomRanking(counter=counter, target=target)
        already_used = any(main_id_of(candidate) == main_id_of(kept) for kept in accepted)
        if not already_used:
            accepted.append(candidate)
            if len(accepted) == n:
                break

    ranking_set = [granking]
    index_lists = []
    for (ranking, ranking_indexes) in accepted:
        ranking_set.append(ranking)
        index_lists.append(ranking_indexes)
    return ranking_set, index_lists

def generateDocumentRankingSets(target=25):
    """Keep generating ranking sets until every pair of ``pairs_all`` was used at least ``target`` times.

    Args:
        target: minimum number of appearances required for every pair
            (defaults to 25, the value that used to be hard-coded).

    Returns:
        ``(documentSets, counter)``. Each entry of ``documentSets`` is
        ``{'_id': <uuid4 as string>, 'documents': <ranking set>}``; ``counter``
        holds how often each ``pairs_all`` entry appeared in a ranking.
    """
    counter = [0] * len(pairs_all)
    documentSets = []
    while any(uses < target for uses in counter):
        documentSet, indexes = generateDocumentRankingSet(counter=counter, target=target)
        # Store the id as a string: a uuid.UUID object is not JSON serialisable,
        # so json.dump() on the result would raise TypeError.
        documentSets.append({'_id': str(uuid.uuid4()), 'documents': documentSet})
        # indexes holds one list of pair indexes per ranking in the set.
        for ranking_indexes in indexes:
            for pair_index in ranking_indexes:
                counter[pair_index] += 1
    return documentSets, counter

In [20]:
""" Same as with the pairs. We make sure each ranking appears at least in a target amount of HITs (default is 25, see above)"""
# Generate the document ranking sets, then print how many sets were produced
# followed by the per-pair appearance counters.
(ds, c) = generateDocumentRankingSets()
print("{} {}".format(len(ds), c))

# Uncomment this to overwrite
#with open('./data/DocumentRankingSetsForHITS.json','w+') as f:
#    json.dump(ds,f,indent=4)

219 [25, 25, 25, 25, 26, 25, 25, 25, 25, 25, 25, 26, 25, 26, 25, 25, 25, 25, 25, 25, 25, 26, 25, 25, 26, 25, 26, 27, 26, 26, 26, 25, 26, 27, 25, 27, 27, 25, 26, 25, 27, 25, 27, 26, 26, 25, 27, 26, 27, 25, 25, 28, 27, 25, 26, 27, 26, 25, 25, 25, 26, 26, 25, 25, 25, 26, 25, 25, 25, 25, 26, 25, 25, 25, 25, 28, 26, 27, 26, 27, 28, 27, 30, 26, 27, 27, 27, 28, 25, 27, 25, 26, 25, 25, 26, 25, 26, 25, 25, 25, 26, 25]
