### This notebook outputs the difference between expansion terms of two given methods

In [1]:
import json
import tqdm
from pathlib import Path

In [2]:
def queries_exp_terms_list_to_dict(queries_exp_terms_list):
    queries_exp_terms_dict = {}
    for i in range(len(queries_exp_terms_list)):
        queries_exp_terms_dict[queries_exp_terms_list[i]['topicNumber']] = queries_exp_terms_list[i]
    return queries_exp_terms_dict

In [3]:
def methods_exp_terms_diff(method1_terms, method2_terms):
    elements_diff = set()
    method1_exclude_2 = set(method1_terms) - set(method2_terms)
    method2_exclude_1 = set(method2_terms) - set(method1_terms)
    elements_diff.update(method1_exclude_2)
    elements_diff.update(method2_exclude_1)
    return method1_exclude_2, method2_exclude_1

In [32]:
# Get the difference between the topk expansion terms of (method1 vs. method2) and (method2 vs. method1) 
def get_method_diff(method1, method2, topk=10):
    # iterate over queries
    diff_per_q = {}
    for q in method1:
        if q not in method2:
            print(q)
            continue
        method1_exp_terms = []
        method2_exp_terms = []
        for i in range(topk):
            method1_exp_terms.append(method1[q]['terms'][i]['word'])
            method2_exp_terms.append(method2[q]['terms'][i]['word'])
        method1_ex_2, method2_ex_1 = methods_exp_terms_diff(method1_exp_terms[:topk], method2_exp_terms[:topk])
        diff_per_q.setdefault(q, {})
        # method1_ex_2: method1 expansion terms excluding method2 expansion terms (expansion terms only in method1)
        # method2_ex_1: method2 expansion terms excluding method1 expansion terms (expansion terms only in method2)
        diff_per_q[q] = {'topicNumber':q, 'diff_terms':{'method1_ex_2':method1_ex_2, 'method2_ex_1':method2_ex_1}} 
    return diff_per_q

## Example

### Read query into a dictionary of {q_id:q_text}

In [13]:
queries_file = '../data/robust/stopped_queries_lower.txt'

In [14]:
queries = {}
with open(queries_file) as f:
    for line in tqdm.tqdm(f):
        q_id , q_text = line.strip().split('\t')
        queries[q_id] = q_text

250it [00:00, 191345.99it/s]


### Read expansion terms into a dictionary of {q_id: [expterm1, expterm2, ...]}

In [15]:
expterms_path = '../data/robust/expansion_terms/'

In [16]:
ceqe_maxpool = queries_exp_terms_list_to_dict(json.load(open(expterms_path+'bm25.ceqe-maxpool.pretrained.json')))

In [29]:
rm3_exp_terms = queries_exp_terms_list_to_dict(json.load(open(expterms_path+'bm25.rm3.json')))

In [30]:
len(ceqe_maxpool.keys())

250

In [31]:
len(rm3_exp_terms)

250

### Print the difference between expansion terms

In [33]:
diff_ceqeMaxpool_rm3 = get_method_diff(ceqe_maxpool, rm3_exp_terms, topk=20)

In [37]:
for q_id in queries:
    print(q_id, queries[q_id])
    print("Expansion terms only in rm3: ",diff_ceqeMaxpool_rm3[q_id]['diff_terms']['method1_ex_2'])
    print("Expansion terms only in Glove", diff_ceqeMaxpool_rm3[q_id]['diff_terms']['method2_ex_1'])
    print('---'*10)

301 international organized crime
Expansion terms only in rm3:  {'1993', 'europe', 'crimes', 'intelligence', 'world', 'east'}
Expansion terms only in Glove {'states', 'cooperation', 'economic', 'control', 'new', 'interior'}
------------------------------
302 poliomyelitis post polio
Expansion terms only in rm3:  {'problems', 'ipv', 'immunization', 'opv', 'person', 'china', 'people', 'million', 'year'}
Expansion terms only in Glove {'declining', 'hemisphere', 'states', 'areas', 'programme', 'mass', 'cent', 'gujarat', 'annually'}
------------------------------
303 hubble telescope achievements
Expansion terms only in rm3:  {'flight', 'optical', 'astronauts', 'hubbles', 'mirrors', 'telescopes'}
Expansion terms only in Glove {'two', 'april', 'batteries', 'three', 'new', 'years'}
------------------------------
304 endangered species mammals
Expansion terms only in rm3:  {'regulations', 'mammal', 'resources', 'fish', 'information', 'protected'}
Expansion terms only in Glove {'take', 'taking'