In [69]:
def get_topics(topics_file):
    queries = {}
    with open(topics_file, 'rt') as f:
        for line in f:
            q = json.loads(line)
            qid = q["qid"]
            queries[qid] = q
    return queries

In [70]:
def candidate_sets(topics_file):
    topic_doc_sets = {}
    queries = get_topics(topics_file)
    for id, query in queries.items():
        doc_set = set([])
        docs = query['documents']
        for d in docs:
            doc_set.add(d['doc_id'])
        topic_doc_sets[id] = doc_set
    return topic_doc_sets

In [94]:
''' refine PyTerrier rankings to: include missing docs, exclude docs not in candidate set, query scores sum to 1 

    you will also have to check that all the topics (qids) have documents in the ranking. 
    There was at least 1 topic that Terrier returned no documents for. You can just assign random relevance 
    scores for documents for queries that have no documents returned.     
'''
def get_relevance(res_location, topics_file):
    final_ranking = []
    candidates = candidate_sets(topics_file)
    current_topic = '0'
    with open(res_location) as f:
        topic_rank = []
        topic_scores = []
        for line in f:
            parts = line.strip().split()
            topic = int(parts[0])
            docno = parts[2]
            score = float(parts[4])
            
            if topic != current_topic:
                '''first document of topic found'''
                if current_topic != '0':
                    '''check remaining docs at end of a topic'''
                    for doc in topic_candidates:
                        topic_rank.append(doc)
                        topic_scores.append(topic_scores[-1]*0.9)
                    tscore = sum(topic_scores)
                    topic_scores = [w/tscore for w in topic_scores]
                    for i, d in enumerate(topic_rank):
                        final_ranking.append(str(current_topic) + " Q0 " + d + " " + str(i) + " " + str(topic_scores[i]) + " " + parts[5])
                    
                current_topic = topic
                topic_candidates = candidates[current_topic]
                topic_rank = []
                topic_scores = []
                
                inset = docno in topic_candidates
                if inset:
                    topic_rank.append(docno)
                    topic_scores.append(score)
                    topic_candidates.remove(docno)
                    print(topic_scores)
            
            else:
                inset = docno in topic_candidates
                if inset:
                    topic_rank.append(docno)
                    topic_scores.append(score)
                    topic_candidates.remove(docno)
                    print(topic_scores)
   
    for doc in topic_candidates:
        topic_rank.append(doc)
        topic_scores.append(topic_scores[-1]*0.9)
    tscore = sum(topic_scores)
    topic_scores = [w/tscore for w in topic_scores]
    for i, d in enumerate(topic_rank):
        final_ranking.append(str(current_topic) + " Q0 " + d + " " + str(i) + " " + str(topic_scores[i]) + " " + parts[5])
            
    return final_ranking


In [72]:
def write_res(res, res_path):
    with open(res_path, 'wt') as f:
        for inst in res:
            f.write(inst + "\n")

In [97]:
ranking = get_relevance("tfidf_1.res" , "TREC-Fair-Ranking-training-sample.json")
write_res(ranking, "new.res")

[10.198426177299014]
[10.198426177299014, 9.993020656139358]
[10.198426177299014, 9.993020656139358, 9.943577060979429]
[10.198426177299014, 9.993020656139358, 9.943577060979429, 9.885493144055346]
[10.198426177299014, 9.993020656139358, 9.943577060979429, 9.885493144055346, 9.859026276239746]
[10.198426177299014, 9.993020656139358, 9.943577060979429, 9.885493144055346, 9.859026276239746, 9.855139484406328]
[10.198426177299014, 9.993020656139358, 9.943577060979429, 9.885493144055346, 9.859026276239746, 9.855139484406328, 9.845174977699617]
[10.198426177299014, 9.993020656139358, 9.943577060979429, 9.885493144055346, 9.859026276239746, 9.855139484406328, 9.845174977699617, 9.832836888857017]
[10.198426177299014, 9.993020656139358, 9.943577060979429, 9.885493144055346, 9.859026276239746, 9.855139484406328, 9.845174977699617, 9.832836888857017, 9.71648636312768]
[10.198426177299014, 9.993020656139358, 9.943577060979429, 9.885493144055346, 9.859026276239746, 9.855139484406328, 9.8451749776

In [None]:
ranking = get_relevance("path/to/your/PyTerrier.res", "path/to/your/TREC-Fair-Ranking-training-sample.json")
write_res(ranking, "path/to/save/new.res")