### TREC overview of the results of a run

In [1]:
def success_at_1 (relevant, retrieved):
    if len(retrieved) > 0 and retrieved[0] in relevant:
        return 1
    else:
        return 0
    
def success_at_5(relevant, retrieved):

    for item in range(0, 5):
        if retrieved[item] in relevant:
            return 1
    return 0

def success_at_10(relevant, retrieved):

    for item in range(0, 10):
        if retrieved[item] in relevant:
            return 1
    return 0

def precision_at_k(relevant, retrieved, k):

    relevant_at_k = []
    for doc in retrieved[:k]:
        if doc in relevant:
            relevant_at_k.append(doc)
    return len(relevant_at_k) / k

def r_precision(relevant, retrieved):

    R = len(relevant)
    relevant_at_R = []
    for doc in retrieved[:R]:
        if doc in relevant:
            relevant_at_R.append(doc)
    if R>0:
        return len(relevant_at_R) / R
    else:
        return 0.0
    
def average_precision(relevant, retrieved):

    relevant_retrieved = 0
    precision_sum = 0.0
    
    for i, doc in enumerate(retrieved):
        if doc in relevant:
            relevant_retrieved += 1
            precision_sum += relevant_retrieved / (i + 1)
    
    if relevant_retrieved == 0:
        return 0.0
    
    return precision_sum / relevant_retrieved
    
def interpolated_precision_at_recall_X (relevant, retrieved, X):

    relevant_retrieved = 0
    precision_at_recall = []

    for i, doc in enumerate(retrieved):
        if doc in relevant:
            relevant_retrieved += 1
        recall = relevant_retrieved / len(relevant)
        precision = relevant_retrieved / (i + 1)
        if recall >= X:
            precision_at_recall.append(precision)
    if precision_at_recall:
        return max(precision_at_recall)
    else:
        return 0.0
    
def mean_metric(measure, all_relevant, all_retrieved):
    total = 0
    count = 0
    for qid in all_relevant:
        relevant  = all_relevant[qid]
        retrieved = all_retrieved.get(qid, [])
        value = measure(relevant, retrieved)
        total += value
        count += 1
    return "mean " + measure.__name__, total / count


def trec_eval(qrels_file, run_file):

    def precision_at_1(rel, ret): return precision_at_k(rel, ret, k=1)
    def precision_at_5(rel, ret): return precision_at_k(rel, ret, k=5)
    def precision_at_10(rel, ret): return precision_at_k(rel, ret, k=10)
    def precision_at_50(rel, ret): return precision_at_k(rel, ret, k=50)
    def precision_at_100(rel, ret): return precision_at_k(rel, ret, k=100)
    def precision_at_recall_00(rel, ret): return interpolated_precision_at_recall_X(rel, ret, X=0.0)
    def precision_at_recall_01(rel, ret): return interpolated_precision_at_recall_X(rel, ret, X=0.1)
    def precision_at_recall_02(rel, ret): return interpolated_precision_at_recall_X(rel, ret, X=0.2)
    def precision_at_recall_03(rel, ret): return interpolated_precision_at_recall_X(rel, ret, X=0.3)
    def precision_at_recall_04(rel, ret): return interpolated_precision_at_recall_X(rel, ret, X=0.4)
    def precision_at_recall_05(rel, ret): return interpolated_precision_at_recall_X(rel, ret, X=0.5)
    def precision_at_recall_06(rel, ret): return interpolated_precision_at_recall_X(rel, ret, X=0.6)
    def precision_at_recall_07(rel, ret): return interpolated_precision_at_recall_X(rel, ret, X=0.7)
    def precision_at_recall_08(rel, ret): return interpolated_precision_at_recall_X(rel, ret, X=0.8)
    def precision_at_recall_09(rel, ret): return interpolated_precision_at_recall_X(rel, ret, X=0.9)
    def precision_at_recall_10(rel, ret): return interpolated_precision_at_recall_X(rel, ret, X=1.0)

    (all_relevant, all_retrieved) = read_eval_files(qrels_file, run_file)
    
    unknown_qids = set(all_retrieved.keys()).difference(all_relevant.keys())
    if len(unknown_qids) > 0:
        raise ValueError("Unknown qids in run: {}".format(sorted(list(unknown_qids))))

    metrics = [success_at_1,
               success_at_5,
               success_at_10,
               r_precision,
               precision_at_1,
               precision_at_5,
               precision_at_10,
               precision_at_50,
               precision_at_100,
               precision_at_recall_00,
               precision_at_recall_01,
               precision_at_recall_02,
               precision_at_recall_03,
               precision_at_recall_04,
               precision_at_recall_05,
               precision_at_recall_06,
               precision_at_recall_07,
               precision_at_recall_08,
               precision_at_recall_09,
               precision_at_recall_10,
               average_precision]

    return [mean_metric(metric, all_relevant, all_retrieved) for metric in metrics]


def print_trec_eval(qrels_file, run_file):
    results = trec_eval(qrels_file, run_file)
    print("Results for {}".format(run_file))
    for (metric, score) in results:
        print("{:<30} {:.4}".format(metric, score))

### Read the file

In [2]:
def read_qrels_file(qrels_file):  # reads the content of he qrels file
    trec_relevant = dict()  # query_id -> set([docid1, docid2, ...])
    with open(qrels_file, 'r') as qrels:
        for line in qrels:
            (qid, q0, doc_id, rel) = line.strip().split()
            if qid not in trec_relevant:
                trec_relevant[qid] = set()
            if (rel == "1"):
                trec_relevant[qid].add(doc_id)
    return trec_relevant

def read_run_file(run_file):  
    # read the content of the run file produced by our IR system 
    # (in the following exercises you will create your own run_files)
    trec_retrieved = dict()  # query_id -> [docid1, docid2, ...]
    with open(run_file, 'r') as run:
        for line in run:
            (qid, q0, doc_id, rank, score, tag) = line.strip().split()
            if qid not in trec_retrieved:
                trec_retrieved[qid] = []
            trec_retrieved[qid].append(doc_id) 
    return trec_retrieved
    

def read_eval_files(qrels_file, run_file):
    return read_qrels_file(qrels_file), read_run_file(run_file)

(all_relevant, all_retrieved) = read_eval_files('data01/FIR-s05-training-qrels.txt', 'data01/baseline.run')

### Collection indexing

In [3]:
import elasticsearch
import elasticsearch.helpers
import json

def read_documents(file_name):
    """
    Returns a generator of documents to be indexed by elastic, read from file_name
    """
    with open(file_name, 'r') as documents:
        for line in documents:
            doc_line = json.loads(line)
            if ('index' in doc_line):
                id = doc_line['index']['_id']
            elif ('PMID' in doc_line):
                doc_line['_id'] = id
                yield doc_line
            else:
                raise ValueError('Woops, error in index file')

def create_index(es, index_name, body={}):
    # delete index when it already exists
    es.indices.delete(index=index_name, ignore=[400, 404])
    # create the index 
    es.indices.create(index=index_name, body=body)
                
def index_documents(es, collection_file_name, index_name, body={}):
    create_index(es, index_name, body)
    # bulk index the documents from file_name
    return elasticsearch.helpers.bulk(
        es, 
        read_documents(collection_file_name),
        index=index_name,
        chunk_size=2000,
        request_timeout=30
    )

### Making your own TREC run

In [4]:
#THIS IS GRADED!

def make_trec_run(es, topics_file_name, run_file_name, index_name="genomics", run_name="test"):
    with open(run_file_name, 'w') as run_file:
        with open(topics_file_name, 'r') as test_queries:
            for line in test_queries:
                (qid, query) = line.strip().split('\t')
                # BEGIN ANSWER
                body = {
                    "query":{
                        "multi_match":{
                            "query":query,
                            "fields":["TI"]
                        }
                    }
                }
                
                response = es.search(index = index_name, body = body,size =1000)
                                    
                rank = 0
                for hit in response['hits']['hits']:
                    pmid = hit['_source']['PMID']
                    score = hit['_score']

                    run_file.write("{} Q0 {} {} {} {}\n".format(str(qid), str(pmid), str(rank), str(score), run_name))
                    rank+=1
                # END ANSWER
                

### Jelinek-Mercer Model

In [8]:
#THIS IS GRADED!

lmjelinekmercer = {
  # BEGIN ANSWER
    "settings": {
        "index":{
            "similarity":{
                "LMJ_similarity":{
                    "type":"LMJelinekMercer",
                    "lambda":0.2
                }

            },
            "number_of_shards": 1
        }
    },
    "mappings": {
        "properties": {
            "TI": {
                "type": "text",
                "copy_to": "all",
                "similarity": "LMJ_similarity"
            },
            "AB": {
                "type": "text",
                "copy_to": "all",
                "similarity": "LMJ_similarity"
            },
            "all": {
                "type": "text",
                "similarity": "LMJ_similarity"
            }
        }
    }
  # END ANSWER
}


In [9]:
es = elasticsearch.Elasticsearch('localhost',timeout=60)

index_documents(es, 'data01/FIR-s05-medline.json', 'genomics-jm', body=lmjelinekmercer)
make_trec_run(es, 'data01/FIR-s05-training-queries-simple.txt', 'lmjelinekmercer.run', 'genomics-jm')



### N-gram Tokenizer with Jelinek-Mercer Model

In [13]:
ngram = {
    "settings": {
    "index": {
        "max_ngram_diff": 10,
        "number_of_shards": 1,
        "similarity": {
            "LMJ_similarity": {
                "type": "LMJelinekMercer",
                "lambda": 0.2
            }
        }
    },
    "analysis": {
        "tokenizer": {
            "ngram_tokenizer": {
                "type": "ngram",
                "min_gram": 3,
                "max_gram": 10,
                "token_chars": ["letter", "digit"]
            }
        },
      "analyzer": {
          "ngram_analyzer": {
              "type": "custom",
              "tokenizer": "ngram_tokenizer",
              "filter": ["lowercase"]
          }
      }
    }
    },
    "mappings": {
        "properties": {
            "TI": {
                "type": "text",
                "analyzer": "ngram_analyzer",
                "similarity": "LMJ_similarity",
                "copy_to": "all"
            },
            "AB": {
                "type": "text",
                "analyzer": "ngram_analyzer",
                "similarity": "LMJ_similarity",
                "copy_to": "all"
            },
            "all": {
                "type": "text",
                "similarity": "LMJ_similarity"
            }
        }
    }
}

In [14]:
es = elasticsearch.Elasticsearch('localhost',timeout=60)

index_documents(es, 'data01/FIR-s05-medline.json', 'genomics_ngram', body = ngram)
make_trec_run(es, 'data01/FIR-s05-training-queries-simple.txt', 'ngram.run', "genomics_ngram")

### BM25 Score

In [16]:
body = {
  "query": {
    "match" : { "TI" : "structure refinement" }
  }
}
explain = es.explain(index="genomics-jm", id="3", body=body)
print ("BM25 score with lmjelinekmercer computed by Elasticsearch:",explain['explanation']['value'])  # BM25 score computed by ElasticSearch

BM25 score with lmjelinekmercer computed by Elasticsearch: 14.918538


In [17]:
body = {
  "query": {
    "match" : { "TI" : "structure refinement" }
  }
}

explain = es.explain(index="genomics_ngram", id="3", body=body)
print("BM25 score with ngram_analyzer computed by Elasticsearch:", explain['explanation']['value'])

BM25 score with ngram_analyzer computed by Elasticsearch: 385.84772


In [18]:
def perform_query_on_base(es, query_text):
    """
    Perform a search on the genomics-base index and return the BM25 score and retrieved documents
    """
    search_body = {
        "query": {
            "match": {
                "TI": query_text
            }
        }
    }
    response = es.search(index="genomics-jm", body=search_body)
    return response

result_base = perform_query_on_base(es, "structure refinement")

scores_base = [(hit['_id'], hit['_score']) for hit in result_base['hits']['hits']]

print("\nResults for genomics-jm tokenization:")
for doc_id, score in scores_base:
    print(f"Doc ID: {doc_id}, BM25 Score: {score}")


Results for genomics-jm tokenization:
Doc ID: 36131, BM25 Score: 16.557785
Doc ID: 36137, BM25 Score: 16.500984
Doc ID: 144161, BM25 Score: 15.938251
Doc ID: 488190, BM25 Score: 15.809368
Doc ID: 48538, BM25 Score: 15.688313
Doc ID: 219434, BM25 Score: 15.466249
Doc ID: 3, BM25 Score: 14.918538
Doc ID: 289779, BM25 Score: 14.692461
Doc ID: 198386, BM25 Score: 11.182945
Doc ID: 130401, BM25 Score: 10.846478


In [19]:
def perform_query_on_base(es, query_text):
    """
    Perform a search on the genomics-base index and return the BM25 score and retrieved documents
    """
    search_body = {
        "query": {
            "match": {
                "TI": query_text
            }
        }
    }
    response = es.search(index="genomics_ngram", body=search_body)
    return response

result_base = perform_query_on_base(es, "structure refinement")

scores_base = [(hit['_id'], hit['_score']) for hit in result_base['hits']['hits']]

print("\nResults for genomics_ngram tokenization:")
for doc_id, score in scores_base:
    print(f"Doc ID: {doc_id}, BM25 Score: {score}")


Results for genomics_ngram tokenization:
Doc ID: 48538, BM25 Score: 441.34824
Doc ID: 36137, BM25 Score: 421.822
Doc ID: 144161, BM25 Score: 415.12033
Doc ID: 488190, BM25 Score: 410.2574
Doc ID: 36131, BM25 Score: 408.253
Doc ID: 219434, BM25 Score: 402.56943
Doc ID: 171805, BM25 Score: 401.3041
Doc ID: 182229, BM25 Score: 398.067
Doc ID: 31435, BM25 Score: 396.339
Doc ID: 404615, BM25 Score: 389.55188


### Model Comparison

In [15]:
print_trec_eval('data01/FIR-s05-training-qrels.txt', 'ngram.run')
print("\n")
print_trec_eval('data01/FIR-s05-training-qrels.txt', 'lmjelinekmercer.run')

Results for ngram.run
mean success_at_1              0.05263
mean success_at_5              0.1579
mean success_at_10             0.2105
mean r_precision               0.0335
mean precision_at_1            0.05263
mean precision_at_5            0.03684
mean precision_at_10           0.02895
mean precision_at_50           0.01474
mean precision_at_100          0.008421
mean precision_at_recall_00    0.1029
mean precision_at_recall_01    0.0996
mean precision_at_recall_02    0.09241
mean precision_at_recall_03    0.08849
mean precision_at_recall_04    0.06121
mean precision_at_recall_05    0.05289
mean precision_at_recall_06    0.03606
mean precision_at_recall_07    0.03212
mean precision_at_recall_08    0.02542
mean precision_at_recall_09    0.02531
mean precision_at_recall_10    0.02531
mean average_precision         0.07881


Results for lmjelinekmercer.run
mean success_at_1              0.1579
mean success_at_5              0.1842
mean success_at_10             0.2368
mean r_precisio