In [None]:
# Text preprocessing helpers shared by the query/entity preprocessing cells:
# the NLTK English stopword list and a Porter stemmer instance.
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_list=stopwords.words('english')
porter = PorterStemmer()
import urllib
import requests
import json
from collections import Counter
from IPython.display import clear_output,display

# Elasticsearch client created with the library's default connection settings.
from elasticsearch import Elasticsearch
es = Elasticsearch()

from pprint import pprint
import math
import pandas as pd

# Names of the two indices queried below: `index_term` is used for the
# term-based (MLM/SDM) scoring, `index_entity` for the ELR scoring.
index_term="test"
index_entity="test_entities"

In [None]:
def modify_text(text):
    """Analyze `text` with the index's `my_english_analyzer` and re-join the tokens.

    Every occurrence of the literal "@en" is removed before the text is sent
    to the analyze API of `index_term`. The resulting tokens are returned as
    one string separated by single spaces.
    """
    cleaned = text.replace("@en", '')
    request_body = {'analyzer': 'my_english_analyzer', 'text': cleaned}
    response = es.indices.analyze(index=index_term, body=request_body)

    analyzed = []
    for entry in response['tokens']:
        analyzed.append(entry['token'])
    return " ".join(analyzed)

In [None]:
#Loading the train queries

# Paths of the two tab-separated query files; only QUERIES_FILE_2 is loaded below.
QUERIES_FILE = "data/queries.txt"
QUERIES_FILE_2 = "data/queries2.txt"

def load_queries(query_file, encoding="utf-8"):
    """Load queries from a tab-separated file into a {query_id: query} dict.

    Each non-blank line must hold at least two tab-separated columns: the
    query id and the query text. Any further columns are ignored.

    Args:
        query_file: Path of the file to read.
        encoding: Text encoding of the file (explicit so the result does not
            depend on the platform's default locale).

    Returns:
        dict mapping query id to query text, in file order.

    Raises:
        ValueError: if a non-blank line has no tab-separated query column.
    """
    queries = {}
    with open(query_file, "r", encoding=encoding) as fin:
        for line_no, line in enumerate(fin, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at end of file) are not queries.
                continue
            columns = stripped.split("\t")
            if len(columns) < 2:
                raise ValueError(
                    "{}:{}: expected '<query id>\\t<query>', got {!r}".format(query_file, line_no, stripped)
                )
            queries[columns[0]] = columns[1]
    return queries

# Load the second query set as a {query_id: query_text} dict.
queries = load_queries(QUERIES_FILE_2)
#queries

In [None]:
# queries

In [None]:
#Preprocessing the queries (stemming, stopword removal)

def preprocess(queries):
    """Return a new {query_id: query} dict with stopwords removed and terms stemmed.

    Each query is split on whitespace; terms found in `stop_list` are dropped
    and the remaining terms are stemmed with `porter` and re-joined with
    single spaces.

    The input dict is left untouched: overwriting it in place would make the
    raw queries unrecoverable and would stem the queries a second time
    whenever the calling cell is re-run.

    Args:
        queries: dict mapping query id to raw query text.

    Returns:
        A new dict mapping query id to the preprocessed query string.
    """
    stop_words = set(stop_list)
    processed = {}
    for q_id, query in queries.items():
        # The stopword list is lowercase, so compare case-insensitively;
        # otherwise a capitalised "The" would survive the filter.
        kept = [porter.stem(term) for term in query.split() if term.lower() not in stop_words]
        processed[q_id] = " ".join(kept)
    return processed

# Stopword-filtered, stemmed version of the loaded queries, keyed by query id.
preprocessed_q=preprocess(queries)
#preprocessed_q

In [None]:
# preprocessed_q

## Calculating MLM score

In [None]:

def get_prob(index, term,field):
    """Collection language-model probability of `term` in `field`.

    Looks up one document matching `term` in `field`, requests that
    document's term vector with global term statistics, and returns
    ttf(term) / sum_ttf(field). Returns 0 when no document matches.
    """
    response = es.search(index=index, q=term, df=field, _source=False, size=1)
    matches = response.get("hits", {}).get("hits", {})
    if not len(matches) > 0:
        return 0

    doc_id = matches[0]["_id"]
    if doc_id is None:
        return 0

    # Term statistics are collection-wide, so any matching document's
    # term vector can serve as the source for them.
    vectors = es.termvectors(index=index, doc_type="_doc", id=doc_id, fields=field, term_statistics=True)
    field_vector = vectors["term_vectors"][field]
    term_entry = field_vector["terms"].get(term, {})
    collection_count = term_entry.get("ttf", 0)
    total_count = field_vector["field_statistics"]["sum_ttf"]
    return collection_count / total_count

In [None]:
#MLM score for each document-query pair

def get_mlm_score(query,index, doc_id, fields, weights, mu=2000):
    """Score one document against a query over several weighted fields.

    For every query term t and every field f that contains t, the
    Dirichlet-smoothed field language model is

        P(t | theta_f) = (tf(t, f) + mu * P(t | C_f)) / (|f| + mu)

    where tf is the term frequency in the document field, |f| the field
    length (sum of term frequencies) and P(t | C_f) the collection
    probability of t in that field. The returned score is

        sum over t, f of  weight_f * log(1 + P(t | theta_f))

    so it is non-negative; a term that does not occur in a field (or a field
    the document does not have) contributes nothing.

    Args:
        query: Preprocessed query string (whitespace-separated terms).
        index: Name of the Elasticsearch index.
        doc_id: Id of the document to score.
        fields: Field names to score over.
        weights: One weight per entry of `fields`, in the same order.
        mu: Dirichlet smoothing parameter.

    Returns:
        The score as a number (0 when nothing matched).

    Raises:
        IndexError: if `weights` is shorter than `fields`.
    """
    clear_output()
    field_weight = {field: weights[i] for i, field in enumerate(fields)}

    tv = es.termvectors(index=index, doc_type="_doc", id=doc_id, fields=fields, term_statistics=True).get('term_vectors', {})
    print(query, list(tv.keys()),doc_id)

    # Field lengths do not depend on the query term: compute them once.
    field_len = {
        field: sum(stats['term_freq'] for stats in tv[field]['terms'].values())
        for field in fields
        if field in tv
    }

    score = 0
    for term in query.split():  # main summation over query terms
        for field in fields:
            if field not in tv:
                print("No", field)
                continue

            term_stats = tv[field]["terms"].get(term)
            if term_stats is None:
                # Term absent from this document field: no contribution.
                continue

            # The term vector was requested with term_statistics=True, so it
            # already carries the collection statistics; fall back to
            # get_prob (two extra requests) only if they are missing.
            ttf = term_stats.get("ttf")
            sum_ttf = tv[field].get("field_statistics", {}).get("sum_ttf", 0)
            if ttf is not None and sum_ttf > 0:
                p_collection = ttf / sum_ttf
            else:
                p_collection = get_prob(index, term, field)

            # Dirichlet smoothing: raw term frequency in the numerator and
            # (field length + mu) in the denominator.
            p_smoothed = (term_stats["term_freq"] + mu * p_collection) / (field_len[field] + mu)
            if p_smoothed > 0:
                score += field_weight[field] * math.log(p_smoothed + 1)
    return score

In [None]:

def search(indexname, query, field, size, api=None):
    """Run a URI search against the Elasticsearch REST API and return the parsed JSON.

    Args:
        indexname: Index to search.
        query: Query string (sent as the `q` parameter).
        field: Default field for the query (sent as `df`).
        size: Maximum number of hits to return.
        api: Base URL of the Elasticsearch REST endpoint. When omitted, the
            module-level `API` is used.
            NOTE(review): `API` is not defined in the visible cells of this
            notebook, so calling without `api` raises NameError unless it is
            defined elsewhere.

    Returns:
        The decoded JSON response body.
    """
    base_url = API if api is None else api
    url = "/".join([base_url, indexname, "_search"])
    # Let requests build the query string; this avoids relying on
    # `urllib.parse` having been loaded by a bare `import urllib`.
    response = requests.get(url, params={"q": query, "df": field, "size": size})
    return response.json()


def rerank(preprocessed_q,fields,weights):
    """Retrieve candidates per query and score each with `get_mlm_score`.

    For every query, up to 150 documents are fetched from `index_term` by
    searching the first entry of `fields`; each candidate is then scored
    over all `fields` with the given `weights`.

    Returns:
        dict: query id -> {document id: score}.
    """
    mlm_scores = {}
    for q_id in preprocessed_q:
        query = preprocessed_q[q_id]
        print("\n--------------------------------------------------------------------")
        print(q_id, query)

        # First-pass retrieval; only the document ids are needed.
        response = es.search(index=index_term, q=query, df=fields[0], _source=False, size = 150)
        candidates = response.get('hits', {}).get("hits")

        # Second pass: re-score every candidate.
        mlm_scores[q_id] = {
            hit.get("_id"): get_mlm_score(query, index_term, hit.get("_id"), fields, weights)
            for hit in candidates
        }

        clear_output()

    return mlm_scores


In [None]:
# Score every query over the label and content fields (weights 0.2 / 0.8),
# then clear the progress output left by the scoring loop.
mlm_scores=rerank(preprocessed_q,fields=["label","content"],weights=[0.2,0.8])
clear_output()

In [None]:
def sort_scores(preprocessed_q, scores_):
    """Rank the scored documents of each query and build the submission table.

    For every query the documents in `scores_[q_id]` are sorted by descending
    score and truncated to the top 100. Each retained document is looked up
    in `index_term` by `_id`, and its entity id is built as "<dbpedia:"
    followed by the last "/"-separated segment of the first entry of the
    document's `links` field.

    Args:
        preprocessed_q: dict query id -> query text (text is only printed).
        scores_: dict query id -> {document id: score}.

    Returns:
        (sorted_scores, frame): `sorted_scores` maps query id to the ranked
        list of (document id, score) tuples; `frame` is a DataFrame with the
        columns QueryId and EntityId, one row per ranked document.

    Raises:
        KeyError: if a query id is missing from `scores_`.
        IndexError: if a document cannot be found in `index_term` or has an
            empty `links` list.
    """
    sorted_scores = {}
    # Columns are collected with append: rebuilding each list by
    # concatenation for every row is quadratic in the number of rows.
    query_ids = []
    entity_ids = []
    for q_id, query in preprocessed_q.items():
        ranked = sorted(scores_[q_id].items(), key=lambda item: item[1], reverse = True)[:100]
        sorted_scores[q_id] = ranked

        for doc_id, _score in ranked:
            clear_output()
            res = es.search(
                index = index_term,
                body = {
                    'from': 0,
                    'size': 1, # Maximum number of results to return
                    "query": {
                        "match": {
                            "_id": doc_id
                        }
                    }
                }
            )
            id_ = "<dbpedia:"+res['hits']['hits'][0]['_source']['links'][0].split("/")[-1]

            print(q_id, query, doc_id, id_)
            # Append both columns only after the lookup succeeded so the
            # two lists can never get out of step.
            query_ids.append(q_id)
            entity_ids.append(id_)

    # Naming the columns explicitly keeps the header even for an empty result.
    return (sorted_scores, pd.DataFrame({'QueryId': query_ids, 'EntityId': entity_ids}))

In [None]:
# Top-100 MLM ranking per query plus the (QueryId, EntityId) table for export.
sorted_mlm_scores, score_df = sort_scores(preprocessed_q, mlm_scores)

In [None]:
# Write the MLM ranking table to CSV (comma-separated, no index column).
score_df.to_csv("ranking_mlm_q2.csv", index = False, sep = ",")

In [None]:
#calculating the score for unigram SDM

# Unigram component of SDM: the same scoring routine restricted to the content field.
SDM_unigram_scores=rerank(preprocessed_q,fields=["content"],weights=[1])

## Calculating the score for ordered bigrams

In [None]:
from nltk import ngrams

def get_bigrams(preprocessed_q):
    """Map every query id to the list of consecutive term pairs of its query.

    Queries are split on whitespace; a query with fewer than two terms maps
    to an empty list.
    """
    return {
        q_id: list(ngrams(query.split(), 2))
        for q_id, query in preprocessed_q.items()
    }

# Consecutive term pairs of every preprocessed query, keyed by query id.
bigram_queries=get_bigrams(preprocessed_q)

In [None]:
def count_ordered_bigram_matches(text, bigram):
    """Count positions where the two bigram terms occur adjacently, in order.

    `text` is the document as a list of terms and `bigram` a two-element
    sequence of terms.
    """
    adjacent_pairs = zip(text, text[1:])
    return sum(
        1
        for first, second in adjacent_pairs
        if first == bigram[0] and second == bigram[1]
    )

In [None]:
def count_unordered_bigram_matches(text, bigram, w):
    """Count unordered co-occurrences of the bigram terms within a window.

    A position counts when it holds one of the two bigram terms and the
    other term appears among the next `w - 1` positions. `text` is the
    document as a list of terms, `bigram` a two-element sequence of terms.
    """
    matches = 0
    # The last position has no following terms, so it is never a match start.
    for pos, current in enumerate(text[:-1]):
        if current not in bigram:
            continue
        partner = bigram[1] if current != bigram[1] else bigram[0]
        if partner in text[pos + 1:pos + w]:
            matches += 1
    return matches


In [None]:

def get_term_sequence(es, doc_id, field):
    """Rebuild the ordered term sequence of one document field from its term vector.

    Args:
        es: Elasticsearch client (anything exposing a compatible `termvectors`).
        doc_id: Id of the document in `index_term`.
        field: Field whose term vector is read.

    Returns:
        A list indexed by token position holding the term at that position.
        Positions for which the term vector lists no token are None. An
        empty list is returned when the document has no term vector, or no
        positioned tokens, for `field`.
    """
    tv = es.termvectors(index=index_term, id=doc_id, fields=[field])
    terms = tv.get('term_vectors', {}).get(field, {}).get('terms', {})

    # First collect terms in a position-indexed dict.
    pos = {}
    for term, tinfo in terms.items():
        for token in tinfo.get('tokens', []):
            pos[token['position']] = term

    if not pos:
        # Missing or empty field: max() below would raise on an empty dict.
        return []

    # Then lay the dict out as a list, leaving gaps as None.
    seq = [None] * (max(pos) + 1)
    for p, term in pos.items():
        seq[p] = term
    return seq


def get_bigram_counts(bigram_queries,method,field):
    """Count bigram matches in the documents retrieved for each query bigram.

    For every bigram of every query, a `match` search on `field` is run
    against `index_term`; each returned document's term sequence is rebuilt
    and the bigram occurrences are counted. `method == "ordered"` counts
    adjacent in-order matches; any other value counts unordered matches
    within a window of 4.

    Returns:
        dict: query id -> {"term1 term2": {document id: count}}.
    """
    bigram_count = {}
    for q_id, query in bigram_queries.items():
        print("\n--------------------------------------------------------------------")
        print(q_id, query)

        per_bigram = {}
        for pair in query:
            phrase = " ".join(pair)
            terms = phrase.split()
            res = es.search(index=index_term, body={'query': {'match': {field: phrase}}})

            per_doc = {}
            for hit in res['hits']['hits']:
                doc_id = hit['_id']
                sequence = get_term_sequence(es, doc_id, field)
                if method == "ordered":
                    per_doc[doc_id] = count_ordered_bigram_matches(sequence, terms)
                else:
                    per_doc[doc_id] = count_unordered_bigram_matches(sequence, terms, 4)
            per_bigram[phrase] = per_doc

        bigram_count[q_id] = per_bigram
        clear_output()
    return bigram_count



In [None]:
# Adjacent, in-order bigram counts in the content field for every query bigram.
order_bigram_count=get_bigram_counts(bigram_queries, method="ordered", field="content")
#order_bigram_count

In [None]:
# Unordered (windowed) bigram counts in the content field for every query bigram.
unorder_bigram_count=get_bigram_counts(bigram_queries,method="unordered", field="content")

In [None]:
def get_bigram_scores(bigram_dictionary,q_id,bigram,doc_id,mu,field):
    """Dirichlet-smoothed log-probability of a bigram for one document.

    Computes

        log((count(bigram, doc) + mu * P_bigram) / (mu + |doc|))

    where P_bigram is the total count of the bigram over all documents
    retrieved for it divided by the summed lengths reported by
    `length_doc_bigram`, and |doc| is that document's reported length.

    Args:
        bigram_dictionary: query id -> {bigram: {document id: count}}.
        q_id: Query id.
        bigram: Bigram key ("term1 term2").
        doc_id: Document to score.
        mu: Dirichlet smoothing parameter.
        field: Document field used to measure document lengths.

    Returns:
        The log-probability, or 0 when the score cannot be computed (the
        document was not retrieved for this bigram, the bigram never
        occurred so the lengths sum to 0, or the document lookup came back
        empty).
    """
    try:
        sum_doclen, doc_lens = length_doc_bigram(bigram_dictionary,q_id,bigram,field=field)
        counts = bigram_dictionary[q_id][bigram]
        p_bigram = sum(counts.values()) / sum_doclen
        return math.log((counts[doc_id] + mu * p_bigram) / (mu + doc_lens[doc_id]))
    except (KeyError, IndexError, ZeroDivisionError, ValueError):
        # Only the expected "no evidence for this (bigram, document)" cases
        # map to 0. A bare `except:` would also hide backend failures and
        # swallow KeyboardInterrupt.
        return 0
    


In [None]:

def length_doc_bigram (bigram_dictionary,q_id,bigram,field):
    """Measure the lengths of the documents retrieved for one query bigram.

    Only documents in which the bigram occurs at least once are fetched from
    `index_term`; their length is the number of whitespace-separated tokens
    of the stored `field` text. All other documents get length 0.

    Returns:
        (sum_doclen, doc_lens): the sum of all lengths and a dict
        document id -> length.
    """
    counts = bigram_dictionary[q_id][bigram]
    doc_lens = {}
    sum_doclen = 0
    for doc_id in counts.keys():
        print(doc_id)

        length = 0
        if counts[doc_id] > 0:
            lookup = {
                'from': 0,
                'size': 1, # Maximum number of results to return
                "query": {"match": {"_id": doc_id}},
            }
            res = es.search(index = index_term, body = lookup)
            length = len(res['hits']['hits'][0]['_source'][field].split())

        doc_lens[doc_id] = length
        sum_doclen += length

        print(length)
    return (sum_doclen, doc_lens)

In [None]:
#Calculating f_o 

def rerank_(bigram_dictionary, field = 'content'):
    """Sum the bigram scores of every candidate document, per query.

    The candidates of a query are all documents retrieved for any of its
    bigrams. Each candidate's score is the sum of `get_bigram_scores`
    (mu=2000) over all bigrams of the query.

    Returns:
        dict: query id -> {document id: summed score}.
    """
    score_q_doc = {}
    for q_id, bigram_dict in bigram_dictionary.items():
        # Union of the documents seen for any bigram of this query.
        candidate_docs = set()
        for doc_dict in bigram_dict.values():
            candidate_docs.update(doc_dict.keys())

        per_doc = {}
        for doc_id in candidate_docs:
            total = 0
            for bigram in bigram_dict:
                clear_output()
                total += get_bigram_scores(bigram_dictionary, q_id, bigram, doc_id, mu=2000, field=field)
            per_doc[doc_id] = total
        score_q_doc[q_id] = per_doc
    return score_q_doc


In [None]:
# Per-query, per-document sum of the ordered-bigram scores.
order_bigram_score=rerank_(order_bigram_count, field = 'content')

## Calculating the score for unordered bigrams

In [None]:
# Per-query, per-document sum of the unordered-bigram scores.
un_order_bigram_score=rerank_(unorder_bigram_count, field = 'content')

In [None]:
#Combining the per-document scores of a query coming from different methods
def Merge(dict1, dict2): 
    """Return a new dict holding the key-wise sum of two score dicts.

    Keys present in both inputs map to `dict1[key] + dict2[key]`; keys
    present in only one input keep their value. Neither input is modified:
    the callers pass score tables that are reused by later cells, and
    summing into `dict1` in place would silently change them.

    Args:
        dict1: First mapping (e.g. document id -> score).
        dict2: Second mapping.

    Returns:
        A new dict with the combined values.
    """
    merged = dict(dict1)
    for key, value in dict2.items():
        merged[key] = merged[key] + value if key in merged else value
    return merged


In [None]:
#SDM_unigram_scores=>unigrams
#order_bigram_score=>ordered bigrams
#un_order_bigram_score=>unordered bigrams

# Combine the three SDM components per query. The unigram part is weighted
# 0.8 and averaged over the query terms; each bigram part is weighted 0.05
# and averaged over the query bigrams.
SDM_score={}
for q_id,query in preprocessed_q.items():
    # Normalise by the number of TERMS; len(query) would be the number of
    # characters of the query string.
    n_terms = len(query.split())
    n_bigrams = n_terms - 1

    # Build scaled copies instead of rescaling the source dicts in place, so
    # re-running this cell does not shrink the scores a second time.
    if n_terms > 0:
        unigram_part = {doc: s * 0.8 / n_terms for doc, s in SDM_unigram_scores[q_id].items()}
    else:
        unigram_part = {}

    if n_bigrams > 0:
        ordered_part = {doc: s * 0.05 / n_bigrams for doc, s in order_bigram_score[q_id].items()}
        unordered_part = {doc: s * 0.05 / n_bigrams for doc, s in un_order_bigram_score[q_id].items()}
    else:
        # Single-term query: there are no bigrams to contribute.
        ordered_part = {}
        unordered_part = {}

    SDM_score[q_id] = Merge(Merge(unigram_part, ordered_part), unordered_part)




In [None]:
# Top-100 SDM ranking per query plus the (QueryId, EntityId) table for export.
sorted_sdm_scores, score_df = sort_scores(preprocessed_q, SDM_score)

In [None]:
# Write the SDM ranking table to CSV (comma-separated, no index column).
score_df.to_csv("ranking_sdm_q2.csv", index = False, sep = ",")

## Computing ELR

In [None]:
#Loading query entities
import json

def load_entity_q(filepath):
    """Load the query entity annotations from a JSON file.

    Args:
        filepath: Path of the JSON document.

    Returns:
        The decoded JSON content.
    """
    # The `with` block closes the file; UTF-8 is given explicitly so that
    # decoding does not depend on the platform's default locale.
    with open(filepath, encoding="utf-8") as json_file:
        return json.load(json_file)
# Entity annotations of the queries; later cells read data[q_id][entity]["score"].
data=load_entity_q('data/entity_annotations.json')
#print(data)

In [None]:


def score_ELR(entity,doc_id,weights=[0.2,0.8],lmbda=0.1):
    entity = entity.lower()
    fields=["label","content"]
    
    sum_score_field = 0
    for i, weight in enumerate(weights):
        tfe=0
        dfe = 0
        field=fields[i]
        tv = es.mtermvectors(
            index=index_entity, 
            doc_type='_doc',
            body=dict(ids=[doc_id],parameters=dict(term_statistics=True,field_statistics=True,fields=[field])))
        
        
        
        pprint(tv['docs'][0]['term_vectors'].keys())
        
        field_len = 0
        
        if field not in tv['docs'][0]['term_vectors'].keys():
            score_field = 0
        else:
            for terms in list(tv['docs'][0]['term_vectors'][field]['terms'].keys()):
                field_len +=tv['docs'][0]['term_vectors'][field]['terms'][terms]['term_freq']

            if entity in list(tv['docs'][0]['term_vectors'][field]['terms'].keys()):
                tfe=1
                dfe += 1

            score_field = weight *((1-lmbda)*tfe +lmbda*(dfe/field_len))
        sum_score_field+=score_field
#         print(tfe,dfe, field_len)
    return math.log(sum_score_field+1)

    
# 

In [None]:

def preprocess(text):
    """Normalise an entity identifier: strip angle brackets, drop stopwords, stem.

    The identifier is split on "_"; parts found in `stop_list` are removed
    and the remaining parts are stemmed with `porter` and re-joined with "_".

    NOTE(review): this definition replaces the earlier `preprocess(queries)`
    in the kernel namespace, so re-running the query-preprocessing cell
    after this one calls this function with a dict and fails.
    """
    bare = text.replace("<","").replace(">","")
    stems = [porter.stem(part) for part in bare.split("_") if part not in stop_list]
    return "_".join(stems)

In [None]:
def rerank_ELR(data):
    """Compute the entity-based (ELR) document scores of every query.

    For each annotated entity of a query, the preprocessed entity id is
    searched in the label and content fields of `index_entity` (top 10
    hits). Every hit is scored with `score_ELR` and multiplied by the
    entity's annotation score; the scores of all entities of the query are
    summed per document.

    Args:
        data: dict query id -> {entity id: {"score": annotation score, ...}}.

    Returns:
        dict: query id -> {document id: summed ELR score}.
    """
    elr_score = {}
    for q_id, entities in data.items():
        clear_output()

        # One accumulator per query: sharing a single dict across queries
        # would leak the documents of earlier queries into later ones.
        combined = {}
        for entity, annotation in entities.items():
            entity_ = preprocess(entity)
            res = es.search(index=index_entity, body={
                'from': 0,
                'size': 10,
                "query": {
                    "multi_match": {
                        "fields": ["label", "content"],
                        "query": entity_
                    }
                }
            })
            clear_output()
            print("\n", entity_)

            docs = {}
            for hit in res['hits']['hits']:
                doc_id = hit['_id']
                docs[doc_id] = annotation['score'] * score_ELR(entity_, doc_id)

            if docs:
                combined = Merge(combined, docs)
        elr_score[q_id] = dict(combined)

    clear_output()
    return elr_score

In [None]:
# Per-query, per-document entity-based scores computed from the annotations.
elr_score=rerank_ELR(data)

In [None]:
#Final score SDM+ELR

# Add the entity-based scores, weighted 0.1 and normalised by the summed
# annotation scores of the query's entities, to the SDM scores.
SDM_ELR_score={}
for q_id,query in preprocessed_q.items():
    # Queries without entity annotations simply keep their SDM score.
    annotations = data.get(q_id, {})
    weight_entity = sum(annotation["score"] for annotation in annotations.values())

    if weight_entity:
        # Scale a copy: rescaling `elr_score` in place would corrupt it for
        # any later cell that reuses it and would compound on every re-run.
        elr_part = {doc: s * 0.1 / weight_entity for doc, s in elr_score.get(q_id, {}).items()}
    else:
        elr_part = {}

    SDM_ELR_score[q_id] = Merge(elr_part, SDM_score[q_id])

In [None]:
# Top-100 SDM+ELR ranking per query plus the (QueryId, EntityId) table for export.
sorted_sdm_elr_scores, score_df = sort_scores(preprocessed_q, SDM_ELR_score)

In [None]:
# Write the SDM+ELR ranking table to CSV (comma-separated, no index column).
score_df.to_csv("ranking_sdm_elr_q2.csv", index = False, sep = ",")

## MLM + ELR


In [None]:
# Add the entity-based scores, weighted 0.1 and normalised by the summed
# annotation scores of the query's entities, to the MLM scores.
MLM_ELR_score={}
for q_id,query in preprocessed_q.items():
    # Queries without entity annotations simply keep their MLM score.
    annotations = data.get(q_id, {})
    weight_entity = sum(annotation["score"] for annotation in annotations.values())

    if weight_entity:
        # Scale a copy: rescaling `elr_score` in place would compound the
        # 0.1 / weight factor every time the entity scores are combined
        # with another model or this cell is re-run.
        elr_part = {doc: s * 0.1 / weight_entity for doc, s in elr_score.get(q_id, {}).items()}
    else:
        elr_part = {}

    MLM_ELR_score[q_id] = Merge(elr_part, mlm_scores[q_id])

In [None]:
# Top-100 MLM+ELR ranking per query plus the (QueryId, EntityId) table for export.
sorted_MLM_ELR_scores, score_df = sort_scores(preprocessed_q, MLM_ELR_score)

In [None]:
# Write the MLM+ELR ranking table to CSV (comma-separated, no index column).
score_df.to_csv("ranking_mlm_elr_q2.csv", index = False, sep = ",")