In [1]:
def take_query_input():
    '''
    prompts the user for a search query on standard input
    takes no arguments
    returns the typed text with every character converted to lower case
    '''
    raw_query = input("Enter you query : ")
    lowered_query = raw_query.lower()
    return lowered_query

In [2]:
import json
import pandas as pd

def openindex_choice(choice):
    '''
    function takes 1 argument - choice
    choice == 1 loads 'indt.json', any other value loads 'indwot.json'
    (presumably the index built with / without title terms - TODO confirm)

    every top-level value in the file is itself a JSON-encoded string, so it
    is decoded a second time to obtain the per-term mapping

    returns a dictionary of the form {term : {doc_id : term_frequency}}
    '''
    path = 'indt.json' if choice == 1 else 'indwot.json'
    # the context manager closes the handle even if decoding raises
    with open(path) as f:
        raw_index = json.load(f)
    return {word: json.loads(encoded) for word, encoded in raw_index.items()}

In [3]:
def openpostinglist_choice(choice):
    '''
    function takes 1 argument - choice
    choice == 1 loads 'polt.json', any other value loads 'polwot.json'
    (presumably the posting list built with / without title terms - TODO confirm)

    every top-level value in the file is itself a JSON-encoded string, so it
    is decoded a second time to obtain the per-document mapping

    returns a tuple (posting_list, number_of_documents) where posting_list is
    of the form {doc_id : {term : term_frequency}}
    '''
    path = 'polt.json' if choice == 1 else 'polwot.json'
    # the context manager closes the handle even if decoding raises
    with open(path) as q:
        raw_postings = json.load(q)
    postinglist = {docid: json.loads(encoded) for docid, encoded in raw_postings.items()}
    return postinglist, len(postinglist)

In [4]:
def opentitle():
    '''
    function takes no arguments
    loads 'title.json', whose values are JSON-encoded strings; each one is
    decoded and its first key is taken as the title, converted to title case
    returns a dictionary of the form {doc_id : title}
    '''
    # the context manager closes the handle even if decoding raises
    with open('title.json') as q:
        raw_titles = json.load(q)
    return {
        docid: list(json.loads(encoded).keys())[0].title()
        for docid, encoded in raw_titles.items()
    }

In [5]:
from spellchecker import SpellChecker

def correctquery(query):
    '''
    takes 1 string argument - query
    spell-checks every space-separated token with SpellChecker (edit distance 1)
    returns the tokens joined back with single spaces

    a token for which the checker offers no suggestion (correction() returns
    None) is kept unchanged instead of breaking the join
    '''
    spell = SpellChecker()
    spell.distance = 1
    corrected = []
    for word in query.split(" "):
        suggestion = spell.correction(word)
        # fall back to the user's own spelling when nothing better is known
        corrected.append(word if suggestion is None else suggestion)
    return " ".join(corrected)

In [6]:
def build_champion_list(r,index):
    '''
    takes 2 inputs - r and index ({term : {doc_id : term_frequency}})
    for every term keeps at most r doc_ids, chosen by highest term frequency
    (ties keep the order in which the documents appear in the index)
    returns {term : [doc_id, ...]}
    '''
    champion_list = {}
    for term, postings in index.items():
        ranked = sorted(postings.items(), key=lambda pair: pair[1], reverse=True)
        picked = []
        for docid, _tf in ranked:
            # stop once r documents were collected
            if len(picked) == r:
                break
            picked.append(docid)
        champion_list[term] = picked
    return champion_list

In [7]:
import math

def form_query_vector(query,idf_values):
    '''
    takes 2 inputs - query(string) and idf_values ({word : idf})
    builds the query vector, a dictionary of the form {word : normalised weight}

    weight of a word = (1 + log10(count in query)) * idf, or just
    (1 + log10(count)) when the word has no idf entry; the weights are then
    divided by their euclidean norm

    the query is split on any run of whitespace, so repeated spaces do not
    create an empty-string term
    if every weight is 0 the norm is 0 too; in that case all-zero weights are
    returned instead of dividing by zero
    '''
    term_counts = {}
    for term in query.split():
        term_counts[term] = term_counts.get(term, 0) + 1

    weights = {}
    for term, count in term_counts.items():
        tf_weight = 1 + math.log(count, 10)
        if term in idf_values:
            weights[term] = tf_weight * idf_values[term]
        else:
            weights[term] = tf_weight

    # the norm does not change while normalising, so compute it once
    norm = math.pow(sum([w * w for w in weights.values()]), 0.5)
    if norm == 0:
        return {term: 0.0 for term in weights}
    return {term: w / norm for term, w in weights.items()}

In [8]:
def build_idf_vector(index,no_of_docs):
    '''
    takes 2 inputs - index ({word : {doc_id : term_frequency}}) and no_of_docs
    idf of a word = log10(no_of_docs / number of documents listed for the word)
    returns {word : idf}
    '''
    return {
        word: math.log((no_of_docs / len(postings)), 10)
        for word, postings in index.items()
    }

In [9]:
def build_doc_vector_for_a_word(word,index,doc_dict):
    '''
    takes 3 inputs - word,index,doc_dict
    doc_dict holds the vectors built so far, {doc_id : {word : weight}}
    for every document listed under index[word] the weight
    1 + log10(term_frequency) is stored, creating the document's vector when
    it does not exist yet; a weight that is already present is left untouched
    modifies doc_dict in place and returns nothing
    '''
    for docid, tf in index[word].items():
        vector = doc_dict.setdefault(docid, {})
        if word not in vector:
            vector[word] = 1 + math.log(tf, 10)

In [10]:
def build_entire_doc_dict(query,index,posting_list):
    '''
    takes 3 inputs - query, index and posting_list
    builds and returns doc_dict, {doc_id : {word : normalised weight}}, for
    every document that contains at least one query word found in the index

    steps:
      1. query words are added to the vectors via build_doc_vector_for_a_word
      2. each of those weights is multiplied by the number of query words the
         document matched (query overlap)
      3. the remaining words of the document are added from posting_list with
         weight 1 + log10(term_frequency)
      4. every vector is divided by its euclidean norm
    '''
    doc_dict = {}

    # 1. seed the vectors with the query words known to the index
    for term in query.strip().split(" "):
        if term in index:
            build_doc_vector_for_a_word(term, index, doc_dict)

    # 2. at this point a vector only holds query words, so its size is the overlap
    for vector in doc_dict.values():
        matched = len(vector)
        for term in vector:
            vector[term] = vector[term] * matched

    # 3. complete each vector with the document's other words
    for docid, vector in doc_dict.items():
        for term, tf in posting_list[docid].items():
            if term not in vector:
                vector[term] = 1 + math.log(tf, 10)

    # 4. length-normalise
    for vector in doc_dict.values():
        length = math.pow(sum([weight * weight for weight in vector.values()]), 0.5)
        for term in vector:
            vector[term] = vector[term] / length

    return doc_dict

In [11]:
def build_doc_vector_using_championlist(word,index,doc_dict,champion_list,posting_list):
    '''
    takes 5 inputs - word,index,doc_dict,champion_list,posting_list
    doc_dict holds the vectors built so far, {doc_id : {word : weight}}

    visits the documents in champion_list[word] followed by the documents
    that were already in doc_dict when the call started:
      - a document not yet in doc_dict gets a new vector holding the word
      - a document already in doc_dict gets the word only if it is missing
        from the vector and listed in posting_list[doc_id]
    the weight is 1 + log10(index[word][doc_id])
    modifies doc_dict in place and returns nothing
    '''
    # snapshot of the candidates, so new entries do not disturb the iteration
    candidates = champion_list[word] + list(doc_dict.keys())
    for docid in candidates:
        if docid not in doc_dict:
            vector = doc_dict[docid] = {}
            vector[word] = 1 + math.log(index[word][docid], 10)
            continue
        vector = doc_dict[docid]
        if word not in vector and word in posting_list[docid]:
            vector[word] = 1 + math.log(index[word][docid], 10)
In [12]:
def complete_doc_vector_using_championlist(word,index,doc_dict,champion_list,posting_list):
    '''
    takes 5 inputs - word,index,doc_dict,champion_list,posting_list
    for every document already in doc_dict, adds the word with weight
    1 + log10(index[word][doc_id]) when the word is missing from the
    document's vector but listed in posting_list[doc_id]
    modifies doc_dict in place and returns nothing

    champion_list is not used; the parameter is kept so the signature stays
    parallel to build_doc_vector_using_championlist
    '''
    # only existing documents are visited, so no new vector is ever created
    for docid, vector in doc_dict.items():
        if word not in vector and word in posting_list[docid]:
            vector[word] = 1 + math.log(index[word][docid], 10)

In [13]:
def build_entire_doc_dict_using_championlist(query,index,postinglist,champion_list):
    '''
    takes 4 inputs - query,index,postinglist,champion_list
    builds and returns doc_dict, {doc_id : {word : normalised weight}}

    steps:
      1. for each query word found in the index, the candidate documents are
         collected with build_doc_vector_using_championlist
      2. a second pass with complete_doc_vector_using_championlist adds query
         words to candidates that were collected for a different word
      3. the query-word weights are multiplied by the number of query words
         the document matched (query overlap)
      4. the remaining words of the document are added from postinglist with
         weight 1 + log10(term_frequency)
      5. every vector is divided by its euclidean norm

    NOTE: this is the same pipeline as
    build_entire_doc_dict_using_championlist_choice called with choice=1
    '''
    known_terms = [term for term in query.strip().split(" ") if term in index]

    doc_dict = {}
    for term in known_terms:
        build_doc_vector_using_championlist(term, index, doc_dict, champion_list, postinglist)
    for term in known_terms:
        complete_doc_vector_using_championlist(term, index, doc_dict, champion_list, postinglist)

    # vectors hold only query words here, so their size is the overlap
    for vector in doc_dict.values():
        matched = len(vector)
        for term in vector:
            vector[term] = vector[term] * matched

    # bring in the document's other words
    for docid, vector in doc_dict.items():
        for term, tf in postinglist[docid].items():
            if term not in vector:
                vector[term] = 1 + math.log(tf, 10)

    # length-normalise
    for vector in doc_dict.values():
        length = math.pow(sum([weight * weight for weight in vector.values()]), 0.5)
        for term in vector:
            vector[term] = vector[term] / length

    return doc_dict

In [14]:
def build_entire_doc_dict_using_championlist_choice(query,index,postinglist,champion_list,choice):
    '''
    takes 5 inputs - query,index,postinglist,champion_list,choice
    builds and returns doc_dict, {doc_id : {word : normalised weight}}

    the candidate documents are always collected through the champion list
    helpers; choice only switches the query overlap weighting:
      choice == 1      query-word weights are multiplied by the number of
                       query words the document matched
      any other value  the weights are left as they are
    afterwards the remaining words of each document are added from
    postinglist with weight 1 + log10(term_frequency) and every vector is
    divided by its euclidean norm
    '''
    known_terms = [term for term in query.strip().split(" ") if term in index]

    doc_dict = {}
    for term in known_terms:
        build_doc_vector_using_championlist(term, index, doc_dict, champion_list, postinglist)
    for term in known_terms:
        complete_doc_vector_using_championlist(term, index, doc_dict, champion_list, postinglist)

    if choice == 1:
        # vectors hold only query words here, so their size is the overlap
        for vector in doc_dict.values():
            matched = len(vector)
            for term in vector:
                vector[term] = vector[term] * matched

    # bring in the document's other words
    for docid, vector in doc_dict.items():
        for term, tf in postinglist[docid].items():
            if term not in vector:
                vector[term] = 1 + math.log(tf, 10)

    # length-normalise
    for vector in doc_dict.values():
        length = math.pow(sum([weight * weight for weight in vector.values()]), 0.5)
        for term in vector:
            vector[term] = vector[term] / length

    return doc_dict

In [15]:
def calculate_score_for_a_document(docid,doc_dict_for_docid,query_vector,doc_ranking):
    '''
    takes 4 inputs - docid,doc_dict_for_docid,query_vector,doc_ranking
    the score is the dot product of the query vector and the document vector,
    taken over the words they have in common (0 when they share none)
    the score is stored in doc_ranking, which is of the form {docid : score}
    returns nothing
    '''
    total = 0
    for term, query_weight in query_vector.items():
        if term not in doc_dict_for_docid:
            continue
        total += query_weight * doc_dict_for_docid[term]
    doc_ranking[docid] = total

In [16]:
import time

def _rank_top_k(doc_dict, query_vector, title, k):
    '''
    scores every document vector in doc_dict against query_vector and returns
    a DataFrame (DOCID, Title, Score) with the k best documents, best first
    '''
    doc_ranking = {}
    for docid, doc_vector in doc_dict.items():
        calculate_score_for_a_document(docid, doc_vector, query_vector, doc_ranking)
    ranked = sorted(doc_ranking.items(), key=lambda item: item[1], reverse=True)
    # slicing already copes with k larger than the number of scored documents
    rows = [[docid, title[docid], score] for docid, score in ranked[:k]]
    return pd.DataFrame(rows, columns=['DOCID', 'Title', 'Score'])


def calculate_all(k):
    '''
    This is the python shell implementation of our function(kind of like the main function)
    It takes the k value, which is how many documents you want to see
    It asks the user for a count, which selects the experiment to run, then for the query
    count=1 runs the IR model with all the modifications and prints one table
    count=2 compares champion list retrieval against retrieval over the full index
    count=3 compares the indexes loaded with choice 1 and choice 0 (title term modification)
    any other count compares query overlap weighting switched on and off
    For count != 1 two tables are printed; each table is followed by the time
    in seconds taken to build the document vectors it is based on

    The second table of count=3 uses a champion list built from the second
    index itself: build_doc_vector_using_championlist looks every champion
    doc_id up in the index it is given, so a list built from a different
    index can name documents that index does not hold for the word
    '''
    count = int(input("1:Examine normal IR(with all modifications)\n2:Examine championlist modification\n3:Examine title term modification\n4:Examine query overlap modification\n"))
    query = correctquery(take_query_input())

    index = openindex_choice(1)
    title = opentitle()
    posting_list, no_of_docs = openpostinglist_choice(1)
    champion_list = build_champion_list(10, index)
    if count == 3:
        index0 = openindex_choice(0)
        posting_list0, _ = openpostinglist_choice(0)
        champion_list0 = build_champion_list(10, index0)

    idf_values = build_idf_vector(index, no_of_docs)
    query_vector = form_query_vector(query, idf_values)

    doc_dict2 = None
    end = None
    begin = time.time()
    if count == 1:
        doc_dict1 = build_entire_doc_dict_using_championlist(query, index, posting_list, champion_list)
        end1 = time.time()
    elif count == 2:
        doc_dict1 = build_entire_doc_dict_using_championlist(query, index, posting_list, champion_list)
        end1 = time.time()
        doc_dict2 = build_entire_doc_dict(query, index, posting_list)
        end = time.time()
    elif count == 3:
        doc_dict1 = build_entire_doc_dict_using_championlist(query, index, posting_list, champion_list)
        end1 = time.time()
        doc_dict2 = build_entire_doc_dict_using_championlist(query, index0, posting_list0, champion_list0)
        end = time.time()
    else:
        doc_dict1 = build_entire_doc_dict_using_championlist_choice(query, index, posting_list, champion_list, 1)
        end1 = time.time()
        doc_dict2 = build_entire_doc_dict_using_championlist_choice(query, index, posting_list, champion_list, 0)
        end = time.time()

    print("-----------------")
    print(_rank_top_k(doc_dict1, query_vector, title, k))
    print("-----------------")
    print(end1 - begin)

    if doc_dict2 is not None:
        # k is passed unchanged, so a short first table cannot shorten this one
        print("-----------------")
        print(_rank_top_k(doc_dict2, query_vector, title, k))
        print("-----------------")
        print(end - end1)

In [25]:
# interactive run: asks for the experiment number and the query, then prints up to 10 documents
calculate_all(10)

1:Examine normal IR(with all modifications)
2:Examine championlist modification
3:Examine title term modification
4:Examine query overlap modification
1
Enter you query : Hello Paul
-----------------
      DOCID                                              Title     Score
0  51149587                     Paul Townsend (Disambiguation)  0.229736
1  51138661                               Paul Slane (Cyclist)  0.198746
2  51160635                                Hello Kimberly Tour  0.190369
3  51129397  We Want Our Daddy Dear, Back Home (Hello Centr...  0.171035
4  51172913                                          Paul Augé  0.170823
5  51181066                                       Maria Makino  0.140529
6  51181183                                         Akane Haga  0.123714
7  51133265                    Paul Gallagher (Trade Unionist)  0.106238
8  51155104  Carl Reiner And Mel Brooks At The Cannes Film ...  0.086502
9  51176631                             Paul Weir (Basketball)  0.0863

 # Demonstration

## 1) Basic IR System with all modifications
####      Query : CEO of brandyourself

In [47]:
# demo 1: answer 1 at the first prompt, then type the query given in the heading above
calculate_all(10)

1:Examine normal IR(with all modifications)
2:Examine championlist modification
3:Examine title term modification
4:Examine query overlap modification
1
Enter you query : Ceo of brandyourself
-----------------
      DOCID                                 Title     Score
0  51168474                        Patrick Ambron  0.334853
1  51135825  Australian Foundation For Disability  0.113395
2  51161862                        Diane Karusisi  0.101936
3  51153753                        Kelly Murumets  0.098057
4  51177161       Fort Bragg Federal Credit Union  0.095585
5  51130761                          Punit Renjen  0.073755
6  51129304                            Rr Auction  0.070688
7  51176883                            Don Slager  0.068495
8  51188774          New Entrepreneurs Foundation  0.066059
9  51130430        John J. Mcgrath (Entrepreneur)  0.064669
-----------------
0.010970830917358398


## 2) Examining the champion list modification
####     Query : Potter first album in 1997 

In [48]:
# demo 2: answer 2 at the first prompt, then type the query given in the heading above
calculate_all(10)

1:Examine normal IR(with all modifications)
2:Examine championlist modification
3:Examine title term modification
4:Examine query overlap modification
2
Enter you query : Potter first album in 1997
-----------------
      DOCID                                              Title     Score
0  51190117                                   Catherine Potter  0.538644
1  51178109                                         Lisa Haley  0.275269
2  51155926                          Warner Bros. Studio Tours  0.268574
3  51150018                              Telemedicine Act 1997  0.231708
4  51149963                           Computer Crimes Act 1997  0.227113
5  51143603                     Generation Y (Kunto Aji Album)  0.223151
6  51149749                         Digital Signature Act 1997  0.217320
7  51134795                                  Ugadi (1997 Film)  0.212734
8  51154548  Warner Bros. Studio Tour London - The Making O...  0.203773
9  51170136                                     Kodama

## 3) Examining the Title term modification
####     Query : Diverse architecture

In [49]:
# demo 3: answer 3 at the first prompt, then type the query given in the heading above
calculate_all(10)

1:Examine normal IR(with all modifications)
2:Examine championlist modification
3:Examine title term modification
4:Examine query overlap modification
3
Enter you query : Diverse architecture
-----------------
      DOCID                                              Title     Score
0  51137214  Spectrum-Diverse Unified Neuroevolution Archit...  0.126060
1  51131447                                      St Rose Music  0.118857
2  51158012                              Vasthuvidya Gurukulam  0.084558
3  51145584                                         Soho Radio  0.068928
4  51129395                       Maltese Baroque Architecture  0.067579
5  51183221               Webster County Courthouse (Nebraska)  0.063359
6  51184343                                  Knowledge Inertia  0.062729
7  51134202                                     Dolors Lamarca  0.057465
8  51135958           Waterville Main Street Historic District  0.050998
9  51138023                                    Elsa Law Revi

## 4) Examining the query overlap modification
####     Query: dutch-born politician

In [50]:
# demo 4: answer 4 at the first prompt, then type the query given in the heading above
calculate_all(10)

1:Examine normal IR(with all modifications)
2:Examine championlist modification
3:Examine title term modification
4:Examine query overlap modification
4
Enter you query : dutch-born politician
-----------------
      DOCID                               Title     Score
0  51128943         Paul Zimmerman (Politician)  0.167927
1  51171957       Harold Armstrong (Politician)  0.146833
2  51172884            Harry Innes (Politician)  0.132314
3  51137297         John Bridgeman (Politician)  0.102142
4  51145357         Fred M. Wilcox (Politician)  0.099372
5  51129249             Don Barker (Politician)  0.067302
6  51157716       Mercedes Alvarez (Politician)  0.065830
7  51170479      Richard G. Austin (Politician)  0.061770
8  51131935          Mehmet Yıldız (Politician)  0.057904
9  51135046  John Bailey (Victorian Politician)  0.050349
-----------------
0.0009646415710449219
-----------------
      DOCID                               Title     Score
0  51171957       Harold Armstrong 