In [137]:
import string, sys, pickle, datetime, math
import re
from bisect import bisect_left
from heapq import nlargest 

import nltk
import Stemmer
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import *

In [138]:
# Tokens dropped from every query by cleanQuery before stemming.
STOPWORDS = set(stopwords.words('english'))

# Extra tokens dropped by cleanQuery: URL fragments, file extensions and
# markup/styling words.
URL_STOP_WORDS = {
    "http", "https", "www", "ftp", "com", "net", "org",
    "archives", "pdf", "html", "png", "txt", "redirect",
    "align", "realign", "valign", "nonalign", "malign", "unalign",
    "salign", "qalign", "halign",
    "font", "fontsiz", "fontcolor", "backgroundcolor", "background",
    "style", "center", "text",
}

# Two-character prefixes that mark a field query; the first character is the
# field code matched against individual postings.
fields = ['t:', 'i:', 'c:', 'b:', 'r:', 'l:']

In [139]:
def readTitles():
    """Load and return the pickled object stored in ``./Titles``.

    Returns:
        Whatever object was pickled into the file. The search cell takes
        ``len()`` of it and indexes it with integer page ids.

    Raises:
        OSError: if ``./Titles`` cannot be opened.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # unpickling, so only point this at index files you built yourself.
    # The context manager closes the handle even if unpickling fails.
    with open("./Titles", 'rb') as f:
        return pickle.load(f)

In [140]:
def read_secondaryIndex():
    """Load and return the pickled object stored in ``./SecondaryIndex``.

    Returns:
        Whatever object was pickled into the file. The search cell calls
        ``.keys()`` on it and passes the keys to ``get_indexFile``.

    Raises:
        OSError: if ``./SecondaryIndex`` cannot be opened.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # unpickling, so only point this at index files you built yourself.
    # The context manager closes the handle even if unpickling fails.
    with open("./SecondaryIndex", 'rb') as f:
        return pickle.load(f)

In [141]:
def evaluate_tfidf(wordPostings, totalDocs=None):
    """Score every posting with tf-idf and accumulate a score per page.

    Each value of ``wordPostings`` is a ``'|'``-separated string of postings.
    A posting is split on ``'-'`` to get the page id (first piece) and on
    ``'x'`` to get the occurrence count (second piece). The count is replaced
    in the string by

        log10(1 + count) * log10(totalDocs / (postings_for_word + 1))

    Args:
        wordPostings: dict mapping a term to its posting string. It is
            updated in place and also returned. Empty posting strings (terms
            with no postings) are left untouched and contribute no score.
        totalDocs: number of documents in the collection. Defaults to
            ``len(Titles)`` when omitted.

    Returns:
        Tuple ``(wordPostings, tfidfScores)`` where ``tfidfScores`` maps page
        id (str) to its summed tf-idf, ordered from highest to lowest score.
    """
    if totalDocs is None:
        totalDocs = len(Titles)

    tfidfScores = {}
    for word, postings in wordPostings.items():
        if not postings:
            # ''.split('|') yields [''], which has no 'x' part to parse.
            continue

        posting_lists = postings.split('|')
        numOfDocs = len(posting_lists)
        # The idf factor is the same for every posting of this word.
        idf = math.log10(totalDocs / (numOfDocs + 1))

        for pl, posting in enumerate(posting_lists):
            pageID = posting.split('-')[0]
            parts = posting.split('x')
            numOfOccurences = int(parts[1])
            tfidf = math.log10(1 + numOfOccurences) * idf
            posting_lists[pl] = parts[0] + 'x' + str(tfidf)
            tfidfScores[pageID] = tfidfScores.get(pageID, 0) + tfidf

        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over it.
        wordPostings[word] = '|'.join(posting_lists)

    ranked = sorted(tfidfScores.items(), key=lambda t: t[1], reverse=True)
    return wordPostings, dict(ranked)

In [142]:
def get_postingLists(fileWordMap):
    """Fetch the posting string of every requested term from its index file.

    Args:
        fileWordMap: dict mapping an index-file number to the list of terms
            to look up in ``./inverted_index/FinalIndex<number>``. The number
            ``-1`` (the "no file" value returned by ``get_indexFile``) is
            accepted and never opened.

    Returns:
        dict mapping each term to its posting string, or ``''`` when the term
        is absent from its file or has no file at all.

    Raises:
        OSError: if an index file for a non-negative number cannot be opened.
    """
    wordPostings = {}
    for file_num, words in fileWordMap.items():
        if file_num == -1:
            # No index file covers these terms; do not try to open
            # "FinalIndex-1".
            for w in words:
                wordPostings[w] = ''
            continue

        # NOTE(security): pickle.load can execute arbitrary code; only read
        # index files you built yourself.
        # The context manager closes the handle even if unpickling fails.
        path = "./inverted_index/FinalIndex{}".format(file_num)
        with open(path, "rb") as file_ptr:
            data = pickle.load(file_ptr)

        for w in words:
            wordPostings[w] = data[w] if w in data else ''
    return wordPostings

In [143]:
def get_indexFile(secondaryIndex_keys, word):
    """Return the 1-based index-file number whose key range covers ``word``.

    Args:
        secondaryIndex_keys: indexable sequence of keys; it must be sorted in
            ascending order because it is searched with ``bisect_left``.
        word: term to locate.

    Returns:
        ``i + 1`` when ``word`` equals the key at position ``i``, otherwise
        the number of keys that sort before ``word``. Returns ``-1`` when
        ``word`` sorts before every key or the sequence is empty.
    """
    i = bisect_left(secondaryIndex_keys, word)

    # Bounds check first: bisect_left returns len(keys) for a word that sorts
    # after every key, and indexing there would raise IndexError.
    if i < len(secondaryIndex_keys) and secondaryIndex_keys[i] == word:
        # Exact match, including on the very first key (i == 0).
        return i + 1

    return i if i else -1

In [144]:
def isEnglish(s):
    """Return True when ``s`` consists solely of ASCII characters.

    The string is encoded as UTF-8 and the bytes are decoded as ASCII; a
    decoding failure means at least one character lies outside ASCII.
    """
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
    except UnicodeDecodeError:
        return False
    return True

In [145]:
def cleanQuery(query):
    """Normalise a raw query string into a list of stemmed search terms.

    Steps, in order: lower-case; strip ``<...>`` tags, http(s) URLs,
    ``{| ... |}`` spans and ``[[file:...]]`` links; delete every character
    that is neither a word character nor whitespace; split on whitespace;
    drop unwanted tokens; stem the rest with the English Snowball stemmer.

    A token is dropped when it
      * is in ``STOPWORDS`` or ``URL_STOP_WORDS``,
      * is all digits with at most 2 or at least 5 digits (3- and 4-digit
        numbers are kept),
      * contains both an ASCII letter and a digit, or
      * is not pure ASCII (see ``isEnglish``).

    Args:
        query: raw query text.

    Returns:
        List of stemmed tokens, in their original order.
    """
    stemmer = Stemmer.Stemmer("english")
    query = query.lower()
    query = re.sub(r'<(.*?)>', '', query)  # Remove tags if any
    query = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', query, flags=re.MULTILINE)  # Remove Url
    query = re.sub(r'{\|(.*?)\|}', '', query, flags=re.MULTILINE)  # Remove {| ... |} spans
    query = re.sub(r'\[\[file:(.*?)\]\]', '', query, flags=re.MULTILINE)  # Remove File
    query = re.sub(r'[^\w\s]', '', query)  # Remove Punctuations & Special Characters

    tokens = [
        x for x in query.split()
        if x not in STOPWORDS
        and x not in URL_STOP_WORDS
        and not (x.isdigit() and (len(x) <= 2 or len(x) >= 5))
        and not re.match('^(?=.*[a-zA-Z])(?=.*[0-9])', x)
        and isEnglish(x)
    ]
    return [stemmer.stemWord(word) for word in tokens]

In [146]:
def search_fieldQuery(documents, field_type):
    """Return the postings of ``documents`` that contain ``field_type``.

    ``documents`` is a ``'|'``-separated posting string; a posting is kept
    when ``field_type`` occurs anywhere inside it as a substring.
    """
    return [posting for posting in documents.split("|") if field_type in posting]

In [147]:
def process_fieldQuery(query):
    """Print the field-restricted postings for every term of a field query.

    The query is cut into segments at the first occurrence of each tag listed
    in ``fields``. Each segment is cleaned with ``cleanQuery`` and every
    resulting term is looked up in the global ``Index``; the postings that
    contain the segment's field letter are printed, or ``[]`` for a term that
    is not in ``Index``. Nothing is returned.
    """
    # Offset of the first occurrence of each tag -> the tag found there.
    tagOffsets = {}
    for tag in fields:
        offset = query.find(tag)
        if offset != -1:
            tagOffsets[offset] = tag

    # The trailing sentinel gives the last segment an end offset beyond the
    # end of any realistic query, so its slice runs to the end of the string.
    segments = sorted(tagOffsets.items()) + [(1234567890, "")]
    print(segments)

    for (begin, tag), (end, _) in zip(segments, segments[1:]):
        # Skip the two tag characters themselves.
        terms = cleanQuery(query[begin + 2:end].lower())
        for word in terms:
            if word in Index:
                print(word, " : ", search_fieldQuery(Index[word], tag[:1]))
            else:
                print (word, " : ", [])
            print()
    

In [148]:
# Load the lookup tables and run every query, timing the search.
Titles = readTitles()
secondaryIndex = read_secondaryIndex()
queries = ["553 slip 1990 jutzi sent kai lash stategeorgia"]

start = datetime.datetime.now()

# bisect needs an indexable sequence; build it once instead of once per term.
# The keys are used in dict order and must already be sorted for bisect.
secondaryIndex_keys = list(secondaryIndex.keys())

for query in queries:
    fileWordMap = {}
    kRelevantTitles = []
    if any(f in query for f in fields):
        # Field queries only print their matches; no ranking is produced.
        process_fieldQuery(query)
    else:
        # Group the cleaned terms by the index file that holds them so each
        # file is read once.
        for word in cleanQuery(query):
            file_num = get_indexFile(secondaryIndex_keys, word)
            fileWordMap.setdefault(file_num, []).append(word)

        wordPostings, tfidfScores = evaluate_tfidf(get_postingLists(fileWordMap))
        kRelevant = nlargest(5, tfidfScores, key=tfidfScores.get)
        kRelevantTitles = [Titles[int(pageId)] for pageId in kRelevant]

end = datetime.datetime.now()

# Break the elapsed whole seconds into hours / minutes / seconds.
hr, rm = divmod((end - start).seconds, 60 * 60)
mn, secs = divmod(rm, 60)

# The backslash is escaped explicitly: "\S" is an invalid escape sequence.
# The printed text is unchanged.
print("\\Searching Time : ",hr," hrs ",mn," mns",secs," secs")

\Searching Time :  0  hrs  0  mns 0  secs


In [149]:
# Show each query term with its tf-idf-annotated posting string.
for term, postings in wordPostings.items():
    print(term, postings, sep=" : ")

553 : 85-b1x0.5308447212559941
1990 : 2-i2b8x0.9898589900243991|7-b1x0.28613341703060285|10-i1x0.28613341703060285|16-b3x0.5722668340612057|18-b1x0.28613341703060285|65-b2x0.4535107361967138|70-b4x0.664381219889479|71-b2x0.4535107361967138|72-b3x0.5722668340612057|73-b1x0.28613341703060285|80-b1x0.28613341703060285|85-b1x0.28613341703060285
slip : 2-b1x0.5308447212559941
sent : 1-b18x1.1743177349632192|2-b1x0.27644486193486334|7-b1x0.27644486193486334|10-i2x0.43815473968379576|13-b3x0.5528897238697267|16-b1x0.27644486193486334|67-b1x0.27644486193486334|69-b5x0.7145996016186591|70-b1x0.27644486193486334|73-b3x0.5528897238697267|85-b1x0.27644486193486334|86-b3x0.5528897238697267|94-b5x0.7145996016186591
stategeorgia : 65-b3x1.0616894425119883
jutzi : 18-b1x0.5308447212559941
kai : 1-b1x0.5308447212559941
lash : 69-b1x0.5308447212559941


In [150]:
# Show the accumulated tf-idf score of every page, highest first.
for page_id, score in tfidfScores.items():
    print(page_id, score, sep=" : ")

2 : 1.7971485732152568
1 : 1.7051624562192134
65 : 1.5152001787087022
69 : 1.2454443228746532
85 : 1.0934230002214604
70 : 0.9408260818243424
16 : 0.848711695996069
73 : 0.8390231409003295
18 : 0.816978138286597
10 : 0.7242881567143986
94 : 0.7145996016186591
72 : 0.5722668340612057
7 : 0.5625782789654662
13 : 0.5528897238697267
86 : 0.5528897238697267
71 : 0.4535107361967138
80 : 0.28613341703060285
67 : 0.27644486193486334


In [151]:
# Page ids of the five highest-scoring documents for the last ranked query.
kRelevant

['2', '1', '65', '69', '85']

In [152]:
# Titles looked up in Titles for the page ids in kRelevant.
kRelevantTitles

['andre agassi', 'apollo', 'alabama', 'abraham lincoln', 'algeria']