In [19]:
import string, sys, pickle, datetime, math
import nltk
import Stemmer
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import *
from bisect import bisect_left
from heapq import nlargest 
from collections import OrderedDict

In [20]:
STOPWORDS = set(stopwords.words('english')) 
URL_STOP_WORDS = set(["http", "https", "www", "ftp", "com", "net", "org", "archives", "pdf", "html", "png", "txt", "redirect", "align", "realign", "valign", "nonalign", "malign", "unalign", "salign", "qalign", "halign", "font", "fontsiz", "fontcolor", "backgroundcolor", "background", "style", "center", "text"])
fields= ['t:', 'i:', 'c:', 'b:', 'r:', 'l:']

In [21]:
def readTitles():
    f = open("./temp_index/Titles",'rb')
    db = pickle.load(f)
    return db

In [22]:
def read_secondaryIndex():
    f = open("./temp_index/SecondaryIndex",'rb')
    db = pickle.load(f)
    return db

In [23]:
def cal_FieldFrequency(posting_list, current_field):
    Fields= ['t', 'b', 'i', 'c', 'r', 'l']
    Fields.remove(current_field)
    fieldWeights= [0.20, 0.10, 0.10, 0.07, 0.03]
    
    numOfOccurences= 0
    for f in range (0, len(Fields)):
        index= posting_list.find(Fields[f])
        if index !=-1:
            index +=1
            count =''
            while index< len(posting_list) and posting_list[index].isdigit():
                count += posting_list[index]
                index +=1
            numOfOccurences += int(count) *  fieldWeights[f]
    
    index= posting_list.find(current_field)
    if index !=-1:
        index +=1
        count =''
        while index< len(posting_list) and posting_list[index].isdigit():
            count += posting_list[index]
            index +=1
        numOfOccurences += int(count)*0.5
    return numOfOccurences

In [24]:
def cal_PlainFrequency(posting_list):
    Fields= ['t', 'b', 'i', 'c', 'r', 'l']
    
    fieldWeights= [0.35, 0.25, 0.20, 0.10, 0.07, 0.03]
    
    numOfOccurences= 0
    for f in range (0, len(Fields)):
        index= posting_list.find(Fields[f])
        if index !=-1:
            index +=1
            count =''
            while index< len(posting_list) and posting_list[index].isdigit():
                count += posting_list[index]
                index +=1
            numOfOccurences += int(count) *  fieldWeights[f]
        
    return numOfOccurences

In [25]:
def evaluateField_tfidf(wordPostings):
    tfidfScores ={}
    for word in wordPostings.keys():
        current_field= wordPostings[word][1]
        posting_lists= wordPostings[word][0].split('|')
        totalDocs= len(posting_lists)
        numOfDocs= 0
        for pl in range(0, totalDocs):
            if current_field in posting_lists[pl]:
                numOfDocs +=1
        
        for pl in range(0, totalDocs):
            if posting_lists[pl] != '':
                pageID= posting_lists[pl].split('-')[0]
                numOfOccurences= cal_FieldFrequency(posting_lists[pl].split('-')[1], current_field)
                totalWordsOfDoc= Titles[int(pageID)][1]
                tf = numOfOccurences/totalWordsOfDoc
                idf= totalDocs/(numOfDocs+1)
                tfidf= tf * math.log10(idf)

                if pageID not in tfidfScores:
                    tfidfScores[pageID]= tfidf
                else:
                    tfidfScores[pageID] +=tfidf
        
    return tfidfScores

In [26]:
def getField_postingLists(fileWordMap):
    wordPostings= {}
    for file_num, entry in fileWordMap.items():
        file_ptr= open("./temp_index/FinalIndex{}".format(file_num), "rb")
        data= pickle.load(file_ptr)
        for word, field in entry:
            if word in data:
                wordPostings[word]= (data[word], field)
            else:
                wordPostings[word]= ('','')
        file_ptr.close()
        
    return wordPostings       

In [27]:
def evaluatePlain_tfidf(wordPostings):
    tfidfScores ={}
    for word in wordPostings.keys():
        posting_lists= wordPostings[word].split('|')
        numOfDocs= len(posting_lists)
        totalDocs= len(Titles)
        for pl in range(0, numOfDocs):
            if posting_lists[pl] != '':
                pageID= posting_lists[pl].split('-')[0]
                numOfOccurences= cal_PlainFrequency(posting_lists[pl].split('-')[1])
                totalWordsOfDoc= Titles[int(pageID)][1]
                tf= numOfOccurences/totalWordsOfDoc
                idf= totalDocs/(numOfDocs+1)
                tfidf= tf * math.log10(idf)

                if pageID not in tfidfScores:
                    tfidfScores[pageID]= tfidf
                else:
                    tfidfScores[pageID] +=tfidf
      
    return tfidfScores

In [28]:
def getPlain_postingLists(fileWordMap):
    wordPostings= {}
    for file_num, words in fileWordMap.items():
        file_ptr= open("./temp_index/FinalIndex{}".format(file_num), "rb")
        data= pickle.load(file_ptr)
        for w in words:
            if w in data:
                wordPostings[w]= data[w]
            else:
                wordPostings[w]= ''
        file_ptr.close()
    return wordPostings       

In [29]:
def get_indexFile(secondaryIndex_keys, word):
    i = bisect_left(secondaryIndex_keys, word)
    if i<len(secondaryIndex_keys):
        if (secondaryIndex_keys[i] == word):
            return i+1
        else:
            return i
    else:
          return i               

In [30]:
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [31]:
def cleanQuery(query):
    stemmer= Stemmer.Stemmer("english")
    query= query.lower()
    query = re.sub(r'<(.*?)>','',query) #Remove tags if any
    query = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', query, flags=re.MULTILINE) #Remove Url
    query = re.sub(r'{\|(.*?)\|}', '', query, flags=re.MULTILINE) #Remove CSS
    query = re.sub(r'\[\[file:(.*?)\]\]', '', query, flags=re.MULTILINE) #Remove File
    query = re.sub(r'[^\w\s]' , '', query) #Remove Punctuations & Special Characters
    query = query.split()
    query = [x for x in query if x not in STOPWORDS and x not in URL_STOP_WORDS and (x.isdigit() and (len(x)<=2 or len(x)>=5)) ==False and bool(re.match('^(?=.*[a-zA-Z])(?=.*[0-9])', x)) ==False and isEnglish(x)] 
    query = [stemmer.stemWord(word) for word in query]
    
    return query

In [32]:
def process_fieldQuery(query):
    fieldInfo= {}
    fileWordMap= {}
    
    for f in fields:
        field= query.find(f)
        if field !=-1:
            fieldInfo[field]= f
    
    fieldInfo= sorted(fieldInfo.items())
    fieldInfo.append((12345678901234567890123456789, ""))            #fake dummy entry
    i=0
    while i+1 <len(fieldInfo):
        field= fieldInfo[i][1].strip(":")
        fieldQuery = (query[fieldInfo[i][0]+2 : fieldInfo[i+1][0]]).lower()
        fieldQuery = cleanQuery(fieldQuery)

        for word in fieldQuery:
            file_num= get_indexFile(list(secondaryIndex.keys()), word)
            if file_num not in fileWordMap:
                fileWordMap[file_num] =[(word, field)]
            else:
                fileWordMap[file_num].append((word, field))
        i +=1
    return fileWordMap

In [33]:
Titles= readTitles()
secondaryIndex= read_secondaryIndex()
queries= open("./queries.txt").readlines()

for query in queries:
    start = datetime.datetime.now()
    query= query.strip('\n')
    K= int(query.split(',')[0])
    k=K
    query= query.split(',')[1]
    fileWordMap= {}
    if any(f in query for f in fields): 
        fileWordMap= process_fieldQuery(query)
        tfidfScores= evaluateField_tfidf(getField_postingLists(fileWordMap))
        
    else:
        query = cleanQuery(query)
        for word in query:
            file_num= get_indexFile(list(secondaryIndex.keys()), word)
            if file_num not in fileWordMap:
                fileWordMap[file_num] =[word]
            else:
                fileWordMap[file_num].append(word)
                
        tfidfScores= evaluatePlain_tfidf(getPlain_postingLists(fileWordMap))
        
        
    tfidfScores= OrderedDict(sorted(tfidfScores.items(), key=lambda t: t[1], reverse= True))
    print (len(tfidfScores))
    kRelevant= []
    for key, value in tfidfScores.items():
        if(K ==0):
            break
        kRelevant.append((key, Titles[int(key)][0]))
        K -=1
            
    end = datetime.datetime.now()
    secs  = (end-start).seconds
    print("\nSearching Time : ",secs," secs")
    
    f= open("./queries_op.txt", "a")
    for pageID, title in kRelevant:
        f.write(str(pageID)+","+title+"\n")
    f.write(str(secs)+"\n")
    f.write(str(secs/k)+"\n\n")
    f.close()



24

Searching Time :  0  secs
43

Searching Time :  0  secs
7

Searching Time :  0  secs


In [34]:
# for key, value in wordPostings.items():
#     print (key, ":", value)

In [35]:
# for key, value in tfidfScores.items():
#     print (key, ":", value)

In [36]:
kRelevant

[('53', 'list of bluetooth protocols'),
 ('60', 'autism'),
 ('161', 'john appold'),
 ('58', 'wikipedia:articles for deletion/peter delgrosso'),
 ('198', 'ampeg svt')]