In [1]:
import os
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
sw = stopwords.words("english")
import snowballstemmer
from itertools import islice


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nuzha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# PREPROCESSING TEXT
def ParseText(text):
    """Turn raw text into a list of stemmed index terms.

    Steps, in order: lower-case and split on whitespace, discard words found
    in the stopword list ``sw``, strip every non-letter character from the
    remaining words, drop words that became empty, apply the English Snowball
    stemmer, and finally discard stems of length one.

    Note that stopwords are matched before punctuation is stripped, so a
    stopword with attached punctuation is not filtered out at that step.
    """
    # Newlines are treated like any other whitespace separator.
    words = text.replace("\n", " ").lower().split()

    letters_only = []
    for word in words:
        if word in sw:
            continue
        cleaned = re.sub("[^a-zA-Z]", '', word)
        if cleaned:
            letters_only.append(cleaned)

    stems = snowballstemmer.stemmer('english').stemWords(letters_only)
    return [stem for stem in stems if len(stem) > 1]

In [3]:
# CREATE INVERTED INDEX
def CreateInvertedIndex(docs, path, start_id=None):
    """Build a positional inverted index for one block of documents.

    Every name in ``docs`` is given the next sequential document id and is
    recorded as an ``id,name,size`` line in ``docInfo.txt`` (the file is
    created when numbering starts at 0 and appended to otherwise). Files with
    a ``.txt`` extension, files that cannot be read or decoded, and pages
    without a ``<body>`` still consume an id but contribute no postings.

    Args:
        docs: file names, relative to ``path``, of the documents to index.
        path: directory that contains the documents.
        start_id: id of the last document indexed so far; numbering continues
            at ``start_id + 1``. When omitted, the notebook-level ``doc_id``
            variable is read, which is what a two-argument call relies on.

    Returns:
        ``(index, last_id)``. ``index`` maps each term produced by
        ``ParseText`` to ``{doc_id: [term_frequency, pos_0, pos_1, ...]}``
        and ``last_id`` is the id assigned to the final entry of ``docs``.

    Raises:
        FileExistsError: numbering starts at 0 and ``docInfo.txt`` exists.
        OSError: a document's size cannot be read with ``os.stat``.
    """
    if start_id is None:
        # Backward compatible fallback to the notebook-level counter.
        start_id = doc_id

    index = {}
    id_ = start_id
    mode = "x" if start_id == 0 else "a+"

    # The context manager closes docInfo.txt even if indexing fails midway.
    with open("docInfo.txt", mode) as info_file:
        for doc in docs:
            id_ += 1
            doc_path = os.path.join(path, doc)
            size = os.stat(doc_path).st_size
            info_file.write(",".join([str(id_), str(doc), str(size)]) + "\n")

            _, extension = os.path.splitext(doc)
            if extension == ".txt":
                continue

            try:
                with open(doc_path) as handle:
                    html = handle.read()
            except (OSError, UnicodeError):
                # Unreadable or undecodable document: keep its id, skip it.
                continue

            soup = BeautifulSoup(html, features="html.parser")
            for tag in soup(["script", "style"]):
                tag.extract()

            body = soup.body
            if body is None:
                # Nothing to index when the page has no <body> element.
                continue
            text = body.get_text(" ")

            # Collapse the page text to one non-empty phrase per line.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = '\n'.join(chunk for chunk in chunks if chunk)

            for pos, token in enumerate(ParseText(text)):
                postings = index.setdefault(token, {})
                if id_ in postings:
                    postings[id_][0] += 1
                    postings[id_].append(pos)
                else:
                    postings[id_] = [1, pos]

    return index, id_


In [4]:
# SAVE INDEX TO FILE
def createIndexFile(indexfile, posting, index):
    """Write one block's index to disk as a postings file plus a terms file.

    For every term, one comma-separated line is written to ``posting``:
    the number of documents, then per document the delta-coded doc id, the
    length of that document's value list, and the delta-coded values of the
    list. The terms file ``indexfile`` receives ``term,offset`` lines where
    ``offset`` is the position in the postings file (as reported by
    ``tell()``) at which the term's line starts.

    Args:
        indexfile: path of the terms file to create.
        posting: path of the postings file to create.
        index: mapping ``term -> {doc_id: [values...]}``; terms are written
            in the mapping's iteration order.

    Raises:
        FileExistsError: either output file already exists (mode ``"x"``).
    """
    # Both handles are closed even if writing fails part-way through.
    with open(posting, "x") as postings_out, open(indexfile, "x") as terms_out:
        for term, doc_map in index.items():
            fields = [len(doc_map)]
            prev_doc = 0
            for doc, values in doc_map.items():
                # Subtracting a zero "previous" leaves the first value as is,
                # so no special case is needed for the first element.
                fields.append(doc - prev_doc)
                fields.append(len(values))
                prev_doc = doc
                prev_value = 0
                for value in values:
                    fields.append(value - prev_value)
                    prev_value = value

            offset = postings_out.tell()
            terms_out.write(term + "," + str(offset) + "\n")
            postings_out.write(",".join(str(field) for field in fields) + "\n")

In [30]:
# DRIVER CODE
# Running document counter; CreateInvertedIndex continues numbering from it.
doc_id = 0
# Corpus root; the blocks live in the sub-directories "1", "2" and "3".
path = "C:\\Users\\Nuzha\\Desktop\\corpus1\\"
for block_number in range(1, 4):
    block_name = str(block_number)
    docs = os.listdir(path + block_name)
    # Show the id this block starts counting from.
    print(doc_id)
    # Index a single block at a time.
    index, doc_id = CreateInvertedIndex(docs, path + block_name + "\\")
    # Order the terms alphabetically before writing them out.
    index = {term: index[term] for term in sorted(index)}
    # Persist this block's terms and postings.
    indexfile = 'index_{}_terms.txt'.format(block_name)
    posting = 'index_{}_postings.txt'.format(block_name)
    createIndexFile(indexfile, posting, index)


0
1161
2266


In [2]:
# HELPER FUNCTIONS FOR MERGER
def Decode(posting1):
    """Undo the delta coding of one postings line, in place.

    ``posting1`` is a list of numeric strings (or ints): a leading count,
    followed by repeated groups of ``doc-id delta, n, n value deltas``.
    Doc ids are accumulated across groups; the values inside a group are
    accumulated starting from zero. The same list object, now holding ints
    with absolute values, is returned.
    """
    posting1[0] = int(posting1[0])
    total = len(posting1)
    cursor = 1
    running_doc = 0
    while cursor < total:
        running_doc += int(posting1[cursor])
        posting1[cursor] = running_doc
        cursor += 1
        if cursor == total:
            # A doc id with nothing after it ends the line.
            break
        count = int(posting1[cursor])
        posting1[cursor] = count
        cursor += 1
        running_value = 0
        for _ in range(count):
            running_value += int(posting1[cursor])
            posting1[cursor] = running_value
            cursor += 1

    return posting1

def _copy_posting_entry(posting, start, out):
    """Append the entry starting at ``posting[start]`` to ``out``.

    An entry is ``doc_id, n`` followed by ``n`` values. Returns the index of
    the first element after the entry.
    """
    end = start + 2 + max(posting[start + 1], 0)
    out.extend(posting[start:end])
    return end


def MergePostings(posting1, posting2):
    """Merge two decoded posting lists into one ordered by doc id.

    Each list is ``[doc_count, doc_id, n, v_1 .. v_n, doc_id, n, ...]`` with
    absolute (already decoded) values and ascending doc ids. Neither input is
    modified.

    When the same doc id occurs in both lists, the two entries are combined
    into one instead of stalling the merge: the first value of each entry
    (the term frequency written by the indexer) is summed, the remaining
    values (positions) are merged in ascending order, and the document count
    in the header is reduced by one.

    Returns:
        A new list in the same layout whose header is
        ``posting1[0] + posting2[0]`` minus the number of shared doc ids.
    """
    merged = [posting1[0] + posting2[0]]
    i, j = 1, 1
    end1, end2 = len(posting1), len(posting2)

    while i < end1 and j < end2:
        doc1, doc2 = posting1[i], posting2[j]
        if doc1 < doc2:
            i = _copy_posting_entry(posting1, i, merged)
        elif doc2 < doc1:
            j = _copy_posting_entry(posting2, j, merged)
        else:
            # Same document on both sides: fold the two entries together.
            stop1 = i + 2 + max(posting1[i + 1], 0)
            stop2 = j + 2 + max(posting2[j + 1], 0)
            values1 = posting1[i + 2:stop1]
            values2 = posting2[j + 2:stop2]
            if values1 and values2:
                combined = [values1[0] + values2[0]]
                combined.extend(sorted(values1[1:] + values2[1:]))
            else:
                combined = values1 or values2
            merged.append(doc1)
            merged.append(len(combined))
            merged.extend(combined)
            merged[0] -= 1
            i, j = stop1, stop2

    # At most one of the inputs still has entries; copy them unchanged.
    while i < end1:
        i = _copy_posting_entry(posting1, i, merged)
    while j < end2:
        j = _copy_posting_entry(posting2, j, merged)

    return merged

def Encode(output_posting):
    """Delta-code a decoded posting list and return it as a new list.

    The header element is copied as is. For each following group of
    ``doc_id, n, n values`` the doc id is replaced by its difference from
    the previous doc id, ``n`` is kept, and each value is replaced by its
    difference from the previous value of the same group (starting at 0).
    The input list is left unchanged.
    """
    encoded = [output_posting[0]]
    end = len(output_posting)
    cursor = 1
    last_doc = 0
    while cursor < end:
        doc = output_posting[cursor]
        encoded.append(doc - last_doc)
        last_doc = doc
        cursor += 1
        if cursor == end:
            # A trailing doc id without a value list ends the posting.
            break
        count = output_posting[cursor]
        encoded.append(count)
        cursor += 1
        last_value = 0
        for _ in range(count):
            value = output_posting[cursor]
            encoded.append(value - last_value)
            last_value = value
            cursor += 1
    return encoded

In [3]:
def MergeIndexFiles(block_count=3):
    """Merge the per-block index files into one final inverted index.

    Reads ``index_<n>_terms.txt`` and ``index_<n>_postings.txt`` for
    ``n = 1 .. block_count`` and creates ``inverted_index_terms.txt`` and
    ``inverted_index_postings.txt`` in the current directory. Each terms file
    must list its terms in ascending string order, one ``term,offset`` line
    per term, where ``offset`` locates the term's line in the matching
    postings file.

    The merge always emits the smallest term among the current heads of the
    terms files. A term found in a single block has its postings line copied
    verbatim; a term found in several blocks has those lines decoded, merged
    with ``MergePostings`` and re-encoded. Every term therefore appears
    exactly once in the output, in ascending order.

    Args:
        block_count: number of block indexes to merge (default 3).

    Raises:
        FileNotFoundError: a block's terms or postings file is missing.
        FileExistsError: one of the output files already exists.
    """
    from contextlib import ExitStack

    def read_head(term_file):
        """Return the next ``(term, offset)`` of a terms file, or None at EOF."""
        line = term_file.readline()
        if not line:
            return None
        term, _, offset = line.rstrip("\n").rpartition(",")
        return term, int(offset)

    block_numbers = range(1, block_count + 1)

    # ExitStack closes every opened handle, also when the merge fails.
    with ExitStack() as stack:
        # Inputs are opened first so that a missing block file does not
        # leave freshly created, empty output files behind.
        posting_files = [
            stack.enter_context(open("index_%d_postings.txt" % number, "r"))
            for number in block_numbers
        ]
        term_files = [
            stack.enter_context(open("index_%d_terms.txt" % number, "r"))
            for number in block_numbers
        ]
        output_in = stack.enter_context(open("inverted_index_terms.txt", "x"))
        output_post = stack.enter_context(open("inverted_index_postings.txt", "x"))

        # heads[n] is the not-yet-merged entry of block n, None once exhausted.
        heads = [read_head(term_file) for term_file in term_files]

        while True:
            live = [n for n, head in enumerate(heads) if head is not None]
            if not live:
                break

            term = min(heads[n][0] for n in live)
            sources = [n for n in live if heads[n][0] == term]

            if len(sources) == 1:
                # Only one block has the term: reuse its encoded line as is.
                block = sources[0]
                posting_files[block].seek(heads[block][1])
                line = posting_files[block].readline()
            else:
                merged = None
                for block in sources:
                    posting_files[block].seek(heads[block][1])
                    raw = posting_files[block].readline().rstrip("\n")
                    decoded = Decode(raw.split(","))
                    if merged is None:
                        merged = decoded
                    else:
                        merged = MergePostings(merged, decoded)
                line = ",".join(str(value) for value in Encode(merged)) + "\n"

            offset = output_post.tell()
            output_post.write(line)
            output_in.write(term + "," + str(offset) + "\n")

            # Advance only the blocks whose head was just consumed.
            for block in sources:
                heads[block] = read_head(term_files[block])

In [4]:
# Combine the per-block index files into inverted_index_terms.txt and
# inverted_index_postings.txt. The outputs are opened in exclusive-create
# mode, so this cell fails if they are left over from an earlier run.
MergeIndexFiles()

In [15]:
def ProcessQuery(query):
    """Return the names of the documents that contain every query term.

    The query is normalised with ``ParseText``; each resulting term is
    looked up in ``inverted_index_terms.txt``, its postings line is read
    from ``inverted_index_postings.txt`` and decoded, and the doc ids of all
    terms are intersected. The surviving ids are translated to document
    names through ``docInfo.txt``.

    Args:
        query: free-text query string.

    Returns:
        Document names in the order they are listed in ``docInfo.txt``, each
        at most once. The list is empty when the query yields no terms or
        when any of its terms is absent from the index.
    """
    wanted = set(ParseText(query))
    if not wanted:
        return []

    # term -> set of doc ids containing it
    matches = {}
    with open("inverted_index_postings.txt", "r") as postings, \
            open('inverted_index_terms.txt', 'r') as index_terms:
        # The whole terms file is scanned, so a term that is listed on more
        # than one line still has all of its documents collected.
        for line in index_terms:
            term, _, offset = line.rstrip("\n").rpartition(",")
            if term not in wanted:
                continue
            postings.seek(int(offset))
            post = Decode(postings.readline().rstrip("\n").split(","))
            doc_ids = matches.setdefault(term, set())
            cursor = 1
            for _ in range(post[0]):
                doc_ids.add(post[cursor])
                # Skip the doc id, the length field and that many values.
                cursor += post[cursor + 1] + 2

    if len(matches) < len(wanted):
        # A term with no postings cannot be satisfied by any document.
        return []

    hits = set.intersection(*matches.values())
    if not hits:
        return []

    result = []
    with open('docInfo.txt', 'r') as docinfo:
        for line in docinfo:
            info = line.split(",")
            if int(info[0]) in hits:
                result.append(info[1])
    return result

In [17]:
# Run a sample query; the last expression displays the list of matching
# document names returned by ProcessQuery.
query = 'movies fun'
result = ProcessQuery(query)
result

['clueweb12-0006wb-79-22541',
 'clueweb12-0012wb-94-31402',
 'clueweb12-0100tw-21-11869',
 'clueweb12-0100tw-25-05344',
 'clueweb12-0100tw-74-08426',
 'clueweb12-0100tw-84-14181',
 'clueweb12-0102wb-56-11337',
 'clueweb12-0104wb-88-03977',
 'clueweb12-0104wb-93-18870',
 'clueweb12-0105wb-01-30277',
 'clueweb12-0105wb-92-00243',
 'clueweb12-0109wb-16-01307',
 'clueweb12-0109wb-78-22647',
 'clueweb12-0203wb-61-27243',
 'clueweb12-0205wb-35-24050',
 'clueweb12-0205wb-50-18317',
 'clueweb12-0206wb-71-23133',
 'clueweb12-0207wb-02-00245',
 'clueweb12-0209wb-01-19807',
 'clueweb12-0209wb-43-15294',
 'clueweb12-0300tw-06-07418',
 'clueweb12-0300tw-47-10976',
 'clueweb12-0302wb-51-07386',
 'clueweb12-0306wb-91-15684',
 'clueweb12-0311wb-11-17224',
 'clueweb12-0400tw-15-07826',
 'clueweb12-0400tw-19-14132',
 'clueweb12-0400tw-77-05817',
 'clueweb12-0401wb-71-19597',
 'clueweb12-0402wb-69-17651',
 'clueweb12-0410wb-53-16290',
 'clueweb12-0506wb-00-27596',
 'clueweb12-0507wb-83-16824',
 'clueweb1