In [5]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk import flatten
from collections import Counter

In [6]:
import os
from os.path import join
import io
import ntpath
from nltk import flatten
import math
import pickle
import ast
import nltk
import re
import sys

## Utility functions for Preprocessing

In [7]:
def scrub_special_characters(string,with_brackets = True,with_space = False):
    """Remove punctuation / special characters from ``string``.

    Parameters
    ----------
    string : str
        Text to clean.
    with_brackets : bool, default True
        When True the round brackets ``(`` and ``)`` are stripped as well.
        When False they are kept; square, curly and angle brackets are
        stripped in both modes.
    with_space : bool, default False
        When True every stripped character is replaced by one space,
        otherwise it is deleted.

    Returns
    -------
    str
        ``string`` with the special characters removed or replaced.
    """
    # The backslash is written once as "\\"; the previous literal also
    # contained "\|", which is an invalid escape sequence (a SyntaxWarning on
    # recent Python versions). The resulting character set is unchanged.
    special = '@"+-=\\_!#$%^.,&*<>?/|}{~:;[]'
    if with_brackets:
        special += '()'
    special = frozenset(special)  # constant-time membership tests
    filler = ' ' if with_space else ''
    return ''.join(filler if ch in special else ch for ch in string)

In [8]:
def white_space_tokenizer(string):
    """Tokenize ``string`` with NLTK's ``WhitespaceTokenizer``.

    Returns whatever ``WhitespaceTokenizer.tokenize`` produces for the input
    (a list of token strings).
    """
    return WhitespaceTokenizer().tokenize(string)

In [9]:
def remove_stop_words(x,with_boolean_connectives=True):
    """Drop English stop words from a token sequence.

    Parameters
    ----------
    x : str or iterable of str
        A string (split on whitespace first) or any iterable of tokens.
        Previously only exact ``list`` instances were accepted as token
        sequences; tuples, generators and list subclasses now work too.
    with_boolean_connectives : bool, default True
        When True the words ``and``, ``or`` and ``not`` are treated as stop
        words like any other. When False they are kept in the output.

    Returns
    -------
    list
        The tokens of ``x`` that are not stop words, in their original order.
    """
    if isinstance(x, (str, bytes)):
        x = x.split()
    stop_words = set(stopwords.words('english'))
    if not with_boolean_connectives:
        stop_words -= {'and','or','not'}
    # Matching is exact (case-sensitive) against NLTK's stop-word list.
    return [token for token in x if token not in stop_words]

In [10]:
def porter_stemmer(x):
    """Stem every token of ``x`` with NLTK's ``PorterStemmer``.

    Parameters
    ----------
    x : str or iterable of str
        A string (split on whitespace first) or any iterable of tokens.
        Previously only exact ``list`` instances were accepted as token
        sequences; tuples, generators and list subclasses now work too.

    Returns
    -------
    list
        One stem per input token, in input order.
    """
    if isinstance(x, (str, bytes)):
        x = x.split()
    stemmer = PorterStemmer()  # one stemmer instance shared by all tokens of this call
    return [stemmer.stem(token) for token in x]

In [11]:
def remove_non_ascii(string):
    """Return ``string`` with every non-ASCII character dropped."""
    # Characters that cannot be encoded as ASCII are silently skipped.
    ascii_only = string.encode("ascii", "ignore")
    return ascii_only.decode()

In [12]:
def extract_path_name(path):
    """Return the last component of ``path``.

    A trailing separator is tolerated: for ``"dir/"`` the result is ``"dir"``.
    Both ``/`` and ``\\`` are treated as separators (``ntpath`` rules).
    """
    parent, leaf = ntpath.split(path)
    if leaf:
        return leaf
    # Path ended with a separator, so the name is the last part of the head.
    return ntpath.basename(parent)

In [13]:
def preprocess(string, with_stop_words=False):
    """Turn raw text into a list of stemmed tokens.

    Pipeline: drop non-ASCII characters, remove apostrophes/backticks,
    tokenize on whitespace, scrub special characters, optionally remove stop
    words, stem, and finally discard stems of length 1 or less.

    Parameters
    ----------
    string : str
        Raw document (or query) text.
    with_stop_words : bool, default False
        When True stop words are kept; when False they are removed via
        ``remove_stop_words``.

    Returns
    -------
    list
        Stemmed tokens in document order.
    """
    text = remove_non_ascii(string)
    # Possessive "'s" becomes a separator; remaining apostrophes and
    # backticks are simply deleted.
    text = text.replace("'s", " ").replace("'", "").replace("`", "")

    raw_tokens = white_space_tokenizer(text)

    # Scrub each token exactly once and drop tokens that were pure punctuation
    # (the previous version scrubbed every token twice).
    scrubbed = [token for token in map(scrub_special_characters, raw_tokens) if token]

    # NOTE(review): stop-word matching happens before any lower-casing, so
    # capitalised stop words (e.g. "The") are presumably not removed here --
    # confirm whether that is intended.
    kept = scrubbed if with_stop_words else remove_stop_words(scrubbed)

    # Stem the whole list in one call: one PorterStemmer for the document
    # instead of two per token. Tokens contain no whitespace at this point, so
    # the result equals the flattened per-token result.
    stems = porter_stemmer(kept)

    return [stem for stem in stems if len(stem) > 1]

## Preprocessing

In [14]:
def core_preprocess(dictionary,input_dir = "english-corpora/"):
    """Preprocess every file in ``input_dir`` and store its tokens.

    Parameters
    ----------
    dictionary : dict
        Filled in place: file name (as returned by ``os.listdir``) ->
        token list returned by ``preprocess``.
    input_dir : str, default "english-corpora/"
        Directory holding the corpus files.

    Raises
    ------
    SystemExit
        With exit code 1 (after printing a message) when ``input_dir`` does
        not exist.
    """
    if not os.path.exists(input_dir):  # check if path given is valid or not
        print("file path incorrect!!!")
        sys.exit(1)

    for doc in os.listdir(input_dir):
        # The context manager closes each file as soon as it has been read;
        # previously every handle stayed open for the lifetime of the kernel.
        with io.open(join(input_dir, str(doc)), 'r', encoding='utf-8', errors='ignore') as doc_obj:
            text = doc_obj.read()
        dictionary[doc] = preprocess(text)

In [15]:
#doc_dictionary = pickle.load(open('qurel_pickle.p', "rb"))

In [16]:
# Module-level map of file name -> preprocessed token list. core_preprocess
# fills it in place; the index classes below read this global by name, so this
# cell must run before any generate_index() call.
doc_dictionary = {}
core_preprocess(doc_dictionary)

## Boolean Information Retrieval

In [18]:
class Boolean_Information_Retrieval:
    """Inverted index for boolean retrieval.

    Attributes
    ----------
    dictionary : dict
        stemmed token -> ``(offset, document_frequency)``. ``offset`` is
        ``None`` until ``save_postings`` has written the postings file.
    postings : dict
        stemmed token -> list of documents containing it.
    total_docs : list
        Unique document names, in indexing order.

    The containers are created per instance in ``__init__``; they used to be
    mutable class attributes and were therefore shared by every instance.
    """

    def __init__(self,input_dir = "english-corpora/"): # constructing output file paths with respect to input given
        self.input_dir = input_dir
        self.output_dictionary_file_name = extract_path_name(input_dir)+"_dictionary.p"
        self.output_postings_file_name = extract_path_name(input_dir)+"_postings.txt"
        self.dictionary = {}
        self.postings = {}
        self.total_docs = []

    def generate_index(self, documents=None): # build index
        """Build ``dictionary`` and ``postings``.

        Parameters
        ----------
        documents : dict, optional
            document name -> token list. Defaults to the module-level
            ``doc_dictionary``.
        """
        if documents is None:
            documents = doc_dictionary
        for doc, tokens in documents.items():
            self.total_docs.append(doc)
            for token in tokens:
                if token not in self.dictionary:
                    self.dictionary[token] = (None, 1)
                    self.postings[token] = [doc]
                # Documents are processed one after another, so a document
                # that is already recorded can only be the last posting.
                elif self.postings[token][-1] != doc:
                    self.dictionary[token] = (None, self.dictionary[token][1] + 1)
                    self.postings[token].append(doc)
        self.total_docs = list(dict.fromkeys(self.total_docs))  # de-duplicate, keep order

    # Write dictionary and index
    def save_postings(self):
        """Write one postings list per line, tokens in sorted order.

        The position of each token's line (as reported by ``tell()``) is
        stored as the offset in ``dictionary[token]``.
        """
        offset = 0
        with open(self.output_postings_file_name, "w") as postings_file:
            for token in sorted(self.dictionary):
                self.dictionary[token] = (offset, self.dictionary[token][1]) # storing offset in dictionary
                postings_file.write(str(self.postings[token]) + '\n') # writing documents list to postings file
                offset = postings_file.tell()

    def save_dictionary(self): # Dumps dictionary using pickle
        """Pickle ``dictionary`` to ``output_dictionary_file_name``."""
        with open(self.output_dictionary_file_name, "wb") as dictionary_file:
            pickle.dump(self.dictionary, dictionary_file)

    def clear(self):  # clears all variables
        """Empty every index container of this instance."""
        self.dictionary.clear()
        self.postings.clear()
        self.total_docs.clear()

In [19]:
# Build the boolean index from doc_dictionary and persist it.
# save_postings() must run before save_dictionary(): it stores each token's
# postings-file offset in the dictionary that save_dictionary() then pickles.
bir = Boolean_Information_Retrieval()
bir.generate_index()
bir.save_postings()
bir.save_dictionary()
bir.clear()  # release the in-memory index once it is on disk

## TF-IDF Information Retrieval

In [21]:
class Tf_Idf:
    """TF-IDF index over the preprocessed corpus.

    Attributes
    ----------
    tf : dict
        document -> {token: term frequency normalised by document length}.
    tf_idf : dict
        document -> {token: tf-idf weight}.
    total_docs : list
        Unique document names, in indexing order.
    vocabulary : set
        Every stemmed token seen in the corpus.
    inverse_document_freq : dict
        token -> idf value.
    df : dict
        token -> number of documents containing the token.

    The containers are created per instance in ``__init__``; they used to be
    mutable class attributes and were therefore shared by every instance.
    """

    def __init__(self,input_dir = "english-corpora/"): # constructing output file paths with respect to input given
        self.input_dir = input_dir
        self.output_dictionary_file_name = extract_path_name(input_dir)+"_tfidf_dictionary.p"
        self.output_idf_file_name = extract_path_name(input_dir)+"_idf_dictionary.p"
        self.tf = {}
        self.tf_idf = {}
        self.total_docs = []
        self.vocabulary = set()
        self.inverse_document_freq = {}
        self.df = {}

    def generate_index(self, documents=None):  # build index
        """Compute tf, df, idf and tf-idf for every document.

        Parameters
        ----------
        documents : dict, optional
            document name -> token list. Defaults to the module-level
            ``doc_dictionary``.

        Note: ``df`` is accumulated by ``cal_tf``, so indexing the same
        documents twice on one instance counts them twice; call ``clear()``
        first when rebuilding.
        """
        if documents is None:
            documents = doc_dictionary
        for doc, tokens in documents.items():
            self.total_docs.append(doc)
            self.vocabulary.update(tokens)      # update the vocabulary
            self.tf[doc] = self.cal_tf(tokens)  # tf values of this document; also updates df
        self.total_docs = list(dict.fromkeys(self.total_docs))  # making total documents list unique
        self.idf()                # calculate idf value for each token
        self.tf_idf_calculator()  # combine the tf values with the idf values

    def cal_tf(self,tokens):  # Term Frequency calculator
        """Return {token: count / len(tokens)} and bump ``df`` once per distinct token."""
        total = len(tokens)
        # An empty token list yields an empty Counter, so no division happens.
        frequencies = {term: count / total for term, count in Counter(tokens).items()}
        for term in frequencies:
            self.df[term] = self.df.get(term, 0) + 1
        return frequencies

    def idf(self):        # Inverse Document Frequency calculator
        """Fill ``inverse_document_freq`` with log(N / (df + 1)) per token."""
        num_docs = len(self.total_docs)
        for term, doc_freq in self.df.items():
            # With the +1 in the denominator a term present in every document
            # gets a slightly negative weight.
            self.inverse_document_freq[term] = math.log(num_docs / (doc_freq + 1))

    def tf_idf_calculator(self): #tf_idf calculator
        """Fill ``tf_idf`` with idf * (tf / sum of the document's tf values)."""
        for doc, term_freqs in self.tf.items():
            # cal_tf already normalises by document length, so this sum is ~1.0;
            # the division keeps the weights normalised if cal_tf is ever
            # switched to raw counts.
            doc_len = sum(term_freqs.values())
            self.tf_idf[doc] = {
                token: self.inverse_document_freq[token] * (freq / doc_len)
                for token, freq in term_freqs.items()
            }

    @staticmethod
    def _dump_pickle(obj, path):
        """Pickle ``obj`` to ``path`` and close the file."""
        with open(path, "wb") as handle:
            pickle.dump(obj, handle)

    def save_dictionary(self):  # Dumps dictionary using pickle
        """Pickle tf, df, tf-idf and idf to ``tf.p``, ``df.p``, ``tf_idf.p`` and ``idf.p``."""
        self._dump_pickle(self.tf, 'tf.p')
        self._dump_pickle(self.df, 'df.p')
        self._dump_pickle(self.tf_idf, 'tf_idf.p')
        self._dump_pickle(self.inverse_document_freq, 'idf.p')

    def clear(self):  # clears all variables
        """Empty every index container of this instance."""
        self.tf.clear()
        self.tf_idf.clear()
        self.total_docs.clear()
        self.vocabulary.clear()
        self.inverse_document_freq.clear()
        self.df.clear()

In [22]:
# Build the TF-IDF index from doc_dictionary and persist it.
# save_dictionary() writes tf.p and df.p, which the BM25 class below loads,
# so this cell has to run before the BM25 cells.
tf = Tf_Idf()
tf.generate_index()
tf.save_dictionary()
tf.clear()  # release the in-memory index once it is on disk

## BM25 Information Retrieval

In [25]:
class BM25:
    """Pre-computes the corpus statistics needed for BM25 scoring.

    Attributes
    ----------
    dictionary : dict
        Reserved: document -> {token: term frequency}. Not filled by this class.
    total_docs : list
        Reserved: list of unique documents. Not filled by this class.
    vocabulary : set
        Reserved: set of stemmed words. Not filled by this class.
    inverse_document_freq : dict
        token -> BM25 idf value.
    tf : dict
        Contents of ``tf.p``.
    df : dict
        Contents of ``df.p`` (token -> document frequency).
    doc_lens : dict
        document -> number of tokens.

    ``tf.p`` and ``df.p`` are now read when an instance is created. They used
    to be read in the class body, which performed file I/O while the class was
    being defined, never closed the files, and shared the loaded dicts between
    all instances (so ``clear()`` on one emptied them for every later one).
    """

    def __init__(self,input_dir = "english-corpora/"): # constructing output file paths with respect to input given
        self.input_dir = input_dir
        self.output_dictionary_file_name = extract_path_name(input_dir)+"_bm25_dictionary.p"
        self.output_idf_file_name = extract_path_name(input_dir)+"_bm25_idf_dictionary.p"
        self.dictionary = {}
        self.total_docs = []
        self.vocabulary = set()
        self.inverse_document_freq = {}
        self.doc_lens = {}
        self.tf = self._load_pickle('tf.p')
        self.df = self._load_pickle('df.p')

    @staticmethod
    def _load_pickle(path):
        """Unpickle ``path`` and close the file."""
        # SECURITY: pickle.load can execute arbitrary code. Only load files
        # produced by this notebook, never files from an untrusted source.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    @staticmethod
    def _dump_pickle(obj, path):
        """Pickle ``obj`` to ``path`` and close the file."""
        with open(path, "wb") as handle:
            pickle.dump(obj, handle)

    def generate_index(self, documents=None):  # build index
        """Record every document's length, then compute the idf table.

        Parameters
        ----------
        documents : dict, optional
            document name -> token list. Defaults to the module-level
            ``doc_dictionary``.
        """
        if documents is None:
            documents = doc_dictionary
        for doc, tokens in documents.items():
            self.doc_lens[doc] = len(tokens)
        self.idf()

    def idf(self, num_docs=None):
        """Fill ``inverse_document_freq`` with log((N - df + 0.5) / (df + 0.5) + 1).

        Parameters
        ----------
        num_docs : int, optional
            Corpus size N. Defaults to the number of documents recorded in
            ``doc_lens``; when that is still empty the size of the
            module-level ``doc_dictionary`` is used.
        """
        if num_docs is None:
            num_docs = len(self.doc_lens) or len(doc_dictionary)
        for term, doc_freq in self.df.items():
            self.inverse_document_freq[term] = math.log(((num_docs - doc_freq + 0.5)/(doc_freq + 0.5)) + 1)

    def save_idf(self):           # Dumps idf dictionary and document lengths dictionary using pickle
        """Pickle the idf table to ``idf_bm25.p`` and the lengths to ``doc_lens.p``."""
        self._dump_pickle(self.inverse_document_freq, 'idf_bm25.p')
        self._dump_pickle(self.doc_lens, 'doc_lens.p')

    def clear(self):             # clears all variables
        """Empty every container of this instance."""
        self.dictionary.clear()
        self.total_docs.clear()
        self.vocabulary.clear()
        self.inverse_document_freq.clear()
        self.tf.clear()
        self.df.clear()
        self.doc_lens.clear()

In [26]:
# Compute the BM25 statistics from doc_dictionary and persist them.
# save_idf() writes idf_bm25.p (idf table) and doc_lens.p (document lengths).
bm = BM25()
bm.generate_index()
bm.save_idf()
bm.clear()  # release the in-memory statistics once they are on disk