In [2]:
import os
import math

In [3]:
#This function displays the menu as follows
#    1. Indexing
#    3. Exit

def printMenu():
    print('Menu:')
    print('Please enter 1 for indexing, 2 to search, and 3 to exit')
    result_menu = int(input('1. Indexzing\n2. Searching\n3. Exit\n\n'))
    return result_menu
printMenu()

Menu:
Please enter 1 for indexing, 2 to search, and 3 to exit
1. Indexzing
2. Searching
3. Exit

1


1

In [5]:
# This function takes a text file as input and replaces all punctuations into blank, " "
# Input: text
# Output: text with no punctuations

def punctuationsRemoval(text: str) -> str:
    """
    (str) -> str
    Return a text after replacing any punctuations to blank, " ".
    
    >>>punctuationsRemoval('abcdefg##@@higklmn,*()%opqr""stu')
    'abcdefg    higklmn     opqr  stu'
    """
    PUNCTUATIONS = ["!", "(", ")", "—", "–", "-", "[", "]", "{", "}", ";", ":", "​", "•", "\'", '\"', "‘", "’",  "\\", ",", "<", ">", ".", "…", "/", '?', '@', "#", "$", "%", "^", "&", "+", "*", "=", "_", "~"]
    
    clear_text = ""
    for word in text:
        clear_text += word if word not in PUNCTUATIONS else " "
            # if a variable "word" is not punctuation, then append "word" itself; otherwise, append blank.
    
    return clear_text

In [7]:
# This function takes a text as input and removes all stopwords.
# Input: text
# Output: text with nostop words
def stopWordRemoval(text: str) -> list:
    """
    (str) -> list
    Return word list based on the input data(txt format), which are excluded stored words in 'Stop_Words.txt'.
    Before running this function, should be removed punctuations from the text.
    
    >>>stopWordRemoval("The monkeys jump on the bed.")
    ['monkeys', 'jump', 'bed.']
    """
    word_list = list(filter(lambda word: word != "", text.split(" ")))
        # excluding "" from the text after splitting by the blnk and change a word into a lower case
   
    with open('Stop_Words.txt', 'r') as f: # load stopwords file
        Stop_Words = f.read()
    
    remove_list = [i.strip("''").strip('""') for i in Stop_Words.split(", ")] # clear format
    clean_list = list(filter(lambda word: word not in remove_list, word_list)) # excluding the removewords from word_list
        
    return clean_list

stopWordRemoval("The monkeys jump on the bed.")

['The', 'monkeys', 'jump', 'bed.']

In [9]:
def appendTermDocFreq(docid: int, cleanText: list, termDocFreqFile: file_name):
    """
    Appends TermDocFreqFile with the term(lower case), the document number, and their frequency.
    The format is like below.
    
    ontario 1 2\n
    government 1 3\n
    """
    term_freq = {} # format -> {term: freqency}
    for word in cleanText:
        word = word.lower() # every word changes lower case
        if word not in term_freq:
            term_freq[word] = 1 
        else:
            term_freq[word] += 1
            
    append_text = ''
    for k, v in term_freq.items():
        append_text += '{} {} {}\n'.format(k, docid, v)
    
    termDocFreqFile.write(append_text)

In [10]:
def genIndex(termDocFreqFile):
    index_file = {}
    # Format -> {term_01: {doc#: freq, doc#: freq,...}, term_02:{doc#: freq, ...}}
    
    termDocFreqFile = open("TermDocFreq.txt", 'r', encoding='utf-8')    
    for line in termDocFreqFile: # read text document line by line
        read = line[:-1].split(" ") # apply split method after removing the last word('\n') of each
        if read[0] not in index_file: # read[0], read[1], read[2] = term, doc#, freq
            index_file[read[0]] = {read[1]: read[2]}    
        else:
            index_file[read[0]][read[1]] = read[2]
    
    return index_file

In [11]:
def readFolderContent():
    files = []
    file_list = os.listdir('dataset')
    for filename in sorted(file_list):
        with open('dataset' + '/' + filename, 'r', encoding='utf-8') as infile:
            files.append(infile.read())
    return files

In [12]:
def indexing():
    termDocFreqFile = open("TermDocFreq.txt", 'w', encoding='utf-8') 
        
    # readFolderContent is called to create a list of files.
    files = readFolderContent()
    id=1
    for file in files:        
        puncRemoved = punctuationsRemoval(file) # remove all punctuations
        stopWordsRemoved = stopWordRemoval(puncRemoved) # remove all stop words.
        appendTermDocFreq(id, stopWordsRemoved, termDocFreqFile)  # Call appendTermDocFreq function to append to termDocFreqFile
        id += 1
    
    global global_index_file
    global_index_file = genIndex(termDocFreqFile) # Call genIndex function to append to the global index file
    termDocFreqFile.close() 

In [32]:
def search(query: str) -> str:
    """
    Returns the document with the highest score as the most relevant one.
    Before running this function, must run "index()" function at first.
    """
    query_words = stopWordRemoval(punctuationsRemoval(query.lower())) # deal with punctuations and unnecessary words.
    
    file_list = os.listdir('dataset')
    num_docs_total = len(file_list) # the total number of documents in the collection 
    global scores
    scores = {} # {document_number: scores}, which will be updated
    for query in query_words:
        num_docs_query= len([doc for doc in global_index_file[query].keys()]) # number of documents that contain query word
        for doc, freq in global_index_file[query].items(): # document number, query frequency per document
            scores[int(doc)] = 0 if int(doc) not in scores else scores[int(doc)]
            scores[int(doc)] += int(freq) * math.log(num_docs_total/num_docs_query)

    most_relevant_doc_num = max(scores, key=scores.get)
    file_name = 'dataset/'+ file_list[most_relevant_doc_num-1]
    
    with open(file_name, 'r', encoding='utf-8') as f:
        most_relevant_document = f.read()
    
    return most_relevant_document

If I obeyed your instruction or its formula on MS2, a programmer might execute the "for" loop against all files, just like this.


    for D in range(len(os.listdir('dataset'))):
    ...


However, the above code repeats all files, even if query words are not contained. I think it takes more time, so I tried to calculate the score of query each word at first. That is, the computer does not need to search around all the files to find user-specified query words. If a file(document) contains multiple query words, the score of each words are adding up (referring to <b>code line 13&14</b>).

In [33]:
def main():
    option=printMenu()
    if option == 1:
        indexing()
    elif option == 2:
        query= input("What's your query word or sentence?\n")
        return search(query)

#if __name__ == "__main__":
#       main()

In [34]:
main()

Menu:
Please enter 1 for indexing, 2 to search, and 3 to exit
1. Indexzing
2. Searching
3. Exit

2
What's your query word or sentence?
Ontario document


'London, Ont., Mayor Joe Fontana says in retrospect it was "stupid" of him to alter a document he submitted for expenses while he was a Liberal member of Parliament, but insists it was no forgery.Fontana took the stand Wednesday in his own defence after pleading not guilty to fraud, breach of trust and uttering forged documents from his time as a cabinet minister.He admitted making seven changes — including whiting out his wife\'s signature and replacing it with his own — to an existing contract for a hall rental for his son\'s 2005 wedding to reflect an event he planned for then-finance minister Ralph Goodale at the same venue.Other alterations on the contract were changing the date of the event from June 25, 2005 to Feb. 25, 2004, the word "wedding" to "reception" and the addition of a yellow sticky note saying "misc constituents reception."The event didn\'t end up going ahead at the Marconi Club, but Fontana believed the club was owed a $1,700 deposit from his MP budget. Since he ha