In [7]:
import glob
import math
import re
import sys
from collections import defaultdict
from functools import reduce

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

STOPWORDS = set(stopwords.words("english"))
CORPUS = "D:/Shantanu/Docs/*"


# Each document has an id which are keys in the dictionary below.
doc_files = dict()

N = 0

# vocabulary of the corpus
vocabulary = set()

# postings list is made which is a dict whose keys are the document ids of documents and the corresponding values as term frequency
postings = defaultdict(dict)

# document_frequency is a defaultdict whose keys are terms and corresponding values are number of documents
document_frequency = defaultdict(int)

length = defaultdict(float)

   
# To fetch the details about corpus
def get_corpus():
    global doc_files,N
    documents = glob.glob(CORPUS)
    N = len(documents)
    doc_files = dict(zip(range(N), documents))

    
# Initialising terms and postings for the corpus   
def inp():
    global vocabulary,postings
    for id in doc_files:
        with open(doc_files[id], "r") as f:
            document = f.read()
        document = remove_special_characters(document)
        document = remove_digits(document)
        terms = tokenize(document)
        unique_terms = set(terms)
        vocabulary = vocabulary.union(unique_terms)
        for term in unique_terms:
            postings[term][id] = terms.count(term)


def tokenize(document):
    # using nltk tokenize library
    terms = word_tokenize(document)
    # Remove stopwords and convert terms to lowercase
    terms = [term.lower() for term in terms if term not in STOPWORDS]
    return terms

# Set document frequencies for all the terms in vocabulary
def indf():
    global document_frequency
    for term in vocabulary:
        document_frequency[term] = len(postings[term])

#calculate length for each doc for normalization
def initialize_lengths():
    global length
    for id in doc_files:
        l = 0
        for term in vocabulary:
            l += tf(term, id) ** 2
        length[id] = math.sqrt(l)


def tf(term, id):
    # This gives the term frequency of term in document id using the function : 1 + log(tf).
    # It returns 0 if the term isn't present in the document.
    # The base of log is 10
    if id in postings[term]:
        return (1 + math.log(N/postings[term][id], 10))
    else:
        return 0.0


def idf(term):
    # Returns the inverse document frequency of term.
    # The base of log is 10
    if term in vocabulary:
        return math.log(N / document_frequency[term], 10)
    else:
        return 0.0


def print_scores(scores):
    print("-" * 42)
    print("| %s | %-30s |" % ("Score", "Document"))
    print("-" * 42)
    c = 1
    for (id, score) in scores:
        if c>10:
            break
        if score != 0.0:
            c+=1
            print("| %s | %-30s |" % (str(score)[:5], doc_files[id]))

    print("-" * 42, end="\n\n")


def do_search():
    query = tokenize(input(" Enter your query : "))

    # Exit if query is empty
    if query == []:
        sys.exit()

    scores = sorted(
        [(id, cosine_similarity(query, id)) for id in range(N)],
        key=lambda x: x[1],
        reverse=True,
    )

    return scores


def intersection(sets):
    return reduce(set.intersection, [s for s in sets])


def cosine_similarity(query, id):
    """ 
    This gives the cosine_similarity between query and document id.
    """
    cosine_similarity = 0.0

    for term in query:
        if term in vocabulary:
            # calculate tf-idf score of the term and add to cosine_similarity
            cosine_similarity += tf(term, id) * idf(term)

    cosine_similarity = cosine_similarity / length[id]

    return cosine_similarity


def remove_special_characters(text):
    # Removes special characters using regex  
    regex = re.compile(r"[^a-zA-Z0-9\s]")
    return re.sub(regex, "", text)


def remove_digits(text):
    #Removes digits using regex 
    regex = re.compile(r"\d")
    return re.sub(regex, "", text)


get_corpus()
#initialize term and posting list
inp()
#intialize doc frequency
indf()
initialize_lengths()

while True:
    scores = do_search()
    print_scores(scores)

 Enter your query :  japan ruled korea


------------------------------------------
| Score | Document                       |
------------------------------------------
| 0.200 | D:/Shantanu/Docs\T01.txt       |
| 0.134 | D:/Shantanu/Docs\T02.txt       |
| 0.096 | D:/Shantanu/Docs\T09.txt       |
| 0.088 | D:/Shantanu/Docs\T12.txt       |
| 0.076 | D:/Shantanu/Docs\T23.txt       |
| 0.069 | D:/Shantanu/Docs\T08.txt       |
| 0.064 | D:/Shantanu/Docs\T05.txt       |
| 0.026 | D:/Shantanu/Docs\T10.txt       |
| 0.025 | D:/Shantanu/Docs\T06.txt       |
| 0.024 | D:/Shantanu/Docs\T14.txt       |
------------------------------------------



 Enter your query :  japan korea


------------------------------------------
| Score | Document                       |
------------------------------------------
| 0.096 | D:/Shantanu/Docs\T09.txt       |
| 0.088 | D:/Shantanu/Docs\T12.txt       |
| 0.076 | D:/Shantanu/Docs\T23.txt       |
| 0.075 | D:/Shantanu/Docs\T01.txt       |
| 0.069 | D:/Shantanu/Docs\T08.txt       |
| 0.064 | D:/Shantanu/Docs\T05.txt       |
| 0.051 | D:/Shantanu/Docs\T02.txt       |
| 0.026 | D:/Shantanu/Docs\T10.txt       |
| 0.025 | D:/Shantanu/Docs\T06.txt       |
| 0.024 | D:/Shantanu/Docs\T14.txt       |
------------------------------------------



 Enter your query :  hard-fought battles


------------------------------------------
| Score | Document                       |
------------------------------------------
| 0.105 | D:/Shantanu/Docs\T35.txt       |
| 0.087 | D:/Shantanu/Docs\T31.txt       |
| 0.079 | D:/Shantanu/Docs\T37.txt       |
| 0.068 | D:/Shantanu/Docs\T26.txt       |
| 0.065 | D:/Shantanu/Docs\T18.txt       |
------------------------------------------



 Enter your query :  General Walker


------------------------------------------
| Score | Document                       |
------------------------------------------
| 0.078 | D:/Shantanu/Docs\T16.txt       |
| 0.076 | D:/Shantanu/Docs\T35.txt       |
| 0.064 | D:/Shantanu/Docs\T13.txt       |
| 0.058 | D:/Shantanu/Docs\T38.txt       |
| 0.055 | D:/Shantanu/Docs\T33.txt       |
| 0.053 | D:/Shantanu/Docs\T14.txt       |
| 0.051 | D:/Shantanu/Docs\T21.txt       |
| 0.050 | D:/Shantanu/Docs\T15.txt       |
| 0.049 | D:/Shantanu/Docs\T17.txt       |
| 0.049 | D:/Shantanu/Docs\T20.txt       |
------------------------------------------



 Enter your query :  


SystemExit: 