<a href="https://colab.research.google.com/github/Sidplex/College/blob/main/VectorSpace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [128]:
import glob
import math
import re
import sys
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

STOPWORDS = set(stopwords.words("english"))
document_filenames = dict()
N = 0
vocabulary = set()
postings = defaultdict(dict)
document_frequency = defaultdict(int)
length = defaultdict(float)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [114]:
from  google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [115]:
documents = glob.glob("/content/drive/MyDrive/Corpus2/*")
def readcorpus():
    global document_filenames, N
    N = len(documents)
    document_filenames = dict(zip(range(N), documents))

In [117]:
def tokenize(document):
    terms = word_tokenize(document)
    terms = [term.lower() for term in terms if term not in STOPWORDS]
    return terms

In [125]:
def remove_punct(text):
    text = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

In [126]:
def remove_digits(text):
    regex = re.compile(r"\d")
    return re.sub(regex, "", text)

In [116]:
def terms_posting():
    global vocabulary, postings
    for id in document_filenames:
        with open(document_filenames[id], "r") as f:
            document = f.read()
        document = remove_punct(document)
        document = remove_digits(document)
        terms = tokenize(document)
        unique_terms = set(terms)
        vocabulary = vocabulary.union(unique_terms)
        for term in unique_terms:
            postings[term][id] = terms.count(term)

In [118]:
def document_frequencies():
    global document_frequency
    for term in vocabulary:
        document_frequency[term] = len(postings[term])
    # print(document_frequency)

In [119]:
def document_lengths():
    global length
    for id in document_filenames:
        l = 0
        for term in vocabulary:
            l += term_frequency(term, id) ** 2
        length[id] = math.sqrt(l)
    print(length)

In [120]:
def term_frequency(term, id):
    if id in postings[term]:
        return postings[term][id]
    else:
        return 0.0

In [121]:
def inverse_document_frequency(term):
    if term in vocabulary:
        return math.log(N / document_frequency[term], 2)
    else:
        return 0.0

In [124]:
def similarity(query, id):
    similarity = 0.0
    for term in query:
        if term in vocabulary:
            similarity += term_frequency(term, id) * inverse_document_frequency(term)
    similarity = similarity / length[id]
    return similarity

In [123]:
def finalscores():
    query = tokenize("Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation")
    scores = sorted(
        [(id, similarity(query, id)) for id in range(N)],
        key=lambda x: x[1],
        reverse=True,
    )
    return scores

In [122]:
def print_scores(scores):
    print("-" * 42)
    print("| %s | %s |" % ("Score", "Document"))
    print("-" * 42)
    for (id, score) in scores:
        if score != 0.0:
            print("| %s | %s |" % (str(score)[:5], document_filenames[id]))
    print("-" * 42, end="\n\n")

In [127]:
def main():
    readcorpus()
    terms_posting()
    document_frequencies()
    document_lengths()
    scores = finalscores()
    print(terms_posting())
    print(document_frequencies())
    print(document_lengths())
    # print_scores(scores)
if __name__ == "__main__":
    main()

defaultdict(<class 'float'>, {0: 30.166206257996713, 1: 43.139309220245984, 2: 39.44616584663204, 3: 200.86811593680068, 4: 65.43699259593154, 5: 57.74945887192364, 6: 89.0, 7: 63.387695966961914, 8: 37.33630940518894, 9: 52.3354564325181, 10: 60.37383539249432, 11: 44.799553569204235, 12: 44.21538193886829, 13: 54.56189146281496, 14: 37.97367509209505, 15: 35.4259791678367, 16: 38.63935817272331, 17: 53.36665625650534, 18: 37.282703764614496, 19: 30.740852297878796, 20: 46.52956049652737, 21: 51.884487084291386, 22: 38.17066936798463, 23: 51.01960407529639, 24: 41.376321731154405, 25: 53.0, 26: 53.53503525729669, 27: 43.76071297408213, 28: 44.76605857119878, 29: 31.85906464414798, 30: 46.04345773288535, 31: 44.76605857119878, 32: 41.376321731154405, 33: 34.51086785347479, 34: 34.02939905434711, 35: 36.72873534441391, 36: 40.70626487409524, 37: 29.34280150224242, 38: 28.861739379323623, 39: 29.189039038652847, 40: 28.513154858766505})
None
None
defaultdict(<class 'float'>, {0: 30.16620