<a href="https://colab.research.google.com/github/Sidplex/College/blob/main/VectorSpace2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [129]:
import glob
import math
import re
import sys
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

STOPWORDS = set(stopwords.words("english"))
document_filenames = dict()
N = 0
vocabulary = set()
postings = defaultdict(dict)
document_frequency = defaultdict(int)
length = defaultdict(float)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [130]:
from  google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [131]:
documents = glob.glob("/content/drive/MyDrive/Corpus2/*")
def readcorpus():
    global document_filenames, N
    N = len(documents)
    document_filenames = dict(zip(range(N), documents))

In [132]:
def tokenize(document):
    terms = word_tokenize(document)
    terms = [term.lower() for term in terms if term not in STOPWORDS]
    return terms

In [133]:
def remove_punct(text):
    text = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

In [134]:
def remove_digits(text):
    regex = re.compile(r"\d")
    return re.sub(regex, "", text)

In [135]:
def terms_posting():
    global vocabulary, postings
    for id in document_filenames:
        with open(document_filenames[id], "r") as f:
            document = f.read()
        document = remove_punct(document)
        document = remove_digits(document)
        terms = tokenize(document)
        unique_terms = set(terms)
        vocabulary = vocabulary.union(unique_terms)
        for term in unique_terms:
            postings[term][id] = terms.count(term)

In [136]:
def document_frequencies():
    global document_frequency
    for term in vocabulary:
        document_frequency[term] = len(postings[term])
    # print(document_frequency)

In [137]:
def document_lengths():
    global length
    for id in document_filenames:
        l = 0
        for term in vocabulary:
            l += term_frequency(term, id) ** 2
        length[id] = math.sqrt(l)
    # print(length)

In [138]:
def term_frequency(term, id):
    if id in postings[term]:
        return postings[term][id]
    else:
        return 0.0

In [139]:
def inverse_document_frequency(term):
    if term in vocabulary:
        return math.log(N / document_frequency[term], 2)
    else:
        return 0.0

In [140]:
def similarity(query, id):
    similarity = 0.0
    for term in query:
        if term in vocabulary:
            similarity += term_frequency(term, id) * inverse_document_frequency(term)
    similarity = similarity / length[id]
    return similarity

In [141]:
def finalscores():
    query = tokenize("Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation")
    scores = sorted(
        [(id, similarity(query, id)) for id in range(N)],
        key=lambda x: x[1],
        reverse=True,
    )
    return scores

In [142]:
def print_scores(scores):
    print("-" * 42)
    print("| %s | %s |" % ("Score", "Document"))
    print("-" * 42)
    for (id, score) in scores:
        if score != 0.0:
            print("| %s | %s |" % (str(score)[:5], document_filenames[id]))
    print("-" * 42, end="\n\n")

In [143]:
def main():
    readcorpus()
    terms_posting()
    document_frequencies()
    document_lengths()
    scores = finalscores()
    # print(terms_posting())
    # print(document_frequencies())
    # print(document_lengths())
    print_scores(scores)
if __name__ == "__main__":
    main()

------------------------------------------
| Score | Document |
------------------------------------------
| 4.858 | /content/drive/MyDrive/Corpus2/zomato.txt |
| 1.617 | /content/drive/MyDrive/Corpus2/swiggy.txt |
| 0.936 | /content/drive/MyDrive/Corpus2/instagram.txt |
| 0.643 | /content/drive/MyDrive/Corpus2/messenger.txt |
| 0.608 | /content/drive/MyDrive/Corpus2/flipkart.txt |
| 0.589 | /content/drive/MyDrive/Corpus2/reddit.txt |
| 0.385 | /content/drive/MyDrive/Corpus2/nike.txt |
| 0.339 | /content/drive/MyDrive/Corpus2/shakespeare.txt |
| 0.336 | /content/drive/MyDrive/Corpus2/Discord.txt |
| 0.310 | /content/drive/MyDrive/Corpus2/paypal.txt |
| 0.309 | /content/drive/MyDrive/Corpus2/Amazon.txt |
| 0.283 | /content/drive/MyDrive/Corpus2/sony.txt |
| 0.274 | /content/drive/MyDrive/Corpus2/reliance.txt |
| 0.242 | /content/drive/MyDrive/Corpus2/skype.txt |
| 0.234 | /content/drive/MyDrive/Corpus2/steam.txt |
| 0.225 | /content/drive/MyDrive/Corpus2/samsung.txt |
| 0.207 | /content