In [None]:
from collections import Counter

import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

# word_tokenize relies on the Punkt tokenizer data. Older NLTK releases load it
# from the 'punkt' package, newer releases from 'punkt_tab'; requesting both
# keeps the notebook runnable on a fresh environment with either version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Toy corpus for the retrieval demo, followed by the query it will be ranked against
documents = [
    "I love programming in Python, JAVA, and JavaScript because these languages are versatile and powerful.",
    "Python is great for data science and automation",
    "Python programming is fun and incredibly rewarding",
]
query = "Python or javascript programming"



*   Creating **Bag of Words** for **documents**
*   Creating **Bag of Words** for a **query**



In [None]:
# Build one bag of words per text: word_tokenize splits the string into tokens
# and Counter tallies how many times each token occurs.
# Tokens are kept exactly as written (case and punctuation included), so the
# query token 'javascript' is a different key from the document token 'JavaScript'.
bow_documents = [Counter(word_tokenize(doc)) for doc in documents]
print("bow_documents:", bow_documents)

bow_query = Counter(word_tokenize(query))
print("bow_query:", bow_query)

bow_documents: [Counter({',': 2, 'and': 2, 'I': 1, 'love': 1, 'programming': 1, 'in': 1, 'Python': 1, 'JAVA': 1, 'JavaScript': 1, 'because': 1, 'these': 1, 'languages': 1, 'are': 1, 'versatile': 1, 'powerful': 1, '.': 1}), Counter({'Python': 1, 'is': 1, 'great': 1, 'for': 1, 'data': 1, 'science': 1, 'and': 1, 'automation': 1}), Counter({'Python': 1, 'programming': 1, 'is': 1, 'fun': 1, 'and': 1, 'incredibly': 1, 'rewarding': 1})]
bow_query: Counter({'Python': 1, 'or': 1, 'javascript': 1, 'programming': 1})




*   Use **set()** to create **a set** that contains all the unique keys from **bow_query**.
*   Updating the set with keys from **bow_documents**
*   **Sorting** the set





In [None]:
# Start the vocabulary with every distinct token of the query ...
all_words = set(bow_query.keys())
print("all_words before update:", all_words)

# ... then add every distinct token that appears in any document.
for doc in bow_documents:
    all_words.update(doc.keys())
print("all_words after update:", all_words)

# A set has no defined order; sorting produces a list with a fixed order so that
# every word maps to the same vector position for all documents and the query.
all_words = sorted(all_words)
print("sort all_words:", all_words)

all_words before update: {'or', 'javascript', 'programming', 'Python'}
all_words after update: {'incredibly', 'javascript', 'Python', 'because', 'for', 'data', 'or', 'great', 'powerful', 'science', 'these', 'I', 'love', 'is', 'programming', 'and', 'JAVA', ',', 'in', 'versatile', 'automation', '.', 'are', 'rewarding', 'JavaScript', 'fun', 'languages'}
sort all_words: [',', '.', 'I', 'JAVA', 'JavaScript', 'Python', 'and', 'are', 'automation', 'because', 'data', 'for', 'fun', 'great', 'in', 'incredibly', 'is', 'javascript', 'languages', 'love', 'or', 'powerful', 'programming', 'rewarding', 'science', 'these', 'versatile']


[What is NumPy?](https://www.w3schools.com/python/numpy/numpy_intro.asp)

NumPy aims to provide an array object that is up to 50x faster than traditional Python lists.

*   Creating vectors for documents
*   Creating a query vector



In [None]:
def bow_to_vector(bow, all_words):
    """Turn a bag of words into a count vector aligned with ``all_words``.

    Args:
        bow: Mapping from word to count; only its ``get(word, default)``
            method is used.
        all_words: Iterable of words that fixes the order of the vector entries.

    Returns:
        A NumPy array with one entry per word in ``all_words``: the word's
        count in ``bow``, or 0 when the word is not present.
    """
    counts = []
    for term in all_words:
        # Words missing from this bag of words contribute a zero entry.
        counts.append(bow.get(term, 0))
    return np.array(counts)

# Project the query and every document onto the same vocabulary axis so that
# their count vectors are directly comparable.
query_vector = bow_to_vector(bow_query, all_words)
vectors = [
    bow_to_vector(doc_bow, all_words)
    for doc_bow in bow_documents
]
print("vectors:", vectors)
print("query_vector:", query_vector)

vectors: [array([2, 1, 1, 1, 1, 1, 2, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1]), array([0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0]), array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0])]
query_vector: [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0]


**scikit-learn** (sklearn) is a widely-used machine learning **library** for Python.

**sklearn.metrics.pairwise** is a module in the **scikit-learn library** that provides functions to compute pairwise distances or similarities between samples.

[**cosine_similarity**](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html) is a metric used to measure the similarity between two vectors.





*   Compute similarity scores
*   Rank document indices



In [None]:
# cosine_similarity takes 2-D inputs, so the single query vector is wrapped in a
# list; the result has one row (the query) and one column per document.
similarity_scores = cosine_similarity([query_vector], vectors)
print("similarity_scores:", similarity_scores)

# argsort orders the query row ascending; reversing it ranks the documents from
# most to least similar.
ranked_indices = similarity_scores[0].argsort()[::-1]
print("ranked_indices:", ranked_indices)

# Report the documents best match first, each with its similarity score.
for idx in ranked_indices:
    print(f"Document: {documents[idx]}, Similarity: {similarity_scores[0][idx]}")

similarity_scores: [[0.21320072 0.1767767  0.37796447]]
ranked_indices: [2 0 1]
Document: Python programming is fun and incredibly rewarding, Similarity: 0.3779644730092272
Document: I love programming in Python, JAVA, and JavaScript because these languages are versatile and powerful., Similarity: 0.21320071635561041
Document: Python is great for data science and automation, Similarity: 0.17677669529663687
