In [None]:
# All imports live together at the top so the notebook's dependencies are
# visible at a glance and survive Restart Kernel -> Run All.
from collections import Counter

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

# Download required resources (a quick no-op when already up to date).
# Newer NLTK releases load the 'punkt_tab' tables in word_tokenize(), while
# older releases use 'punkt', so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stopwords and lemmatizer once; later cells reuse these objects.
# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Toy corpus: three short documents to rank against a single free-text query.
documents = [
    "I love programming in Python, JAVA, and JavaScript because these languages are versatile and powerful.",
    "Python is great for data science and automation",
    "Python programming is fun and incredibly rewarding",
]
query = "Python or javascript programming"

*   Use **word_tokenize()** to split the **text** into **tokens**.
*   The **.lower()** method converts each token to lowercase.
*   Create a list **alpha_only** that keeps only the tokens made up entirely of alphabetic characters, using the **.isalpha()** method.
*   Create a list called **no_stops** consisting of the words from **alpha_only** that are *not* English stop words.
*   Lemmatize all tokens in **no_stops** into a new list called **words_lemmatized**.

In [None]:
# Preprocessing function
def preprocess_text(text):
    """Convert raw text into a list of lemmatized, lowercase content words.

    The pipeline is: tokenize with ``word_tokenize``, lowercase every token,
    keep only tokens for which ``str.isalpha()`` is true (this drops
    punctuation and numbers), remove English stop words, and lemmatize what
    is left with the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        The document or query to preprocess.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order (duplicates kept).
    """
    tokens = word_tokenize(text)
    lower_tokens = [t.lower() for t in tokens]
    alpha_only = [t for t in lower_tokens if t.isalpha()]
    # Reuse the module-level `stop_words` set built in the setup cell rather
    # than calling stopwords.words('english') once per token.
    no_stops = [t for t in alpha_only if t not in stop_words]
    # Likewise reuse the shared `lemmatizer` instead of constructing a new
    # WordNetLemmatizer on every call.
    words_lemmatized = [lemmatizer.lemmatize(t) for t in no_stops]
    return words_lemmatized

*   Creating **Bag of Words** for **documents**
*   Creating **Bag of Words** for a **query**

In [None]:
# Tokenizing and preprocessing the documents and query.
# Counter maps each preprocessed word to the number of times it occurs,
# which is exactly the bag-of-words representation.
bow_documents = [Counter(preprocess_text(doc)) for doc in documents]
print("bow_documents:", bow_documents)

bow_query = Counter(preprocess_text(query))
print("bow_query:", bow_query)

bow_documents: [Counter({'love': 1, 'programming': 1, 'python': 1, 'java': 1, 'javascript': 1, 'language': 1, 'versatile': 1, 'powerful': 1}), Counter({'python': 1, 'great': 1, 'data': 1, 'science': 1, 'automation': 1}), Counter({'python': 1, 'programming': 1, 'fun': 1, 'incredibly': 1, 'rewarding': 1})]
bow_query: Counter({'python': 1, 'javascript': 1, 'programming': 1})



*   Use **set()** to create **a set** that contains all the unique keys from **bow_query**.
*   Update the set with the keys from each bag of words in **bow_documents**.
*   **Sort** the set into a list so that every word has a fixed position.

In [None]:
# Start the vocabulary from the query terms...
all_words = set(bow_query.keys())
print("all_words before update:", all_words)

# ...then add every term that occurs in any document.
for doc in bow_documents:
    all_words.update(doc.keys())

print("all_words after update:", all_words)

# Sets are unordered; sorting yields a list with a stable word -> position
# mapping, which the vectorization step relies on.
all_words = sorted(all_words)
print("sort all_words:", all_words)

all_words before update: {'javascript', 'python', 'programming'}
all_words after update: {'versatile', 'automation', 'rewarding', 'javascript', 'programming', 'love', 'powerful', 'fun', 'incredibly', 'java', 'data', 'science', 'great', 'language', 'python'}
sort all_words: ['automation', 'data', 'fun', 'great', 'incredibly', 'java', 'javascript', 'language', 'love', 'powerful', 'programming', 'python', 'rewarding', 'science', 'versatile']


*   Creating vectors for documents
*   Creating a query vector

In [None]:
def bow_to_vector(bow, all_words):
    """Lay a bag of words out as a count vector over a fixed vocabulary.

    Parameters
    ----------
    bow : mapping
        Word -> count mapping; only its ``.get`` method is used.
    all_words : iterable of str
        Vocabulary whose iteration order defines the vector positions.

    Returns
    -------
    numpy.ndarray
        One entry per vocabulary word: its count in ``bow``, or 0 if absent.
    """
    counts = []
    for term in all_words:
        # Words missing from this bag contribute a zero at their position.
        counts.append(bow.get(term, 0))
    return np.array(counts)

# Project the query and every document onto the same vocabulary axis so the
# resulting vectors are directly comparable.
query_vector = bow_to_vector(bow_query, all_words)
vectors = [bow_to_vector(doc_bow, all_words) for doc_bow in bow_documents]

print("vectors:", vectors)
print("query_vector:", query_vector)

vectors: [array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1]), array([1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]), array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0])]
query_vector: [0 0 0 0 0 0 1 0 0 0 1 1 0 0 0]



*   Compute similarity scores
*   Rank document indices

In [None]:
# cosine_similarity expects 2-D inputs, so the single query vector is wrapped
# in a list; the result has one row (the query) and one column per document.
similarity_scores = cosine_similarity([query_vector], vectors)
print("similarity_scores:", similarity_scores)

# argsort is ascending, so reverse it to list the best match first.
ranked_indices = similarity_scores[0].argsort()[::-1]
print("ranked_indices:", ranked_indices)

# Report the documents from most to least similar to the query.
for idx in ranked_indices:
    print(
        f"Document: {documents[idx]}, Similarity: {similarity_scores[0][idx]}"
    )

similarity_scores: [[0.61237244 0.25819889 0.51639778]]
ranked_indices: [0 2 1]
Document: I love programming in Python, JAVA, and JavaScript because these languages are versatile and powerful., Similarity: 0.6123724356957945
Document: Python programming is fun and incredibly rewarding, Similarity: 0.5163977794943223
Document: Python is great for data science and automation, Similarity: 0.25819888974716115
