In [3]:
# Import the necessary libraries
import nltk
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
import string
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score

from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\rauna\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rauna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rauna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Read the document from the Shakespeare's Macbeth corpus
document = gutenberg.raw("shakespeare-macbeth.txt")

# Convert the document to lowercase
document = document.lower()

# Tokenize the document into sentences
#sentences = nltk.sent_tokenize(document)
# Tokenize the document into paragraphs
sentences = nltk.tokenize.blankline_tokenize(document)
# Create a SnowballStemmer object
stemmer = SnowballStemmer('english')
# Remove stopwords, punctuation, and any other irrelevant tokens
# Preprocess each sentence and store the result in a list of documents
stopwords = nltk.corpus.stopwords.words('english')
documents = []
punctuation = list(string.punctuation)

for sentence in sentences:
    #Tokenize the sentence into words
    words = nltk.word_tokenize(sentence)
    filtered_words = [word for word in words if word not in punctuation not in stopwords and word.isalpha() ]
    #print(filtered_words)
    #Stem the words
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    document = ' '.join(filtered_words)
    #print(document)
    documents.append(document)
   
len(documents)

678

In [23]:
# Create a vocabulary of terms from the documents
# Use a TfidfVectorizer to create a sparse matrix of term frequencies and inverse document frequencies
# Store the vocabulary as a list of terms
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
print("tfidf_matrix shape is", tfidf_matrix.shape)
vocabulary = vectorizer.get_feature_names_out()

# Compute the TF and IDF matrices for the documents
# Use the TfidfVectorizer attributes to access the TF and IDF matrices
# Convert them to numpy arrays for easier manipulation
tf_matrix = tfidf_matrix.toarray()
idf_matrix = vectorizer.idf_.reshape(-1, 1)

# Define a query or a set of queries
queries = [
   "three witches magic death thunder tragic",
   "knife wound red blood witch",
   "Ruthless ambition king prophecies beast unmake fair"
]

# Preprocess the queries
# Apply the same steps as the documents
# Store the processed queries in a new list
processed_queries = []
for query in queries:
    tokens = nltk.word_tokenize(query.lower())
    stemmed = [stemmer.stem(token) for token in tokens if token not in punctuation not in stopwords and token.isalpha()]
    processed_queries.append(' '.join(stemmed))

# Define the parameters for the BM25 model
k1 = 1.2
b = 0.75
epsilon = 0.25

# Compute the average document length
# Use the numpy function to calculate the mean of the sum of the term frequencies for each document
avg_doc_len = np.mean(np.sum(tf_matrix, axis=1))

# Compute the BM25 matrix for the documents
# Store the BM25 matrix as a numpy array
bm25_matrix = np.zeros_like(tf_matrix)
for i in range(len(documents)):
    doc_len = np.sum(tf_matrix[i])
    for j in range(len(vocabulary)):
        tf = tf_matrix[i][j]
        idf = idf_matrix[j]
        numerator = tf * (k1 + 1)
        denominator = tf + k1 * (1 - b + b * doc_len / avg_doc_len)
        bm25_matrix[i][j] = idf * (numerator / (denominator + epsilon))

# Compute the similarity scores between the query and each document using the TF-IDF model
# Use the cosine similarity function from sklearn to calculate the dot product of the query vector and the document vector
# Store the similarity scores in a dictionary, where the key is the query and the value is a list of scores
tfidf_similarity_scores = {}
for query in processed_queries:
    query_vector = vectorizer.transform([query]).toarray()
    scores = cosine_similarity(query_vector, tf_matrix)[0]
    tfidf_similarity_scores[query] = scores

# Compute the similarity scores between the query and each document using the BM25 model
# Use the same cosine similarity function as before
# Store the similarity scores in a dictionary, where the key is the query and the value is a list of scores
bm25_similarity_scores = {}
for query in processed_queries:
    query_vector = vectorizer.transform([query]).toarray()
    scores = cosine_similarity(query_vector, bm25_matrix)[0]
    bm25_similarity_scores[query] = scores

# Rank the documents by their similarity scores using the TF-IDF model
# Use the sorted function to sort the scores in descending order
# Store the ranked documents in a dictionary, where the key is the query and the value is a list of document indices
tfidf_ranked_documents = {}
for query, scores in tfidf_similarity_scores.items():
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    tfidf_ranked_documents[query] = ranked_indices

# Rank the documents by their similarity scores using the BM25 model
# Use the same sorted function as before
# Store the ranked documents in a dictionary, where the key is the query and the value is a list of document indices
bm25_ranked_documents = {}
for query, scores in bm25_similarity_scores.items():
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    bm25_ranked_documents[query] = ranked_indices

# Return the top k documents for each query using the TF-IDF model
# Define the value of k as the number of documents to return
# Print the query and the top k documents with their scores
k = 5
print("TF-IDF results:")
for query, indices in tfidf_ranked_documents.items():
    print(f"Query: {query}")
    print(f"Top {k} documents:")
    for i in range(k):
        index = indices[i]
        score = tfidf_similarity_scores[query][index]
        document = documents[index]
        print(f"{i+1}. {document} (score: {score:.4f})")
    print()

# Return the top k documents for each query using the BM25 model
# Print the query and the top k documents with their scores
print("BM25 results:")
for query, indices in bm25_ranked_documents.items():
    print(f"Query: {query}")
    print(f"Top {k} documents:")
    for i in range(k):
        index = indices[i]
        score = bm25_similarity_scores[query][index]
        document = documents[index]
        print(f"{i+1}. {document} (score: {score:.4f})")
    print()

tfidf_matrix shape is (678, 3310)


  bm25_matrix[i][j] = idf * (numerator / (denominator + epsilon))


TF-IDF results:
Query: three witch magic death thunder tragic
Top 5 documents:
1. thunder enter the three witches (score: 0.4877)
2. thunder enter the three witches (score: 0.4877)
3. thunder and lightning enter three witches (score: 0.4056)
4. thunder enter the three witches meeting hecat (score: 0.3677)
5. enter three murtherers (score: 0.2648)

Query: knife wound red blood witch
Top 5 documents:
1. that my keene knife see not the wound it makes nor heauen peepe through the blanket of the darke to cry hold hold enter macbeth (score: 0.2415)
2. macb it will haue blood they say blood will haue blood stones haue beene knowne to moue trees to speake augures and vnderstood relations haue by maggot pyes choughes rookes brought forth the man of blood what is the night la almost at oddes with morning which is which (score: 0.1492)
3. macb whence is that knocking how with me when euery noyse appalls me what hands are here hah they pluck out mine eyes will all great neptunes ocean wash this bl