In [111]:
#Boolean retrieval model
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the text collection
docs = [
    "Every year Maha Shivratri is celebrated with a lot of pomp and grandeur. It is considered to be a very special time of the year since millions of people celebrate this momentous occasion with a lot of fervour and glee.",
    "Lord Shiva devotees celebrate this occasion with a lot of grandness. It is accompanied by folk dances, songs, prayers, chants, mantras etc. This year, the beautiful occasion of Maha Shivratri will be celebrated on February 18. People keep a fast on this Maha shivratri, stay awake at night and pray to the lord for blessings, happiness, hope and prosperity. This festival holds a lot of significance and is considered to be one of the most important festivals in India.",
    "The festival of Maha Shivratri will be celebrated on February 18 and is a very auspicious festival. This Hindu festival celebrates the power of Lord Shiva. Lord Shiva protects his devotees from negative and evil spirits. He is the epitome of powerful and auspicious energy."
]

# Define the TF-IDF vectorizer and fit it to the text collection
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(docs)

# Convert the matrix to a pandas DataFrame for easy viewing
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.vocabulary_.keys())
print(tfidf_df)


      every      year      maha  shivratri        is  celebrated      with  \
0  0.000000  0.000000  0.210618   0.000000  0.000000    0.000000  0.105309   
1  0.089086  0.117137  0.207550   0.117137  0.000000    0.117137  0.138366   
2  0.116167  0.000000  0.270642   0.000000  0.305491    0.000000  0.090214   

        lot        of      pomp  ...  protects       his      from  negative  \
0  0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.105309  0.105309   
1  0.117137  0.117137  0.117137  ...  0.000000  0.117137  0.207550  0.276733   
2  0.000000  0.000000  0.000000  ...  0.152746  0.000000  0.270642  0.090214   

       evil   spirits        he   epitome  powerful    energy  
0  0.178303  0.135604  0.135604  0.000000  0.271208  0.271208  
1  0.000000  0.178172  0.000000  0.089086  0.089086  0.089086  
2  0.000000  0.000000  0.116167  0.116167  0.000000  0.000000  

[3 rows x 82 columns]


In [112]:
from typing import Dict, List, Set


def create_posting_lists(documents: Dict[str, str]) -> Dict[str, Dict[str, List[int]]]:
    """
    Create posting lists for each term in the given documents.
    """
    index = {}
    for doc_id, doc_text in documents.items():
        for position, term in enumerate(doc_text.split()):
            if term not in index:
                index[term] = {}
            if doc_id not in index[term]:
                index[term][doc_id] = []
            index[term][doc_id].append(position)
    return index


def proximity_query(query: str, proximity: int, index: Dict[str, Dict[str, List[int]]]) -> List[str]:
    """
    Find documents where the terms in the query appear within the specified proximity of each other.
    """
    query_terms = query.split()
    positions = {}
    for term in query_terms:
        positions[term] = set(index[term].keys())

    # Find all document pairs that contain the query terms
    pairs = set()
    for term1 in query_terms:
        for term2 in query_terms:
            if term1 != term2:
                pairs |= set(index[term1].keys()) & set(index[term2].keys())

    # Check the proximity of each pair of terms in each document
    results = []
    for doc_id in pairs:
        doc_positions = {term: index[term][doc_id] for term in query_terms if doc_id in index[term]}
        for i in range(len(doc_positions[query_terms[0]])):
            start = doc_positions[query_terms[0]][i]
            for j in range(1, len(query_terms)):
                term = query_terms[j]
                if term not in doc_positions:
                    break
                positions = [pos for pos in doc_positions[term] if pos > start]
                if not positions:
                    break
                start = positions[0]
                if start - doc_positions[query_terms[0]][i] > proximity:
                    break
            else:
                results.append(doc_id)
                break

    return results
documents = {
    "doc1": "People keep a fast on this Maha shivratri, stay awake at night and pray to the lord for blessings, happiness, hope and prosperity.",
    "doc2": "The festival of Maha Shivratri will be celebrated on February 18 and is a very auspicious festival. This Hindu festival celebrates the power of Lord Shiva. Lord Shiva protects his devotees from negative and evil spirits. He is the epitome of powerful and auspicious energy."
}

index = create_posting_lists(documents)

results = proximity_query("hope and prosperity.", 10, index)
print(results)

['doc1']


In [128]:
import nltk

# Define the corpus of documents to be searched
corpus = [
    "Every year Maha Shivratri is celebrated with a lot of pomp and grandeur. It is considered to be a very special time of the year since millions of people celebrate this momentous occasion with a lot of fervour and glee. Lord Shiva devotees celebrate this occasion with a lot of grandness. It is accompanied by folk dances, songs, prayers, chants, mantras etc. This year, the beautiful occasion of Maha Shivratri will be celebrated on February 18. People keep a fast on this Maha shivratri, stay awake at night and pray to the lord for blessings, happiness, hope and prosperity. This festival holds a lot of significance and is considered to be one of the most important festivals in India.",
    "The festival of Maha Shivratri will be celebrated on February 18 and is a very auspicious festival. This Hindu festival celebrates the power of Lord Shiva. Lord Shiva protects his devotees from negative and evil spirits. He is the epitome of powerful and auspicious energy.",
]

# Define the Boolean retrieval model query
query = "(Maha AND Shivratri) AND (stay AND awake AND night OR blessings)"

# Define the NLTK Porter Stemmer for word stemming
porter = nltk.PorterStemmer()

# Define the inverted index with Boolean weights
inverted_index = {}
for i, doc in enumerate(corpus):
    tokens = nltk.word_tokenize(doc.lower())
    for token in tokens:
        stemmed_token = porter.stem(token)
        if stemmed_token not in inverted_index:
            inverted_index[stemmed_token] = [0] * len(corpus)
        inverted_index[stemmed_token][i] = 1

# Tokenize and stem the query
query_tokens = nltk.word_tokenize(query.lower())
query_stemmed = [porter.stem(word) for word in query_tokens]

# Perform Boolean retrieval on the corpus using the query and the inverted index
matches = []
operator = "and"
for i, doc in enumerate(corpus):
    matches_doc = True
    for query_term in query_stemmed:
        if query_term.startswith("("):
            operator = query_term[1:-1]
        elif query_term == "AND":
            operator = "and"
        elif query_term == "OR":
            operator = "or"
        else:
            inverted_list = inverted_index.get(query_term, [0] * len(corpus))
            if operator == "not":
                if inverted_list[i] == 1:
                    matches_doc = False
                    break
            elif operator == "and":
                if inverted_list[i] != 1:
                    matches_doc = False
                    break
            elif operator == "or":
                if inverted_list[i] == 1:
                    break
            else:
                print(f"Unrecognized operator: {operator}")
                break
    if matches_doc:
        matches.append(i)

# Print the matching documents
if len(matches) > 0:
    print("Matching documents:")
    for match in matches:
        print(f"Document {match+1}: {corpus[match]}")
else:
    print("No matching documents found.")


Unrecognized operator: 
Unrecognized operator: 
Matching documents:
Document 1: Every year Maha Shivratri is celebrated with a lot of pomp and grandeur. It is considered to be a very special time of the year since millions of people celebrate this momentous occasion with a lot of fervour and glee. Lord Shiva devotees celebrate this occasion with a lot of grandness. It is accompanied by folk dances, songs, prayers, chants, mantras etc. This year, the beautiful occasion of Maha Shivratri will be celebrated on February 18. People keep a fast on this Maha shivratri, stay awake at night and pray to the lord for blessings, happiness, hope and prosperity. This festival holds a lot of significance and is considered to be one of the most important festivals in India.
Document 2: The festival of Maha Shivratri will be celebrated on February 18 and is a very auspicious festival. This Hindu festival celebrates the power of Lord Shiva. Lord Shiva protects his devotees from negative and evil spirits

In [115]:
#Vector space Retrieval model
#step 1 (Tokenization/Lemmatization) using Python's NLTK package
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Sample document
documents = "Every year Maha Shivratri is celebrated with a lot of pomp and grandeur. It is considered to be a very special time of the year since millions of people celebrate this momentous occasion with a lot of fervour and glee."

# Tokenization
tokens = word_tokenize(documents)

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(token) for token in tokens]

print(lemmas)


['Every', 'year', 'Maha', 'Shivratri', 'is', 'celebrated', 'with', 'a', 'lot', 'of', 'pomp', 'and', 'grandeur', '.', 'It', 'is', 'considered', 'to', 'be', 'a', 'very', 'special', 'time', 'of', 'the', 'year', 'since', 'million', 'of', 'people', 'celebrate', 'this', 'momentous', 'occasion', 'with', 'a', 'lot', 'of', 'fervour', 'and', 'glee', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [116]:
# step 2 (Vocabulary Creation)
# Sample documents
document1 = "Every year Maha Shivratri is celebrated with a lot of pomp and grandeur. It is considered to be a very special time of the year since millions of people celebrate this momentous occasion with a lot of fervour and glee."
document2 = "The festival of Maha Shivratri will be celebrated on February 18 and is a very auspicious festival. This Hindu festival celebrates the power of Lord Shiva. Lord Shiva protects his devotees from negative and evil spirits. He is the epitome of powerful and auspicious energy."
documents = [document1, document2]

# Tokenization and Lemmatization
tokens = []
lemmatizer = WordNetLemmatizer()
for doc in documents:
    doc_tokens = word_tokenize(doc)
    doc_lemmas = [lemmatizer.lemmatize(token) for token in doc_tokens]
    tokens.extend(doc_lemmas)

# Vocabulary Creation
vocabulary = list(set(tokens))

print(vocabulary)


['from', 'Every', 'glee', 'devotee', 'year', 'This', 'since', 'celebrated', 'grandeur', 'and', 'energy', 'the', 'epitome', 'celebrates', 'Shivratri', '18', 'power', 'very', 'celebrate', 'occasion', 'The', 'Lord', 'protects', 'It', 'considered', 'is', 'a', 'of', 'be', 'will', 'people', 'powerful', 'auspicious', 'special', 'festival', 'time', 'lot', 'pomp', 'Maha', 'with', 'this', 'Shiva', 'his', 'momentous', '.', 'fervour', 'million', 'Hindu', 'on', 'February', 'spirit', 'to', 'negative', 'He', 'evil']


In [117]:
# step 3 (Term Frequency Calculation)
# Sample documents
document1 = "Every year Maha Shivratri is celebrated with a lot of pomp and grandeur. It is considered to be a very special time of the year since millions of people celebrate this momentous occasion with a lot of fervour and glee."
document2 = "The festival of Maha Shivratri will be celebrated on February 18 and is a very auspicious festival. This Hindu festival celebrates the power of Lord Shiva. Lord Shiva protects his devotees from negative and evil spirits. He is the epitome of powerful and auspicious energy."
documents = [document1, document2]

# Tokenization and Lemmatization
tokens = []
lemmatizer = WordNetLemmatizer()
for doc in documents:
    doc_tokens = word_tokenize(doc)
    doc_lemmas = [lemmatizer.lemmatize(token) for token in doc_tokens]
    tokens.append(doc_lemmas)

# Vocabulary Creation
vocabulary = list(set([word for doc in tokens for word in doc]))

# Term Frequency Calculation
tf = []
for doc in tokens:
    doc_tf = {}
    for word in vocabulary:
        doc_tf[word] = doc.count(word) / len(doc)
    tf.append(doc_tf)

print(tf)


[{'from': 0.0, 'Every': 0.023809523809523808, 'glee': 0.023809523809523808, 'devotee': 0.0, 'year': 0.047619047619047616, 'This': 0.0, 'since': 0.023809523809523808, 'celebrated': 0.023809523809523808, 'grandeur': 0.023809523809523808, 'and': 0.047619047619047616, 'energy': 0.0, 'the': 0.023809523809523808, 'epitome': 0.0, 'celebrates': 0.0, 'Shivratri': 0.023809523809523808, '18': 0.0, 'power': 0.0, 'very': 0.023809523809523808, 'celebrate': 0.023809523809523808, 'occasion': 0.023809523809523808, 'The': 0.0, 'Lord': 0.0, 'protects': 0.0, 'It': 0.023809523809523808, 'considered': 0.023809523809523808, 'is': 0.047619047619047616, 'a': 0.07142857142857142, 'of': 0.09523809523809523, 'be': 0.023809523809523808, 'will': 0.0, 'people': 0.023809523809523808, 'powerful': 0.0, 'auspicious': 0.0, 'special': 0.023809523809523808, 'festival': 0.0, 'time': 0.023809523809523808, 'lot': 0.047619047619047616, 'pomp': 0.023809523809523808, 'Maha': 0.023809523809523808, 'with': 0.047619047619047616, 't

In [118]:
#Inverse Document Frequency (IDF)
import math
documents = [ "Every year Maha Shivratri is celebrated with a lot of pomp and grandeur. It is considered to be a very special time of the year since millions of people celebrate this momentous occasion with a lot of fervour and glee. Lord Shiva devotees celebrate this occasion with a lot of grandness. It is accompanied by folk dances, songs, prayers, chants, mantras etc. This year, the beautiful occasion of Maha Shivratri will be celebrated on February 18. People keep a fast on this Maha shivratri, stay awake at night and pray to the lord for blessings, happiness, hope and prosperity. This festival holds a lot of significance and is considered to be one of the most important festivals in India.",    "The festival of Maha Shivratri will be celebrated on February 18 and is a very auspicious festival. This Hindu festival celebrates the power of Lord Shiva. Lord Shiva protects his devotees from negative and evil spirits. He is the epitome of powerful and auspicious energy."]

def calculate_idf(documents):
    N = len(documents)
    idf = {}
    for doc in documents:
        for word in doc.split():
            if word not in idf:
                n = sum([1 for doc in documents if word in doc])
                idf[word] = math.log10(N/n)
    return idf

idf = calculate_idf(documents)
print(idf)


{'Every': 0.3010299956639812, 'year': 0.3010299956639812, 'Maha': 0.0, 'Shivratri': 0.0, 'is': 0.0, 'celebrated': 0.0, 'with': 0.3010299956639812, 'a': 0.0, 'lot': 0.3010299956639812, 'of': 0.0, 'pomp': 0.3010299956639812, 'and': 0.0, 'grandeur.': 0.3010299956639812, 'It': 0.3010299956639812, 'considered': 0.3010299956639812, 'to': 0.0, 'be': 0.0, 'very': 0.0, 'special': 0.3010299956639812, 'time': 0.3010299956639812, 'the': 0.0, 'since': 0.3010299956639812, 'millions': 0.3010299956639812, 'people': 0.3010299956639812, 'celebrate': 0.0, 'this': 0.3010299956639812, 'momentous': 0.3010299956639812, 'occasion': 0.3010299956639812, 'fervour': 0.3010299956639812, 'glee.': 0.3010299956639812, 'Lord': 0.0, 'Shiva': 0.0, 'devotees': 0.0, 'grandness.': 0.3010299956639812, 'accompanied': 0.3010299956639812, 'by': 0.3010299956639812, 'folk': 0.3010299956639812, 'dances,': 0.3010299956639812, 'songs,': 0.3010299956639812, 'prayers,': 0.3010299956639812, 'chants,': 0.3010299956639812, 'mantras': 0.

In [119]:
#Step 5: Calculate TF-IDF
import math

document = [
    "Every year Maha Shivratri is celebrated with a lot of pomp and grandeur. It is considered to be a very special time of the year since millions of people celebrate this momentous occasion with a lot of fervour and glee. Lord Shiva devotees celebrate this occasion with a lot of grandness. It is accompanied by folk dances, songs, prayers, chants, mantras etc. This year, the beautiful occasion of Maha Shivratri will be celebrated on February 18. People keep a fast on this Maha shivratri, stay awake at night and pray to the lord for blessings, happiness, hope and prosperity. This festival holds a lot of significance and is considered to be one of the most important festivals in India.",
    "The festival of Maha Shivratri will be celebrated on February 18 and is a very auspicious festival. This Hindu festival celebrates the power of Lord Shiva. Lord Shiva protects his devotees from negative and evil spirits. He is the epitome of powerful and auspicious energy."
]

def calculate_tf(document):
    tf = {}
    for word in document.split():
        if word not in tf:
            tf[word] = 0
        tf[word] += 1
    for word in tf:
        tf[word] = tf[word] / len(document.split())
    return tf

def calculate_idf(documents):
    N = len(documents)
    idf = {}
    for doc in documents:
        for word in doc.split():
            if word not in idf:
                n = sum([1 for doc in documents if word in doc.split()])
                idf[word] = math.log10(N/n)
    return idf

def calculate_tfidf(documents):
    tfidf = []
    idf = calculate_idf(documents)
    for doc in documents:
        doc_tfidf = {}
        tf = calculate_tf(doc)
        for word, freq in tf.items():
            doc_tfidf[word] = freq * idf[word]
        tfidf.append(doc_tfidf)
    return tfidf

tfidf = calculate_tfidf(documents)
print(tfidf)


[{'Every': 0.0025296638291090858, 'year': 0.0050593276582181716, 'Maha': 0.0, 'Shivratri': 0.0, 'is': 0.0, 'celebrated': 0.0, 'with': 0.007588991487327257, 'a': 0.0, 'lot': 0.010118655316436343, 'of': 0.0, 'pomp': 0.0025296638291090858, 'and': 0.0, 'grandeur.': 0.0025296638291090858, 'It': 0.0050593276582181716, 'considered': 0.0050593276582181716, 'to': 0.007588991487327257, 'be': 0.0, 'very': 0.0, 'special': 0.0025296638291090858, 'time': 0.0025296638291090858, 'the': 0.0, 'since': 0.0025296638291090858, 'millions': 0.0025296638291090858, 'people': 0.0025296638291090858, 'celebrate': 0.0050593276582181716, 'this': 0.007588991487327257, 'momentous': 0.0025296638291090858, 'occasion': 0.007588991487327257, 'fervour': 0.0025296638291090858, 'glee.': 0.0025296638291090858, 'Lord': 0.0, 'Shiva': 0.0, 'devotees': 0.0, 'grandness.': 0.0025296638291090858, 'accompanied': 0.0025296638291090858, 'by': 0.0025296638291090858, 'folk': 0.0025296638291090858, 'dances,': 0.0025296638291090858, 'song

In [120]:
#Step 6: Calculate Cosine Similarity
import numpy as np

query = "Maha Shivratri will be celebrated on February 18"
documents = [    "Every year Maha Shivratri is celebrated with a lot of pomp and grandeur. It is considered to be a very special time of the year since millions of people celebrate this momentous occasion with a lot of fervour and glee. Lord Shiva devotees celebrate this occasion with a lot of grandness. It is accompanied by folk dances, songs, prayers, chants, mantras etc. This year, the beautiful occasion of Maha Shivratri will be celebrated on February 18. People keep a fast on this Maha shivratri, stay awake at night and pray to the lord for blessings, happiness, hope and prosperity. This festival holds a lot of significance and is considered to be one of the most important festivals in India.",    "The festival of Maha Shivratri will be celebrated on February 18 and is a very auspicious festival. This Hindu festival celebrates the power of Lord Shiva. Lord Shiva protects his devotees from negative and evil spirits. He is the epitome of powerful and auspicious energy."]

def calculate_idf(documents):
    idf = {}
    for doc in documents:
        for word in doc.split():
            if word not in idf:
                idf[word] = 1
            else:
                idf[word] += 1
    for word in idf:
        idf[word] = np.log(len(documents) / idf[word])
    return idf

def calculate_tf(document):
    tf = {}
    for word in document.split():
        if word not in tf:
            tf[word] = 1
        else:
            tf[word] += 1
    return tf

def calculate_tfidf(documents):
    idf = calculate_idf(documents)
    tfidf = []
    for doc in documents:
        tf = calculate_tf(doc)
        doc_tfidf = {}
        for word in doc.split():
            doc_tfidf[word] = tf[word] * idf[word]
        tfidf.append(doc_tfidf)
    return tfidf

def cosine_similarity(query, documents):
    tfidf = calculate_tfidf(documents)
    query_tfidf = {word: 0 for doc in tfidf for word in doc}
    idf = calculate_idf(documents)
    for word in query.split():
        if word in idf:
            query_tfidf[word] = calculate_tf(query)[word] * idf[word]
    similarities = []
    for i, doc in enumerate(tfidf):
        doc_values = np.array(list(doc.values()))
        query_values = np.array([query_tfidf[word] for word in doc])
        similarity = np.dot(doc_values, query_values) / (np.linalg.norm(doc_values) * np.linalg.norm(query_values))
        similarities.append((i, similarity))
    return sorted(similarities, key=lambda x: x[1], reverse=True)


print(cosine_similarity(query, documents))



[(1, 0.17149463025263811), (0, 0.16526057755382573)]


In [121]:
#Step 7: Return Top 2 Relevant Documents
def search(query, documents):
    similarities = cosine_similarity(query, documents)
    top_2 = similarities[:2]
    results = []
    for i, similarity in top_2:
        results.append((documents[i], similarity))
    return results
print(search(query, documents))

[('The festival of Maha Shivratri will be celebrated on February 18 and is a very auspicious festival. This Hindu festival celebrates the power of Lord Shiva. Lord Shiva protects his devotees from negative and evil spirits. He is the epitome of powerful and auspicious energy.', 0.17149463025263811), ('Every year Maha Shivratri is celebrated with a lot of pomp and grandeur. It is considered to be a very special time of the year since millions of people celebrate this momentous occasion with a lot of fervour and glee. Lord Shiva devotees celebrate this occasion with a lot of grandness. It is accompanied by folk dances, songs, prayers, chants, mantras etc. This year, the beautiful occasion of Maha Shivratri will be celebrated on February 18. People keep a fast on this Maha shivratri, stay awake at night and pray to the lord for blessings, happiness, hope and prosperity. This festival holds a lot of significance and is considered to be one of the most important festivals in India.', 0.1652

In [122]:
#Step 8: Jaccard Similarity
def jaccard_similarity(query, documents):
    query_tokens = set(query.split())
    similarities = []
    for i, document in enumerate(documents):
        document_tokens = set(document.split())
        intersection = len(query_tokens.intersection(document_tokens))
        union = len(query_tokens.union(document_tokens))
        similarity = intersection / union
        similarities.append((i, similarity))
    return sorted(similarities, key=lambda x: x[1], reverse=True)

print(jaccard_similarity(query, documents))


[(1, 0.2222222222222222), (0, 0.09333333333333334)]
