Step 1: Importing Necessary Libraries, Loading, and Cleaning Documents

In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import os
import string
import logging
import re
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# English stop words to drop during cleaning, minus 'and', 'or' and 'not',
# which are deliberately kept as real tokens (presumably so boolean-style
# words survive in queries and documents — TODO confirm intent).
# Set difference is used instead of .remove() so a stop-word list that happens
# to lack one of these entries does not raise KeyError.
STOPWORDS = set(stopwords.words('english')) - {'and', 'or', 'not'}
# Shared lemmatizer instance reused for every token in clean_text.
LEMMATIZER = WordNetLemmatizer()

def load_documents(directory, encoding="utf-8"):
    """Read every ``.txt`` file located directly inside ``directory``.

    Args:
        directory: Path of the folder to scan (sub-folders are not searched).
        encoding: Text encoding used to decode each file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        dict mapping each file name (not the full path) to that file's text,
        inserted in sorted file-name order.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid text in ``encoding``.
    """
    documents = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the order of
    # the returned mapping reproducible from run to run.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip directories that merely happen to be named "*.txt".
        if filename.endswith(".txt") and os.path.isfile(path):
            with open(path, 'r', encoding=encoding) as file:
                documents[filename] = file.read()
    return documents

# NOTE(review): 'path_to_documents' looks like a placeholder — point it at the
# directory that actually holds the .txt corpus before running.
documents = load_documents('path_to_documents')

def clean_text(text):
    """Turn raw text into a list of lemmatized, stop-word-free tokens.

    The text is lower-cased, every run of non-word characters is collapsed
    into a single space, and the result is tokenized with NLTK. Tokens found
    in STOPWORDS are discarded; each remaining token is lemmatized.

    Args:
        text: The raw string to clean.

    Returns:
        list of lemmatized tokens, in their original order.
    """
    normalized = re.sub(r'\W+', ' ', text.lower())
    cleaned = []
    for word in word_tokenize(normalized):
        # The stop-word check is applied to the raw token, before lemmatizing.
        if word in STOPWORDS:
            continue
        cleaned.append(LEMMATIZER.lemmatize(word))
    return cleaned

# Tokenize every loaded document once; keys remain the original file names.
cleaned_documents = {filename: clean_text(content) for filename, content in documents.items()}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Step 2: Creating Vocabulary

In [2]:
def create_vocabulary(cleaned_documents):
    """Collect the distinct tokens that occur in any document.

    Args:
        cleaned_documents: Mapping of document name to its iterable of tokens.

    Returns:
        set containing every token that appears in at least one document.
    """
    return {
        token
        for doc_tokens in cleaned_documents.values()
        for token in doc_tokens
    }

# Distinct tokens across the whole corpus.
vocabulary = create_vocabulary(cleaned_documents)

Step 3: Computing Term Frequency

In [3]:
def compute_term_frequency(cleaned_documents):
    """Count raw token occurrences for each document.

    Args:
        cleaned_documents: Mapping of document name to its list of tokens.

    Returns:
        defaultdict(Counter) mapping each document name to a Counter of its
        tokens. Looking up an unknown name yields (and stores) an empty Counter.
    """
    per_document = defaultdict(Counter)
    per_document.update(
        (name, Counter(doc_tokens))
        for name, doc_tokens in cleaned_documents.items()
    )
    return per_document

# Per-document token counts, keyed by file name.
term_frequency = compute_term_frequency(cleaned_documents)

Step 4: Computing Inverse Document Frequency

In [4]:
import math

def compute_inverse_document_frequency(cleaned_documents):
    """Compute the unsmoothed IDF weight ``log(N / df)`` for every token.

    ``N`` is the number of documents and ``df`` the number of documents that
    contain the token at least once, so a token present in every document
    gets a weight of 0.

    Args:
        cleaned_documents: Mapping of document name to its list of tokens.

    Returns:
        dict mapping each token to its IDF weight (natural logarithm).
    """
    total_docs = len(cleaned_documents)
    # De-duplicate tokens per document so each document counts once per token.
    doc_freq = Counter(
        term
        for doc_tokens in cleaned_documents.values()
        for term in set(doc_tokens)
    )
    return {
        term: math.log(total_docs / count)
        for term, count in doc_freq.items()
    }

# One IDF weight per token seen anywhere in the corpus.
idf = compute_inverse_document_frequency(cleaned_documents)

Step 5: Computing TF-IDF for Documents

In [5]:
def compute_tf_idf(term_frequency, idf):
    """Weight each document's raw term counts by the corpus IDF values.

    Args:
        term_frequency: Mapping of document name to a mapping of term -> count.
        idf: Mapping of term -> IDF weight; must contain every term that
            appears in ``term_frequency``.

    Returns:
        defaultdict(dict) mapping each document name to ``{term: count * idf}``.
        Every document in ``term_frequency`` is present, including documents
        with no terms (mapped to an empty dict).

    Raises:
        KeyError: If a term in ``term_frequency`` is missing from ``idf``.
    """
    tf_idf = defaultdict(dict)
    for filename, tf in term_frequency.items():
        # Assign the whole row at once so a document with no terms still gets
        # an (empty) entry instead of being silently dropped from the result.
        tf_idf[filename] = {term: freq * idf[term] for term, freq in tf.items()}
    return tf_idf

# Per-document TF-IDF weights: the document vectors that queries are compared against.
tf_idf = compute_tf_idf(term_frequency, idf)

Step 6: Computing TF-IDF for Queries

In [6]:
def clean_query(query):
    """Tokenize a query string by delegating to ``clean_text``.

    Returns whatever ``clean_text`` returns for ``query``, so queries go
    through exactly the same cleaning as documents.
    """
    return clean_text(query)

In [7]:
def compute_query_tf_idf(query, idf):
    """Build the TF-IDF vector of a free-text query.

    Args:
        query: Raw query string; it is cleaned with ``clean_query``.
        idf: Mapping of term -> IDF weight computed from the corpus.

    Returns:
        dict mapping each query term to ``count * idf``. Terms that are absent
        from ``idf`` get a weight of 0.
    """
    term_counts = Counter(clean_query(query))
    weighted = {}
    for term, count in term_counts.items():
        # Unknown terms fall back to an IDF of 0 and therefore weigh nothing.
        weighted[term] = count * idf.get(term, 0)
    return weighted

Step 7: Computing Cosine Similarity Between Documents and Queries

In [8]:
def cosine_similarity(vec_a, vec_b):
    """Cosine similarity of two sparse vectors stored as ``{term: weight}``.

    Args:
        vec_a: First vector, as a mapping of term -> numeric weight.
        vec_b: Second vector, in the same form.

    Returns:
        The dot product over the shared terms divided by the product of the
        two Euclidean norms, or 0.0 when that product is zero (for example
        when either vector is empty or all of its weights are 0).
    """
    norm_a = math.sqrt(sum(weight ** 2 for weight in vec_a.values()))
    norm_b = math.sqrt(sum(weight ** 2 for weight in vec_b.values()))
    norm_product = norm_a * norm_b

    # Guard against division by zero for empty or all-zero vectors.
    if not norm_product:
        return 0.0

    # Only terms present in both vectors contribute to the dot product.
    shared_terms = set(vec_a) & set(vec_b)
    dot_product = sum(vec_a[term] * vec_b[term] for term in shared_terms)
    return dot_product / norm_product

Step 8: Example Queries

In [9]:
queries = ["storm heart", "robert", "sun"]

# Turn each query string into a TF-IDF vector using the corpus IDF table.
query_tf_idf = {}
for query in queries:
    query_tf_idf[query] = compute_query_tf_idf(query, idf)

# Score every (query, document) pair; results are keyed by query, then by
# document name.
cosine_similarities_queries = defaultdict(dict)
for query in query_tf_idf:
    query_vector = query_tf_idf[query]
    for doc_name in tf_idf:
        doc_vector = tf_idf[doc_name]
        similarity = cosine_similarity(query_vector, doc_vector)
        cosine_similarities_queries[query][doc_name] = similarity

Step 9: Writing Results to a Text File

In [10]:
# Write the per-query rankings to a text report. The file is opened as UTF-8
# explicitly so queries or document names containing non-ASCII characters are
# written the same way on every platform instead of depending on the locale.
with open("result_soniya.txt", "w", encoding="utf-8") as result_file:
    result_file.write("\nCosine Similarities (Queries to Documents):\n")

    for query, similarities in cosine_similarities_queries.items():
        result_file.write(f"\nRanked Cosine Similarities for Query: '{query}':\n")

        # Highest similarity first; sorted() is stable, so documents with equal
        # scores keep the order in which they were scored.
        ranked_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

        # One line per document, numbered from rank 1.
        for rank, (doc_name, similarity) in enumerate(ranked_similarities, start=1):
            result_file.write(f"Rank {rank}: Cosine similarity between Query: '{query}' and Document: '{doc_name}': {similarity}\n")