In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import logging
import math
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

In [4]:
def load_text_files(folder_path):
    data = {}
    doc_id_to_filename = {}
    doc_id = 0
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                data[doc_id] = file.read()
                doc_id_to_filename[doc_id] = filename
                logging.info(f"Loaded file: {filename} with doc_id: {doc_id}")
                doc_id += 1
    return data, doc_id_to_filename

In [5]:
# Removes special characters from text, tokenizes it, eliminates stop words, and lemmatizes it.
def tokenize(text):
  tokens = text.lower()
  text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
  tokens = text.split()
  cleaned_tokens = [LEMMATIZER.lemmatize(word) for word in tokens if word not in STOPWORDS]
  return cleaned_tokens


In [6]:
def term_frequency(term, document):
    return document.count(term) / len(document)

In [7]:
def inverse_document_frequency(term, all_documents):
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(len(all_documents) / (1 + num_docs_containing_term))

In [8]:
def compute_tfidf(document, all_documents, vocab):
    tfidf_vector = []
    for term in vocab:
        tf = term_frequency(term, document)
        idf = inverse_document_frequency(term, all_documents)
        tfidf_vector.append(tf * idf)
    return np.array(tfidf_vector)

In [9]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2) if norm_vec1 * norm_vec2 != 0 else 0

In [10]:
import os
import logging

def main():
    folder_path = "/content/drive/MyDrive/TECH 400 Information Retrieval/cars"

    # Load documents
    docs, doc_id_to_filename = load_text_files(folder_path)

    # Tokenize the documents
    tokenized_docs = [tokenize(doc) for doc in docs.values()]

    # Create vocabulary
    vocab = sorted(set(word for doc in tokenized_docs for word in doc))
    logging.info(f"Vocabulary size: {len(vocab)}")

    # Compute TF-IDF vectors for each document
    doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]

    results = []

    # Start query processing loop
    while True:
        # Ask user for a query
        query = input("Enter a query (or 'exit' to quit): ").strip()
        if query.lower() == 'exit':
            break

        tokenized_query = tokenize(query)
        query_tfidf_vector = compute_tfidf(tokenized_query, tokenized_docs, vocab)

        similarities = []

        # Compute cosine similarity between the query and all documents
        for doc_id, doc_vector in enumerate(doc_tfidf_vectors):
            similarity = cosine_similarity(query_tfidf_vector, doc_vector)
            similarities.append((doc_id, similarity))

        # Sort the documents based on similarity in descending order
        similarities.sort(key=lambda x: x[1], reverse=True)

        # Keep only the top 5 most similar documents
        top_5_similarities = similarities[:5]

        # Append the query and its top 5 results to the final results list
        results.append((query, top_5_similarities))

        # Display the results for the current query
        print(f"\nTop 5 results for query: '{query}'")
        for doc_id, similarity in top_5_similarities:
            filename = doc_id_to_filename[doc_id]
            print(f"  Document: {filename}, Similarity: {similarity:.4f}")
        print("\n")

    # Define the output path and filename
    path = "/content/drive/MyDrive/TECH 400 Information Retrieval/result"
    output_file = os.path.join(path, "results_Nisha.txt")

    # Write results to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        for query, similarities in results:
            f.write(f"Query: {query}\n")
            for doc_id, similarity in similarities:
                filename = doc_id_to_filename[doc_id]
                f.write(f"  Document: {filename}, Similarity: {similarity:.4f}\n")
            f.write("\n")

    logging.info(f"Results written to {output_file}")

if __name__ == "__main__":
    main()



Top 5 results for query: 'turbocharged'
  Document: Genesis GV80.txt, Similarity: 0.0688
  Document: Subaru Outback Wilderness.txt, Similarity: 0.0665
  Document: Volvo XC90 Recharge.txt, Similarity: 0.0652
  Document: Jeep Grand Cherokee 4xe.txt, Similarity: 0.0599
  Document: Lexus RX 500h.txt, Similarity: 0.0573



Top 5 results for query: 'eco-friendly hatchbacks'
  Document: BMW iX.txt, Similarity: 0.0854
  Document: Jeep Grand Cherokee 4xe.txt, Similarity: 0.0800
  Document: Lexus RX 500h.txt, Similarity: 0.0766
  Document: Tesla Model S Plaid.txt, Similarity: 0.0000
  Document: Ford F-150 Lightning.txt, Similarity: 0.0000



Top 5 results for query: 'hybrid vehicles'
  Document: Volvo XC90 Recharge.txt, Similarity: 0.1356
  Document: Jeep Grand Cherokee 4xe.txt, Similarity: 0.0903
  Document: Lexus RX 500h.txt, Similarity: 0.0864
  Document: BMW iX.txt, Similarity: 0.0732
  Document: Porsche Taycan Turbo S.txt, Similarity: 0.0673



Top 5 results for query: 'sports cars under $