In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files stored there. The google.colab module is only available on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import nltk
import re
import os
import logging
import math
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict, Counter

# Download the NLTK resources this notebook uses: the stopword lists, the
# 'punkt' tokenizer data and the WordNet corpus.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Module-level helpers used by text_cleaning(): the English stopwords as a
# set, and a single WordNetLemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def read_documents(path):
    """Load every ``.txt`` file directly inside ``path``.

    Files are processed in sorted filename order. ``os.listdir`` returns
    entries in arbitrary order, so sorting is what makes the assigned
    document ids reproducible from one run to the next.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A ``(content, doc_id_filename)`` tuple of dicts, both keyed by an
        integer document id starting at 0. ``content`` maps the id to the
        file's text and ``doc_id_filename`` maps it to the file name.

    Raises:
        OSError: If ``path`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    content = {}
    doc_id_filename = {}
    text_files = sorted(name for name in os.listdir(path) if name.endswith(".txt"))
    for doc_id, filename in enumerate(text_files):
        with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
            content[doc_id] = file.read()
        doc_id_filename[doc_id] = filename
        logging.info(f"Loaded file: {filename} with doc_id: {doc_id}")
    return content, doc_id_filename


 Tokenization

In [None]:
def tokenize(text):
    """Lower-case ``text`` and split it on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased tokens, or an empty list when ``text`` is not
        a ``str``.
    """
    if not isinstance(text, str):
        return []
    return text.lower().split()

Cleaning

In [None]:
def text_cleaning(text):
    """Turn raw text into a list of lemmatized, stopword-free tokens.

    The text is lower-cased, http/https URLs are removed, the remainder is
    split with ``word_tokenize``, tokens found in the module-level
    ``stop_words`` set are dropped, and each surviving token is passed
    through the module-level ``lemmatizer``.

    Args:
        text: The string to clean.

    Returns:
        The cleaned tokens as a list, in their original order.
    """
    without_urls = re.sub(r'http[s]?://\S+', '', text.lower())

    kept = []
    for word in word_tokenize(without_urls):
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return kept

Frequency Calculation

In [None]:
def term_frequency(term, document):
    """Return the relative frequency of ``term`` in ``document``.

    Args:
        term: The token to count.
        document: A list of tokens.

    Returns:
        ``document.count(term) / len(document)``, or ``0.0`` when the
        document is empty.
    """
    # An empty file or a blank query yields no tokens; without this guard
    # the division below raises ZeroDivisionError.
    if not document:
        return 0.0
    return document.count(term) / len(document)

In [None]:
def inverse_document_frequency(term, all_documents):
    """Return ``log(N / (1 + df))`` for ``term`` over ``all_documents``.

    ``N`` is the number of documents and ``df`` is how many of them contain
    ``term``. Because of the ``+ 1`` in the denominator, the result is 0
    when the term appears in all but one document and negative when it
    appears in every document.

    Args:
        term: The token to look up.
        all_documents: A sequence of tokenized documents.

    Returns:
        The natural-log IDF as a float.
    """
    containing = 0
    for doc in all_documents:
        if term in doc:
            containing += 1
    return math.log(len(all_documents) / (1 + containing))

In [None]:
def compute_tfidf(document, all_documents, vocab):
    """Build the TF-IDF vector of ``document`` over ``vocab``.

    Args:
        document: The tokenized document (or query) to vectorize.
        all_documents: The tokenized corpus used for the IDF part.
        vocab: Iterable of terms; the output has one entry per term, in the
            same order.

    Returns:
        A NumPy array whose i-th entry is
        ``term_frequency(vocab[i], document) *
        inverse_document_frequency(vocab[i], all_documents)``.
    """
    weights = [
        term_frequency(word, document) * inverse_document_frequency(word, all_documents)
        for word in vocab
    ]
    return np.array(weights)

In [None]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    A constant ``1e-8`` is added to the product of the norms, so an
    all-zero vector gives 0 rather than a division by zero.

    Args:
        vec1: First vector.
        vec2: Second vector, the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2|| + 1e-8)``.
    """
    magnitude_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return np.dot(vec1, vec2) / (magnitude_product + 1e-8)

In [None]:
def main():
    """Rank every document against each query by TF-IDF cosine similarity.

    Steps:
      1. Read the ``.txt`` documents from the Drive folder.
      2. Tokenize them with ``tokenize`` and build a sorted vocabulary.
      3. Compute one TF-IDF vector per document and per query.
      4. Sort the documents by descending cosine similarity for each query.
      5. Print the rankings and write them to the results file.
    """
    path = "/content/drive/MyDrive/Tech_cars"
    docs, doc_id_to_filename = read_documents(path)

    queries = [ " electric", " sedan", " car", "family car"]

    # Keep the ids next to the token lists so the rankings use the real
    # document ids instead of assuming they are exactly 0..n-1.
    doc_ids = list(docs.keys())
    tokenized_docs = [tokenize(docs[doc_id]) for doc_id in doc_ids]

    vocab = sorted(set(word for doc in tokenized_docs for word in doc))
    # logging.info() returns None, so printing its return value only ever
    # showed "None"; print the message itself instead.
    print(f"Vocabulary size: {len(vocab)}")

    doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]

    results = []
    for query in queries:
        tokenized_query = tokenize(query)
        query_tfidf_vector = compute_tfidf(tokenized_query, tokenized_docs, vocab)

        similarities = [
            (doc_id, cosine_similarity(query_tfidf_vector, doc_vector))
            for doc_id, doc_vector in zip(doc_ids, doc_tfidf_vectors)
        ]
        similarities.sort(key=lambda x: x[1], reverse=True)

        results.append((query, similarities))

    # Print once after all queries are scored; printing inside the loop
    # repeated the whole accumulated list on every iteration.
    print(results)

    result_path = "/content/drive/MyDrive/Tech400/results"
    # open(..., 'w') does not create missing parent directories.
    os.makedirs(result_path, exist_ok=True)
    output_file = os.path.join(result_path, "results_resha_maharjan.txt")
    with open(output_file, 'w', encoding='utf-8') as f:
        for query, similarities in results:
            f.write(f"Query: {query}\n")
            for doc_id, similarity in similarities:
                filename = doc_id_to_filename[doc_id]
                f.write(f"  Document: {filename}, Similarity: {similarity:.4f}\n")
            f.write("\n")

    # Log a single time, after the file has been fully written and closed.
    logging.info(f"Results written to {output_file}")

# Run the pipeline only when this code executes as the main module, not when
# it is imported.
if __name__ == "__main__":
    main()


None
[(' electric', [(0, 0.065458442965528), (4, 0.037985988096978916), (3, 0.037773195424594105), (5, 0.030146567959790007), (6, 0.028514332334712542), (1, 0.0), (2, 0.0), (7, 0.0), (8, 0.0), (9, 0.0)])]
[(' electric', [(0, 0.065458442965528), (4, 0.037985988096978916), (3, 0.037773195424594105), (5, 0.030146567959790007), (6, 0.028514332334712542), (1, 0.0), (2, 0.0), (7, 0.0), (8, 0.0), (9, 0.0)]), (' sedan', [(8, 0.11012353979271548), (5, 0.10815050649759479), (6, 0.10229487698065012), (0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (7, 0.0), (9, 0.0)])]
[(' electric', [(0, 0.065458442965528), (4, 0.037985988096978916), (3, 0.037773195424594105), (5, 0.030146567959790007), (6, 0.028514332334712542), (1, 0.0), (2, 0.0), (7, 0.0), (8, 0.0), (9, 0.0)]), (' sedan', [(8, 0.11012353979271548), (5, 0.10815050649759479), (6, 0.10229487698065012), (0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (7, 0.0), (9, 0.0)]), (' car', [(9, 0.09003668375064869), (4, 0.07597197619395783), (7, 0.042