In [2]:
import math
import os
import re

import numpy as np
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the input folder and the
# output report used later in this notebook both live under this mount point.
drive.mount("/content/drive")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load text files from the folder
def load_text_files(folder_path):
    """Read every ``.txt`` file directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (sub-directories are not descended into).

    Returns:
        A ``(data, doc_id_to_filename)`` tuple. ``data`` is a list holding the
        full text of each file; ``doc_id_to_filename`` maps each position in
        ``data`` to the name of the file that text was read from.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    data = []
    doc_id_to_filename = {}
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # document ids identical on every run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):  # Skip anything that is not a text file
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            text = file.read()
        # Key by position in `data` (not by directory-listing index) so the id
        # stays aligned with `data` even when non-.txt entries were skipped.
        doc_id_to_filename[len(data)] = filename
        data.append(text)
    return data, doc_id_to_filename

In [4]:
# Google Drive folder that holds the .txt documents to index
# (passed to load_text_files in the next cell).
folder_path = '/content/drive/MyDrive/document'

In [5]:
# Load dataset: `docs` is the list of raw document texts and
# `doc_id_to_filename` maps an integer document id to the file name it came from.
docs, doc_id_to_filename = load_text_files(folder_path)

In [6]:
# Free-text queries that the documents are ranked against.
# NOTE: the AND / OR keywords are not evaluated as boolean operators anywhere in
# this notebook; after tokenization they are ordinary lowercase terms
# ("and", "or") that take part in the TF-IDF weighting like any other word.
queries = [
    "Healthy eating AND disease prevention",
    "Vitamins in fruits AND immunity",
    "Whole grains: cholesterol OR sugar",
    "Plant OR animal proteins",
    "Processed food AND heart/weight"
]

# Preprocess documents and queries: lowercase and tokenize
def tokenize(text):
  """Lowercase ``text`` and split it into alphanumeric word tokens.

  Whitespace, punctuation and underscores all act as separators, so
  "grains:" yields "grains" and "heart/weight" yields "heart" and "weight".
  Splitting on whitespace alone would leave the punctuation attached, and such
  tokens neither match the bare word nor pass an ``isalpha`` vocabulary filter.
  Apostrophes are separators too ("don't" -> "don", "t").

  Args:
      text: The string to tokenize.

  Returns:
      List of lowercase tokens in order of appearance; empty for empty input.
  """
  # [^\W_]+ matches runs of letters/digits (i.e. \w without the underscore).
  return re.findall(r"[^\W_]+", text.lower())
# Run the same tokenizer over every document and every query
tokenized_docs = list(map(tokenize, docs))
tokenized_queries = list(map(tokenize, queries))

In [7]:
def clean_text(text):
    """Normalize ``text`` and return its cleaned, lemmatized tokens.

    Steps, in order: lowercase; delete every character that is not an ASCII
    letter, digit or whitespace; delete all digits; tokenize; drop tokens found
    in ``STOPWORDS``; lemmatize what remains.

    NOTE(review): ``word_tokenize``, ``LEMMATIZER`` and ``STOPWORDS`` are not
    defined in the notebook cells shown here (presumably NLTK objects - confirm)
    and must be imported/created before this function is called.

    Args:
        text: Raw input string.

    Returns:
        List of lemmatized tokens with stopwords removed.
    """
    lowered = text.lower()

    # Characters are deleted, not replaced by a space, so "heart/weight"
    # collapses into the single token "heartweight".
    without_punctuation = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    letters_only = re.sub(r"\d+", "", without_punctuation)

    cleaned_tokens = []
    for token in word_tokenize(letters_only):
        if token in STOPWORDS:
            continue
        cleaned_tokens.append(LEMMATIZER.lemmatize(token))

    return cleaned_tokens

In [19]:
# Build the vocabulary: every distinct, purely alphabetic token that occurs in the
# documents (query tokens are not added). Sorting fixes the position of each term
# in the TF-IDF vectors built from this list.
vocab = sorted({token for tokens in tokenized_docs for token in tokens if token.isalpha()})
print("Vocabulary:", vocab)


Vocabulary: ['a', 'achieving', 'acids', 'addition', 'additional', 'additives', 'after', 'aiding', 'aids', 'all', 'almond', 'along', 'also', 'alternatives', 'amino', 'an', 'and', 'another', 'apples', 'are', 'as', 'aspect', 'b', 'balance', 'balanced', 'be', 'blood', 'body', 'bone', 'bowel', 'broccoli', 'brown', 'build', 'by', 'c', 'calcium', 'calorie', 'can', 'carbohydrates', 'certain', 'cholesterol', 'choosing', 'chronic', 'cognitive', 'colon', 'colorful', 'combination', 'complete', 'components', 'conditions', 'consumed', 'consuming', 'contain', 'contribute', 'critical', 'crucial', 'cruciferous', 'daily', 'dairy', 'detoxification', 'diet', 'digestion', 'disease', 'diseases', 'drinking', 'drinks', 'each', 'eating', 'eggs', 'energy', 'enough', 'ensure', 'enzymes', 'especially', 'essential', 'excellent', 'fats', 'feeling', 'fiber', 'foods', 'for', 'fortified', 'foundation', 'from', 'fruits', 'full', 'fullness', 'function', 'gastrointestinal', 'glasses', 'good', 'grain', 'grains', 'greens',

In [9]:
# Function to calculate term frequency (TF)
def term_frequency(term, document):
  """Return the relative frequency of ``term`` in ``document``.

  Args:
      term: The token to count.
      document: Sequence of tokens (e.g. a list of strings).

  Returns:
      Occurrences of ``term`` divided by the number of tokens in ``document``;
      ``0.0`` when the document has no tokens.
  """
  # An empty document (e.g. an empty .txt file) would otherwise divide by zero.
  if not document:
    return 0.0
  return document.count(term) / len(document)

In [10]:
# Function to calculate inverse document frequency (IDF)
def inverse_document_frequency(term, all_documents):
  """Return the smoothed inverse document frequency of ``term``.

  Computes ``log((1 + N) / (1 + df)) + 1`` where ``N`` is the number of
  documents and ``df`` the number of documents containing ``term``.

  Adding one to both numerator and denominator keeps the value strictly
  positive and strictly decreasing in ``df``. The unsmoothed-numerator form
  ``log(N / (1 + df))`` is 0 when df == N - 1 and negative when df == N, which
  (once squared in a dot product) ranks a term present in every document above
  one present in all but one, and it fails on an empty corpus.

  Args:
      term: The token to look up.
      all_documents: Sequence of tokenized documents.

  Returns:
      A float >= 1.0; exactly 1.0 when every document contains ``term`` or when
      ``all_documents`` is empty.
  """
  num_docs = len(all_documents)
  num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
  return math.log((1 + num_docs) / (1 + num_docs_containing_term)) + 1.0

In [11]:
# Compute TF-IDF for a document
def compute_tfidf(document, all_documents, vocab):
  """Build the TF-IDF vector of ``document`` over ``vocab``.

  Args:
      document: Token list to vectorize.
      all_documents: Token lists of the whole corpus, used for the IDF part.
      vocab: Ordered terms; entry ``k`` of the result belongs to ``vocab[k]``.

  Returns:
      1-D NumPy array with one weight per vocabulary term, each being
      ``term_frequency(term, document) * inverse_document_frequency(term, all_documents)``.
  """
  weights = [
      term_frequency(term, document) * inverse_document_frequency(term, all_documents)
      for term in vocab
  ]
  return np.array(weights)

In [12]:
# Compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
  """Return the cosine of the angle between ``vec1`` and ``vec2``.

  Args:
      vec1: 1-D numeric array.
      vec2: 1-D numeric array of the same length as ``vec1``.

  Returns:
      ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``, or ``0.0`` when either vector
      has zero norm.
  """
  dot_product = np.dot(vec1, vec2)
  norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
  # A zero vector (e.g. a query with no vocabulary terms) has no direction;
  # dividing would yield NaN plus a RuntimeWarning, and NaN cannot be sorted
  # reliably. Treat it as "no similarity".
  if norm_product == 0:
    return 0.0
  return dot_product / norm_product

In [13]:
# Vectorize documents and queries over the same vocabulary; IDF statistics come
# from the document collection in both cases, so the vectors are comparable.
doc_tfidf_vectors = [
    compute_tfidf(doc_tokens, tokenized_docs, vocab)
    for doc_tokens in tokenized_docs
]
query_tfidf_vectors = [
    compute_tfidf(query_tokens, tokenized_docs, vocab)
    for query_tokens in tokenized_queries
]

In [21]:
# Destination of the similarity report on Google Drive
output_file_path = "/content/drive/MyDrive/result_Shristina.txt"

# The file is opened (and truncated) before anything is computed
with open(output_file_path, 'w') as f:
    # cosine_similarities[q][d] = similarity between query q and document d
    cosine_similarities = [
        [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
        for query_vector in query_tfidf_vectors
    ]

    # One section per query, documents listed from least to most similar
    for query_idx, query in enumerate(queries):
        f.write(f"\nCosine similarities for query '{query}':\n")

        # sorted() is stable, so documents with equal scores keep their index order
        ranked = sorted(enumerate(cosine_similarities[query_idx]), key=lambda pair: pair[1])

        # Documents are reported with 1-based numbers
        f.writelines(
            f"Document {doc_idx + 1}: {similarity:.4f}\n"
            for doc_idx, similarity in ranked
        )

# Confirm where the report was written
print(f"Output has been saved to {output_file_path}")


Output has been saved to /content/drive/MyDrive/result_Shristina.txt
