In [35]:
import os
import re
import numpy as np
import pandas as pd
import nltk
import math

In [36]:
# Function to tokenize the text
def tokenize(text):
    return text.lower().split()

# Function to calculate term frequency (TF)
def term_frequency(term, document):
    return document.count(term) / len(document)

# Function to calculate inverse document frequency (IDF)
def inverse_document_frequency(term, all_documents):
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(len(all_documents) / (1 + num_docs_containing_term))

In [37]:
# Function to compute TF-IDF for a document
def compute_tfidf(document, all_documents, vocabulary):
    tfidf_vector = []
    for term in vocabulary:
        tf = term_frequency(term, document)
        idf = inverse_document_frequency(term, all_documents)
        tfidf_vector.append(tf * idf)
    return np.array(tfidf_vector)

# Function to compute cosine similarity between two vectors
def cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    return dot_product / (norm_vector1 * norm_vector2)

In [38]:
# Main function
def main():
  #data
    data_directory = '/content/data'

    # Reading all files from the directory
    documents = []
    file_names = []
    for file_name in os.listdir(data_directory):
        if file_name.endswith(".txt"):
            with open(os.path.join(data_directory, file_name), "r") as file:
                content = file.read()
                documents.append(content)
                file_names.append(file_name)

    # Hardcoded queries
    search_queries = ['The three women sit huddled together',
    'The cave remained a place',
    'That leads some here to worry',
    'artificial intelligence model that',
    'The planet was really vulnerable at that time']

    # Tokenizing documents and queries
    tokenized_documents = [tokenize(doc) for doc in documents]
    tokenized_queries = [tokenize(query) for query in search_queries]

    # Building the vocabulary (unique words across all documents)
    vocabulary = sorted(set([word for doc in tokenized_documents for word in doc]))

  # Calculate TF-IDF vectors for documents and queries
    document_tfidf_vectors = [compute_tfidf(doc, tokenized_documents, vocabulary) for doc in tokenized_documents]
    query_tfidf_vectors = [compute_tfidf(query, tokenized_documents, vocabulary) for query in tokenized_queries]

    # Calculate cosine similarities
    similarity_results = []
    for query_vector in query_tfidf_vectors:
        similarities = [cosine_similarity(query_vector, doc_vector) for doc_vector in document_tfidf_vectors]
        similarity_results.append(similarities)

    # Write the results to a text file
    with open("similarity_results_susan.txt", "w") as output_file:
        for i, query in enumerate(search_queries):
            output_file.write(f"\nCosine similarities for query '{query}':\n")
            for j, doc in enumerate(documents):
                output_file.write(f"Document {file_names[j]}: {similarity_results[i][j]:.4f}\n")

    # Optional: print results for checking
    for i, query in enumerate(search_queries):
        print(f"\nCosine similarities for query '{query}':")
        for j, doc in enumerate(documents):
            print(f"Document {file_names[j]}: {similarity_results[i][j]:.4f}")

if __name__ == "__main__":
    main()




Cosine similarities for query 'The three women sit huddled together':
Document Data9.txt: 0.0068
Document data6.txt: 0.0079
Document data2.txt: 0.0030
Document Data5.txt: 0.0037
Document Data8.txt: 0.0056
Document data4.txt: 0.0333
Document data1.txt: 0.0691
Document data3.txt: 0.0212
Document Data7.txt: 0.0084
Document Data10.txt: 0.0114

Cosine similarities for query 'The cave remained a place':
Document Data9.txt: 0.0832
Document data6.txt: 0.0219
Document data2.txt: 0.0059
Document Data5.txt: 0.0082
Document Data8.txt: 0.0125
Document data4.txt: 0.0086
Document data1.txt: 0.0078
Document data3.txt: 0.0062
Document Data7.txt: 0.0087
Document Data10.txt: 0.0076

Cosine similarities for query 'That leads some here to worry':
Document Data9.txt: 0.0152
Document data6.txt: 0.0033
Document data2.txt: 0.0024
Document Data5.txt: 0.0807
Document Data8.txt: 0.0040
Document data4.txt: 0.0063
Document data1.txt: 0.0044
Document data3.txt: 0.0033
Document Data7.txt: 0.0027
Document Data10.txt: