In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import math
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Precision at K
def precision_at_k(relevant_docs, retrieved_docs, k):
    """Return the fraction of the top-``k`` retrieved documents that are relevant.

    Args:
        relevant_docs: Collection of (hashable) document identifiers judged
            relevant for the query. ``None`` is treated as empty.
        retrieved_docs: Ranked sequence of retrieved identifiers, best first.
        k: Cut-off rank. The denominator is always ``k``, even when fewer
            than ``k`` documents were retrieved.

    Returns:
        Precision@k as a float; ``0.0`` when ``k`` is not positive.
    """
    # A non-positive cut-off would divide by zero (k == 0) or make the slice
    # drop items from the END of the ranking (k < 0), so short-circuit it.
    if k <= 0:
        return 0.0

    # Set membership keeps each lookup constant-time.
    relevant = set(relevant_docs or ())
    hits = sum(1 for doc in retrieved_docs[:k] if doc in relevant)
    return hits / k

# Recall at K
def recall_at_k(relevant_docs, retrieved_docs, k):
    """Return the fraction of relevant documents found in the top-``k`` results.

    Args:
        relevant_docs: Collection of (hashable) document identifiers judged
            relevant for the query. ``None`` is treated as empty.
        retrieved_docs: Ranked sequence of retrieved identifiers, best first.
        k: Cut-off rank.

    Returns:
        Recall@k as a float in ``[0.0, 1.0]``; ``0.0`` when there are no
        relevant documents or ``k`` is not positive.
    """
    # A negative cut-off would make the slice drop items from the END of the
    # ranking instead of taking a prefix, so treat it like an empty cut-off.
    if k <= 0:
        return 0.0

    relevant = set(relevant_docs or ())
    if not relevant:
        return 0.0

    # Intersecting sets counts each relevant document at most once, so a
    # ranking that repeats an identifier cannot push recall above 1.0.
    found = relevant.intersection(retrieved_docs[:k])
    return len(found) / len(relevant)

# Helpers and main function
def _load_documents(directory):
    """Read every ``.txt`` file found directly inside ``directory``.

    Returns:
        A ``(docs, filenames)`` pair of parallel lists, or ``None`` if any
        file could not be read (the error is printed before returning).
    """
    docs = []
    filenames = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # ranking of equal-score documents reproducible from run to run.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)
        try:
            # Explicit encoding so decoding does not depend on the locale default.
            with open(filepath, "r", encoding="utf-8") as file:
                content = file.read()
        except (OSError, UnicodeError) as e:
            print(f"Error reading file '{filename}': {e}")
            return None
        docs.append(content)
        filenames.append(filename)
    return docs, filenames


def _genre_label(name):
    """Map a filename or query string to a class id.

    1 if the text contains 'horror', 2 if it contains 'sci-fi', 3 otherwise.
    """
    if 'horror' in name:
        return 1
    if 'sci-fi' in name:
        return 2
    return 3


def main():
    """Run the TF-IDF retrieval evaluation and the Naive Bayes experiment.

    Loads the ``.txt`` documents from the hard-coded Drive directory, ranks
    them against each hard-coded query by cosine similarity of TF-IDF
    vectors, prints precision/recall at ``k``, then trains a multinomial
    Naive Bayes model on filename-derived labels and scores it on the
    queries. All results are printed; nothing is returned.
    """
    # Directory containing the text documents
    directory = '/content/drive/MyDrive/documents'

    # Verify directory exists (isdir also rejects a plain file at that path,
    # which os.listdir() could not handle)
    if not os.path.isdir(directory):
        print(f"Error: The directory '{directory}' was not found.")
        return

    # Reading all files from the directory
    loaded = _load_documents(directory)
    if loaded is None:
        return
    docs, filenames = loaded

    if not docs:
        print("No documents found in the directory.")
        return

    # Hardcoded queries
    queries = ['horror', 'sci-fi', 'drama']

    # Define relevant documents per query (update with correct filenames from your dataset)
    relevant_docs_per_query = {
        'horror': ['horror_1.txt', 'horror_2.txt'],
        'sci-fi': ['sci-fi_1.txt', 'sci-fi_2.txt'],
        'drama': ['drama_1.txt', 'drama_2.txt']
    }

    # Relevance judgments that name files absent from the corpus can never be
    # matched and silently force precision/recall to zero, so surface them.
    known_files = set(filenames)
    for query in queries:
        missing = [name for name in relevant_docs_per_query.get(query, [])
                   if name not in known_files]
        if missing:
            print(f"Warning: relevant documents for query '{query}' "
                  f"are not in the directory: {missing}")

    # Fit the vectorizer on the document collection, then project the queries
    # into the same TF-IDF space. The matrices are kept sparse.
    vectorizer = TfidfVectorizer()
    doc_tfidf_vectors = vectorizer.fit_transform(docs)
    query_tfidf_vectors = vectorizer.transform(queries)

    # One call yields the full (n_queries, n_docs) similarity matrix.
    cosine_similarities = cosine_similarity(query_tfidf_vectors, doc_tfidf_vectors)

    # Precision and Recall calculations at k=5
    k = 5
    for i, query in enumerate(queries):
        print(f"\nResults for query '{query}':")

        # Pair document filenames with their similarity scores and sort by
        # similarity in descending order (stable, so ties keep filename order)
        ranked_docs = sorted(zip(filenames, cosine_similarities[i]),
                             key=lambda pair: pair[1], reverse=True)
        ranked_filenames = [name for name, _ in ranked_docs]

        # Debugging output: Print top k ranked documents for each query
        print(f"Top {k} ranked documents for query '{query}':")
        for rank, (filename, score) in enumerate(ranked_docs[:k], 1):
            print(f"Rank {rank}: Document {filename} - Score: {score:.4f}")

        # Calculate precision and recall at k
        relevant = relevant_docs_per_query.get(query, [])
        precision_k = precision_at_k(relevant, ranked_filenames, k)
        recall_k = recall_at_k(relevant, ranked_filenames, k)

        print(f"Precision at {k}: {precision_k:.4f}")
        print(f"Recall at {k}: {recall_k:.4f}")

    # Naive Bayes Classification: class labels are derived from the filenames
    labels = [_genre_label(filename) for filename in filenames]
    if len(set(labels)) < 2:
        print("Warning: filenames yield fewer than two classes; "
              "the Naive Bayes results below are not meaningful.")

    # Fit the Naive Bayes model on the document vectors
    nb_model = MultinomialNB()
    nb_model.fit(doc_tfidf_vectors, labels)

    # Predict a class for each query vector
    y_pred = nb_model.predict(query_tfidf_vectors)

    # Expected labels for the queries (same mapping as the training labels)
    y_true = [_genre_label(query) for query in queries]

    # Calculate accuracy, precision, and recall
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_true, y_pred, average='macro', zero_division=0)

    print(f"\nNaive Bayes Accuracy: {accuracy:.4f}")
    print(f"Naive Bayes Precision: {precision:.4f}")
    print(f"Naive Bayes Recall: {recall:.4f}")

# Run the experiment when this code is executed as the main module.
if __name__ == "__main__":
    main()


Results for query 'horror':
Top 5 ranked documents for query 'horror':
Rank 1: Document The Walking Dead.txt - Score: 0.0784
Rank 2: Document Speak No Evil.txt - Score: 0.0772
Rank 3: Document Beetlejuice.txt - Score: 0.0705
Rank 4: Document Poltergeist.txt - Score: 0.0698
Rank 5: Document Transformers One.txt - Score: 0.0000
Precision at 5: 0.0000
Recall at 5: 0.0000

Results for query 'sci-fi':
Top 5 ranked documents for query 'sci-fi':
Rank 1: Document Transformers One.txt - Score: 0.1394
Rank 2: Document Poltergeist.txt - Score: 0.0000
Rank 3: Document The Intouchables.txt - Score: 0.0000
Rank 4: Document The Walking Dead.txt - Score: 0.0000
Rank 5: Document WALL·E.txt - Score: 0.0000
Precision at 5: 0.0000
Recall at 5: 0.0000

Results for query 'drama':
Top 5 ranked documents for query 'drama':
Rank 1: Document Your Name (Kimi no Na w.txt - Score: 0.0627
Rank 2: Document The Godfather.txt - Score: 0.0624
Rank 3: Document The Intouchables.txt - Score: 0.0619
Rank 4: Document Ikiru