In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

In [16]:
# Load and preprocess documents
def load_documents(file_path="extracted_pdf_content.txt"):
    """Read a UTF-8 text file and return its non-blank lines as documents.

    Every line of the file is treated as one document. Surrounding
    whitespace is stripped, and lines that are empty after stripping are
    discarded.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: The stripped, non-empty lines in file order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        stripped_lines = [raw_line.strip() for raw_line in handle]
    return [line for line in stripped_lines if line]

In [17]:
# Preprocess query (optional: add stemming/lemmatization if needed)
def preprocess_query(query):
    """Normalize a raw query string by lowercasing it.

    Args:
        query: The query text as entered by the user.

    Returns:
        str: The lowercased query.
    """
    return query.lower()

In [18]:
# Build the TF-IDF index
def build_index(documents):
    """Fit a TF-IDF model on the documents and vectorize them.

    Args:
        documents: Collection of document strings to index.

    Returns:
        tuple: ``(vectorizer, tfidf_matrix)`` -- the fitted
        ``TfidfVectorizer`` (configured with English stop words) and the
        matrix produced by ``fit_transform`` for ``documents``.
    """
    tfidf = TfidfVectorizer(stop_words="english")
    doc_term_matrix = tfidf.fit_transform(documents)
    return tfidf, doc_term_matrix

In [19]:
# Search the documents
def search_query(query, vectorizer, tfidf_matrix, documents, top_k=None):
    """Rank documents by cosine similarity between their TF-IDF vectors and the query.

    The query is normalized with ``preprocess_query``, projected into the
    vectorizer's feature space, and compared against every row of
    ``tfidf_matrix``. Only documents with a strictly positive score are
    returned.

    Args:
        query: Raw query text.
        vectorizer: Fitted vectorizer exposing ``transform``.
        tfidf_matrix: Document-term matrix whose rows align with ``documents``.
        documents: Sequence of document strings, indexed like the matrix rows.
        top_k: Optional cap on the number of results. ``None`` (the default)
            returns every document with a positive score.

    Returns:
        list[tuple[str, float]]: ``(document, score)`` pairs ordered by
        descending score; documents with equal scores keep their original
        corpus order.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be a non-negative integer or None")

    query = preprocess_query(query)
    query_vector = vectorizer.transform([query])
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Drop zero-score documents before sorting so only actual matches are ranked.
    matching = np.flatnonzero(similarity_scores > 0)
    # A stable sort on the negated scores gives descending order while keeping
    # ties in corpus order; reversing a default argsort leaves tie order unspecified.
    ranked_indices = matching[np.argsort(-similarity_scores[matching], kind="stable")]
    if top_k is not None:
        ranked_indices = ranked_indices[:top_k]

    return [(documents[i], similarity_scores[i]) for i in ranked_indices]

In [20]:
# Display search results
def display_results(results, query, max_chars=200):
    """Print ranked search results with a short preview of each document.

    Args:
        results: Sequence of ``(document, score)`` pairs, best match first.
        query: The query text, echoed in the header line.
        max_chars: Maximum number of document characters shown per result.
            Longer documents are cut at this length and suffixed with
            ``...``; shorter ones are printed in full with no ellipsis.

    Returns:
        None. Output is written to standard output.
    """
    print(f"Search Results for Query: \"{query}\"\n")
    if not results:
        print("No relevant documents found.")
        return

    for rank, (doc, score) in enumerate(results, start=1):
        preview = doc[:max_chars]
        # Only mark the preview as truncated when text was actually cut off.
        if len(doc) > max_chars:
            preview += "..."
        print(f"Rank {rank}: (Score: {score:.4f})")
        print(f"Document: {preview}")
        print("-" * 80)

In [21]:
# Main function to demonstrate functionality
def main():
    """Run an interactive search session over the loaded documents.

    Loads the corpus with ``load_documents``, builds the TF-IDF index with
    ``build_index``, then repeatedly prompts for a query and prints the
    ranked matches. The loop ends when the user types ``exit`` (in any
    letter case) or when input ends or is interrupted (EOF / Ctrl-C).
    Blank queries are ignored and the prompt is shown again.
    """
    # Load documents
    documents = load_documents()
    print(f"Loaded {len(documents)} documents.")
    if not documents:
        # Nothing to index: stop here rather than fitting the vectorizer on an
        # empty corpus.
        print("No documents to index; nothing to search.")
        return

    # Build the TF-IDF index
    vectorizer, tfidf_matrix = build_index(documents)
    print("TF-IDF index built.")

    # Accept user queries
    while True:
        try:
            query = input("\nEnter your search query (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the session without a traceback.
            print()
            break
        if query.lower() == 'exit':
            break
        if not query:
            # An empty query cannot match anything; just prompt again.
            continue
        results = search_query(query, vectorizer, tfidf_matrix, documents)
        display_results(results, query)

# Start the interactive demo when this code is executed directly; the module
# name is "__main__" both for a script run and for a cell run in a notebook kernel.
if __name__ == "__main__":
    main()

Loaded 4522 documents.
TF-IDF index built.
Search Results for Query: "what is the importnace of machine learning"

Rank 1: (Score: 1.0000)
Document: 3.4. Machine learning...
--------------------------------------------------------------------------------
Rank 2: (Score: 0.6821)
Document: Supervised learning is the machine learning task of learning...
--------------------------------------------------------------------------------
Rank 3: (Score: 0.6356)
Document: Machine Learning Algorithms -A Review...
--------------------------------------------------------------------------------
Rank 4: (Score: 0.6356)
Document: Machine Learning Algorithms - A Review...
--------------------------------------------------------------------------------
Rank 5: (Score: 0.6255)
Document: Reinforcement learning is an area of machine learning...
--------------------------------------------------------------------------------
Rank 6: (Score: 0.5947)
Document: from these data using machine learning techniqu