In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load and Preprocess PDF Files

In [112]:
import os
import tempfile
import zipfile
from pathlib import Path

from nltk.tokenize import TreebankWordTokenizer
from PyPDF2 import PdfReader


def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Path (str or path-like) of the PDF file to read.

    Returns:
        The per-page texts joined with a newline, in page order. A page for
        which extraction yields nothing contributes an empty string.
    """
    with open(pdf_path, 'rb') as file:
        pdf = PdfReader(file)
        # `or ""` is defensive: a page without extractable text must not
        # break the join (assumes extract_text() may yield None/empty —
        # TODO confirm against the installed PyPDF2 version).
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join with a newline so the last word of one page is not fused with the
    # first word of the next; callers tokenize and split this text by line.
    return "\n".join(page_texts)


def load_pdfs_from_zip(zip_path):
    """Extract a ZIP archive and return the text of every PDF inside it.

    The archive is unpacked into a private temporary directory that is
    removed when the function returns or raises, however deeply the
    archive's folders are nested.

    Args:
        zip_path: Path of the ZIP archive containing the PDF files.

    Returns:
        dict mapping each PDF's file name (without its folder) to the text
        returned by extract_text_from_pdf. PDFs are found recursively via
        the "*.pdf" pattern. If two PDFs in different folders share a file
        name, the one visited later overwrites the earlier entry.
    """
    extracted_texts = {}

    # TemporaryDirectory gives a unique location (no clash with an existing
    # "temp_documents" folder) and deletes the whole tree on exit.
    with tempfile.TemporaryDirectory(prefix="temp_documents_") as temp_dir:
        # Unzip the documents
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Iterate through the extracted PDF files (all nesting levels)
        for pdf_file in Path(temp_dir).rglob("*.pdf"):
            extracted_texts[pdf_file.name] = extract_text_from_pdf(pdf_file)

    return extracted_texts


## Boolean Retrieval Model

In [113]:
def boolean_search(documents, query):
    """
    Performs a boolean search over documents.
    Supports queries with AND, OR, NOT operators.

    Only one kind of operator is honoured per query, checked in the order
    AND, then OR, then NOT; any other operator word in the same query is
    treated as an ordinary term. For NOT, only the token immediately after
    the first "not" is used. A query without operators is treated as one
    term (the whole lower-cased, whitespace-stripped query string).

    A document qualifies when its token set satisfies the query; the lines
    reported for it are those whose lower-cased text satisfies the same
    condition by substring test. A qualifying document with no such line is
    omitted from the result.

    Args:
        documents: dict mapping document name to its full text.
        query: Query string; matching is case-insensitive.

    Returns:
        dict mapping document name to the list of matching lines. Empty
        when the query has no usable term (e.g. "", "AND", or a trailing
        "NOT").
    """
    tokenizer = TreebankWordTokenizer()
    query = query.lower()
    query_tokens = tokenizer.tokenize(query)

    # The query is the same for every document, so resolve it once here.
    if "and" in query_tokens:
        operator = "and"
        terms = [t for t in query_tokens if t != "and"]
    elif "or" in query_tokens:
        operator = "or"
        terms = [t for t in query_tokens if t != "or"]
    elif "not" in query_tokens:
        operator = "not"
        # Slicing (rather than indexing) keeps a trailing NOT from raising
        # IndexError; it simply yields no term.
        terms = query_tokens[query_tokens.index("not") + 1:][:1]
    else:
        operator = "term"
        stripped = query.strip()
        terms = [stripped] if stripped else []

    # Without terms there is nothing to match. This also stops a bare "AND"
    # from matching every line through all([]) being True.
    if not terms:
        return {}

    def satisfies(haystack):
        """Apply the query condition to a token set or a lower-cased line."""
        if operator == "and":
            return all(term in haystack for term in terms)
        if operator == "or":
            return any(term in haystack for term in terms)
        if operator == "not":
            return terms[0] not in haystack
        return terms[0] in haystack

    results = {}
    for doc_name, content in documents.items():
        content_tokens = set(tokenizer.tokenize(content.lower()))
        if not satisfies(content_tokens):
            continue

        # Line-level check is a substring test on the lower-cased line.
        matching_lines = [
            line for line in content.split("\n") if satisfies(line.lower())
        ]
        if matching_lines:
            results[doc_name] = matching_lines

    return results


## Vector Space Model

In [114]:
def vector_space_search(documents, query, top_n=10):
    """
    Performs a vector space search using TF-IDF.
    Returns the top N most relevant documents and matching lines.

    Documents are ranked by cosine similarity between the TF-IDF vector of
    the query and that of each document. For each of the top_n documents,
    the lines that contain the whole query string (case-insensitive
    substring match) are collected; a top-ranked document with no such line
    is omitted from the result.

    NOTE(review): TfidfVectorizer and cosine_similarity are not imported in
    the cells visible here — confirm another cell imports them (presumably
    from scikit-learn) before running on a fresh kernel.

    Args:
        documents: dict mapping document name to its full text.
        query: Free-text query.
        top_n: Maximum number of top-ranked documents to inspect.

    Returns:
        dict mapping document name to its list of matching lines, ordered
        from most to least similar. Empty when there are no documents or
        top_n is not positive.
    """
    # An empty corpus cannot be vectorized, and a non-positive top_n must
    # select nothing (the slice [-0:] would otherwise select every document).
    if not documents or top_n <= 0:
        return {}

    tokenizer = TreebankWordTokenizer()
    query_lower = query.lower()
    query_tokens = " ".join(tokenizer.tokenize(query_lower))

    # Prepare TF-IDF matrix
    vectorizer = TfidfVectorizer()
    doc_names = list(documents.keys())
    doc_texts = list(documents.values())
    tfidf_matrix = vectorizer.fit_transform(doc_texts)

    # Transform query into the same TF-IDF space
    query_vector = vectorizer.transform([query_tokens])

    # Compute cosine similarity; argsort is ascending, so take the tail and
    # reverse it to get the best matches first.
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Find matching lines for each of the top N documents
    results = {}
    for i in top_indices:
        matching_lines = [
            line for line in doc_texts[i].split("\n")
            if query_lower in line.lower()
        ]
        if matching_lines:
            results[doc_names[i]] = matching_lines

    return results


## Main Script

In [152]:
def _print_search_results(label, query, results):
    """Print a header for the query, then each document with its matching lines."""
    print(f"{label} Search Results for '{query}':")
    for doc, lines in results.items():
        print(f"Document: {doc}")
        for line in lines:
            print(f"  Line: {line}")


def main(zip_path="/content/drive/MyDrive/phase2/Documents.zip"):
    """Load the PDFs from a ZIP archive and run one demo query per model.

    Args:
        zip_path: Path to the ZIP file containing the PDFs. Defaults to the
            Google Drive location this notebook was written against.

    Prints the loaded document names, then the results of a sample boolean
    query and a sample vector space query.
    """
    print("Loading documents...")
    documents = load_pdfs_from_zip(zip_path)
    print(f"Documents loaded: {documents.keys()}")

    # Example usage of Boolean Retrieval Model
    print("\n--- Boolean Search ---")
    boolean_query = "factors AND must AND be AND considered"
    boolean_results = boolean_search(documents, boolean_query)
    _print_search_results("Boolean", boolean_query, boolean_results)

    # Example usage of Vector Space Model
    print("\n--- Vector Space Search ---")
    vector_query = "may be commonly held"
    vector_results = vector_space_search(documents, vector_query, top_n=10)
    _print_search_results("Vector Space", vector_query, vector_results)



In [153]:
# Run the demo only when this code executes as the main program.
if __name__ == "__main__":
    main()

Loading documents...
Documents loaded: dict_keys(['Disruptive, Impulse-Control, and Conduct Disorders.pdf', 'Personality Disorders.pdf', 'Other Mental Disorders.pdf', 'Depressive Disorders.pdf', 'Gender Dysphoria.pdf', 'Anxiety Disorders.pdf', 'Feeding and Eating Disorders.pdf', 'Paraphilic Disorders.pdf', 'Trauma- and Stressor-Related Disorders.pdf', 'Obsessive-Compulsive and Related Disorders.pdf', 'Elimination Disorders.pdf', 'Neurodevelopmental Disorders.pdf', 'Sexual Dysfunctions.pdf', 'Dissociative Disorders.pdf', 'Schizophrenia Spectrum and Other Psychotic Disorders.pdf', 'Neurocognitive Disorders.pdf', 'Sleep-Wake Disorders.pdf', 'Medication-Induced Movement Disorders and.pdf', 'Bipolar and Related Disorders.pdf', 'Substance-Related and Addictive Disorders.pdf', 'Other Conditions That May Be a Focus of Clinical Attention.pdf', 'Somatic Symptom and Related Disorders.pdf'])

--- Boolean Search ---
Boolean Search Results for 'factors AND must AND be AND considered':
Document: Sexu