# Colab init

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##Load and Preprocess PDF Files

In [5]:
!pip install PyPDF2



In [6]:
import os
import zipfile
from pathlib import Path
from nltk.tokenize import TreebankWordTokenizer
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    with open(pdf_path, 'rb') as file:
        pdf = PdfReader(file)
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text


def load_pdfs_from_zip(zip_path):
    """Loads PDF files from a ZIP file and extracts their text."""
    extracted_texts = {}
    temp_dir = "temp_documents"

    # Unzip the documents
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    # Iterate through the extracted PDF files
    for pdf_file in Path(temp_dir).rglob("*.pdf"):
        extracted_texts[pdf_file.name] = extract_text_from_pdf(pdf_file)

    # Clean up the temporary directory
    for file in Path(temp_dir).glob("*"):
        if file.is_file():
            file.unlink()
        else:
            for subfile in file.glob("*"):
                subfile.unlink()
            file.rmdir()
    Path(temp_dir).rmdir()

    return extracted_texts


# Phase 2

##Boolean Retrieval Model

In [None]:
def boolean_search(documents, query):
    """
    Performs a boolean search over documents.
    Supports queries with AND, OR, NOT operators.
    """
    tokenizer = TreebankWordTokenizer()
    query = query.lower()
    query_tokens = tokenizer.tokenize(query)

    results = {}
    for doc_name, content in documents.items():
        content_tokens = set(tokenizer.tokenize(content.lower()))

        # Split the content into lines
        lines = content.split("\n")

        matching_lines = []  # To store matching lines for this document

        # Handle simple AND, OR, NOT queries
        if "and" in query_tokens:
            terms = [t for t in query_tokens if t != "and"]
            if all(term in content_tokens for term in terms):
                # Add lines that contain the query terms
                matching_lines = [line for line in lines if all(term in line.lower() for term in terms)]
        elif "or" in query_tokens:
            terms = [t for t in query_tokens if t != "or"]
            if any(term in content_tokens for term in terms):
                # Add lines that contain any of the query terms
                matching_lines = [line for line in lines if any(term in line.lower() for term in terms)]
        elif "not" in query_tokens:
            term = query_tokens[query_tokens.index("not") + 1]
            if term not in content_tokens:
                # Add lines that do not contain the term
                matching_lines = [line for line in lines if term not in line.lower()]
        else:  # Single term
            if query in content_tokens:
                # Add lines that contain the query term
                matching_lines = [line for line in lines if query in line.lower()]

        if matching_lines:
            results[doc_name] = matching_lines

    return results


##Vector Space Model

In [10]:
import nltk
nltk.download('punkt_tab')  # for tokenaization
nltk.download('stopwords')  # for stop words
nltk.download('wordnet')  # for lemmatization


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_query(query):
    # Tokenize query
    tokens = word_tokenize(query.lower())

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Apply stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Apply lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Rejoin tokens back into string
    return " ".join(tokens)

def vector_space_search(documents, query, top_n=155):
    """
    Performs a vector space search using TF-IDF.
    Returns the top N most relevant documents and matching lines.
    """
    # Preprocess query (remove stopwords, stemming, and lemmatization)
    query_tokens = preprocess_query(query)

    # Prepare TF-IDF matrix
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
    doc_names = list(documents.keys())
    doc_texts = list(documents.values())
    tfidf_matrix = vectorizer.fit_transform(doc_texts)

    # Transform query into the same TF-IDF space
    query_vector = vectorizer.transform([query_tokens])

    # Compute cosine similarity
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Get the top N results
    top_documents = [(doc_names[i], similarities[i], doc_texts[i]) for i in top_indices]

    # Find matching lines for each of the top N documents
    results = {}
    for doc_name, score, content in top_documents:
        lines = content.split("\n")
        matching_lines = [line for line in lines if query.lower() in line.lower()]
        if matching_lines:
            results[doc_name] = matching_lines

    return results


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


##Main Script

In [None]:
def main():
    zip_path = "/content/drive/MyDrive/phase2/Documents.zip"  # Path to your ZIP file containing PDFs

    print("Loading documents...")
    documents = load_pdfs_from_zip(zip_path)
    print(f"Documents loaded: {documents.keys()}")

    # # Print all document names
    # print("All Document Names:")
    # for doc_name in documents.keys():
    #   print(doc_name)

    # Example usage of Boolean Retrieval Model
    print("\n--- Boolean Search ---")
    boolean_query = "factors AND must AND be AND considered"
    # factors AND must AND be AND considered
    boolean_results = boolean_search(documents, boolean_query)
    print(f"Boolean Search Results for '{boolean_query}':")
    for doc, lines in boolean_results.items():
        print(f"Document: {doc}")
        for line in lines:
            print(f"  Line: {line}")

    # Example usage of Vector Space Model
    print("\n--- Vector Space Search ---")
    vector_query = "ADHD"
    vector_results = vector_space_search(documents, vector_query, top_n=155)
    print(f"Vector Space Search Results for '{vector_query}':")
    for doc, lines in vector_results.items():
        print(f"Document: {doc}")
        for line in lines:
            print(f"  Line: {line}")



In [None]:
if __name__ == "__main__":
    main()

Loading documents...
Documents loaded: dict_keys(['Voyeuristic Disorder.pdf', 'Rapid Eye Movement Sleep Behavior Disorder.pdf', 'Narcolepsy.pdf', 'Panic Attack Specifier.pdf', 'Major or Mild Neurocognitive Disorder Due to Prion Disease.pdf', 'Conversion Disorder (Functional Neurological Symptom Disorder).pdf', 'Narcissistic Personality Disorder.pdf', 'Hoarding Disorder.pdf', 'Other Mental Disorders.pdf', 'Stimulant-Related Disorders.pdf', 'Unspecified Bipolar and Related Disorder.pdf', 'Obsessive-Compulsive and Related Disorders.pdf', 'Disruptive, Impulse-Control, and Conduct Disorders.pdf', 'Premenstrual Dysphoric Disorder.pdf', 'Body Dysmorphic Disorder.pdf', 'Dependent Personality Disorder.pdf', 'Anxiety Disorders.pdf', 'Major or Mild Neurocognitive Disorder With Lewy Bodies.pdf', 'Separation Anxiety Disorder.pdf', 'Alcohol-Related Disorders.pdf', 'Major or Mild Vascular Neurocognitive Disorder.pdf', 'Dissociative Amnesia.pdf', 'Histrionic Personality Disorder.pdf', 'Substance Medic

#####The bellow code provides a hybrid approach for document retrieval and ranking using a combination of TF-IDF and BERT models. First, it extracts text from PDFs stored in a ZIP file. The documents are then preprocessed, and TF-IDF is used to rank the documents based on the query. BERT is employed to rank documents by semantic similarity, generating embeddings for both the query and documents. The system first retrieves documents using BERT embeddings for relevance and then refines the ranking using TF-IDF. Finally, the top-ranked documents are presented, with matching lines from the documents displayed based on the query, offering a comprehensive document retrieval system that handles both direct and conceptual queries.

##Import Required Libraries

In [None]:
import os
import zipfile
from pathlib import Path
from nltk.tokenize import TreebankWordTokenizer
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import BertTokenizer, BertModel
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


##BERT Setup

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')


##Helper Functions to Load Documents

In [None]:
# Helper functions to load documents
def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    with open(pdf_path, 'rb') as file:
        pdf = PdfReader(file)
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

def load_pdfs_from_zip(zip_path):
    """Loads PDF files from a ZIP file and extracts their text."""
    extracted_texts = {}
    temp_dir = "temp_documents"

    # Unzip the documents
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    # Iterate through the extracted PDF files
    for pdf_file in Path(temp_dir).rglob("*.pdf"):
        extracted_texts[pdf_file.name] = extract_text_from_pdf(pdf_file)

    # Clean up the temporary directory
    for file in Path(temp_dir).glob("*"):
        if file.is_file():
            file.unlink()
        else:
            for subfile in file.glob("*"):
                subfile.unlink()
            file.rmdir()
    Path(temp_dir).rmdir()

    return extracted_texts


##BERT Embedding for Queries and Documents

In [None]:
# BERT embedding for queries and documents
def get_bert_embeddings(text):
    """Generate BERT embeddings for a given text."""
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()


##TF-IDF Pre-processing

In [None]:
# TF-IDF pre-processing
def preprocess_query(query):
    """Preprocesses the query by tokenizing, stemming, and lemmatizing."""
    tokens = word_tokenize(query.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Apply stemming and lemmatization
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return " ".join(tokens)


##Vector Space Search using TF-IDF

In [None]:
# Vector space search using TF-IDF
def vector_space_search(documents, query, top_n=5):
    """Performs a vector space search using TF-IDF and ranks documents."""
    query_tokens = preprocess_query(query)

    # Prepare TF-IDF matrix
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
    doc_names = list(documents.keys())
    doc_texts = list(documents.values())
    tfidf_matrix = vectorizer.fit_transform(doc_texts)

    # Transform query into the same TF-IDF space
    query_vector = vectorizer.transform([query_tokens])

    # Compute cosine similarity
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Get the top N results
    top_documents = [(doc_names[i], similarities[i], doc_texts[i]) for i in top_indices]

    return top_documents


##Rank Documents with BERT

In [None]:
# Rank documents with BERT similarity
def rank_documents_with_bert(documents, query, top_n=5):
    """Ranks documents using BERT embeddings based on semantic similarity."""
    query_embedding = get_bert_embeddings(query)

    document_embeddings = []
    for doc_name, doc_text in documents.items():
        doc_embedding = get_bert_embeddings(doc_text)
        document_embeddings.append(doc_embedding)

    similarities = [cosine_similarity([query_embedding], [doc_emb])[0][0] for doc_emb in document_embeddings]
    ranked_docs = sorted(zip(documents.keys(), similarities), key=lambda x: x[1], reverse=True)

    return ranked_docs


##Combine BERT for Retrieval and TF-IDF for Ranking

In [None]:
# Combine BERT for retrieval and TF-IDF for ranking
def retrieve_and_rank(documents, query, top_n=5):
    """Retrieve and rank documents using BERT for retrieval and TF-IDF for ranking."""
    # Step 1: Retrieve documents using BERT similarity
    ranked_docs_by_bert = rank_documents_with_bert(documents, query, top_n=top_n)

    # Step 2: Rank documents with TF-IDF within the top BERT results
    top_docs = [doc_name for doc_name, _ in ranked_docs_by_bert]
    filtered_docs = {doc_name: documents[doc_name] for doc_name in top_docs}
    tfidf_results = vector_space_search(filtered_docs, query, top_n=top_n)

    return tfidf_results


##Main Function to Load Documents and Perform Search

In [None]:
# Main function
def main():
    zip_path = "/content/drive/MyDrive/phase2/Documents.zip"  # Path to your ZIP file containing PDFs
    print("Loading documents...")
    documents = load_pdfs_from_zip(zip_path)
    print(f"Documents loaded: {documents.keys()}")

    # Example query
    query = "what is Diagnostic Features?"

    # Retrieve and rank documents
    print("\n--- Retrieve and Rank Documents ---")
    ranked_documents = retrieve_and_rank(documents, query, top_n=20)

    # Show matching lines containing the query
    for doc_name, score, content in ranked_documents:
        print(f"Document: {doc_name}")
        print(f"Score (TF-IDF): {score}")

        # Extract matching lines from the document
        lines = content.split("\n")
        matching_lines = [line for line in lines if query.lower() in line.lower()]

        if matching_lines:
            for line in matching_lines:
                print(f"  Matching Line: {line}")
        else:
            print("  No matching lines found.")

In [None]:
if __name__ == "__main__":
    main()

Loading documents...
Documents loaded: dict_keys(['Voyeuristic Disorder.pdf', 'Rapid Eye Movement Sleep Behavior Disorder.pdf', 'Narcolepsy.pdf', 'Panic Attack Specifier.pdf', 'Major or Mild Neurocognitive Disorder Due to Prion Disease.pdf', 'Conversion Disorder (Functional Neurological Symptom Disorder).pdf', 'Narcissistic Personality Disorder.pdf', 'Hoarding Disorder.pdf', 'Other Mental Disorders.pdf', 'Stimulant-Related Disorders.pdf', 'Unspecified Bipolar and Related Disorder.pdf', 'Obsessive-Compulsive and Related Disorders.pdf', 'Disruptive, Impulse-Control, and Conduct Disorders.pdf', 'Premenstrual Dysphoric Disorder.pdf', 'Body Dysmorphic Disorder.pdf', 'Dependent Personality Disorder.pdf', 'Anxiety Disorders.pdf', 'Major or Mild Neurocognitive Disorder With Lewy Bodies.pdf', 'Separation Anxiety Disorder.pdf', 'Alcohol-Related Disorders.pdf', 'Major or Mild Vascular Neurocognitive Disorder.pdf', 'Dissociative Amnesia.pdf', 'Histrionic Personality Disorder.pdf', 'Substance Medic

# Phase 3

In [7]:
zip_path = "/content/drive/MyDrive/Colab Notebooks/IR_Project/Documents1.zip"  # Path to your ZIP file containing PDFs

print("Loading documents...")
documents = load_pdfs_from_zip(zip_path)
print(f"Documents loaded: {documents.keys()}")

Loading documents...
Documents loaded: dict_keys(['Acute Stress Disorder.pdf', 'Adjustment Disorders.pdf'])


## TF-IDF + Cosine Similarity

In [30]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re

from sklearn.feature_extraction.text import TfidfVectorizer


# Load the documents and preprocess
def preprocess(documents):

    stop_words = set(stopwords.words("english"))
    processed_docs = []
    for doc_name, content in documents.items():

        # Convert to lowercase
        text = re.sub(r'\d+', '', content.lower())
        print(text)
        print(doc_name)
        # Tokenize
        tokens = word_tokenize(text)
        # Remove stop words and punctuation
        tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
        processed_docs.append(tokens)
    return processed_docs


processed_documents = preprocess(documents)
# Flatten documents into strings for vectorization
flattened_docs = [" ".join(doc) for doc in processed_documents]

# Generate the TF-IDF matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(flattened_docs)
terms = vectorizer.get_feature_names_out()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
decrease or discontinue use (cri terion ). the individual may sp end a great deal of time ob-
taining the substance, using the substance, or recovering from  its effects (criterion ). in
some instances of more severe substance use disorders, virtually all of the individual’s daily
activities revolve around  the substance. craving (criterion ) is manifested by an intense de-
sire or urge for the drug that may occur at an y time but is more likely when in an environ-
ment where the drug previous ly was obtained or used. cr aving has also been shown to
involve classical conditioning and is associated with activation of specific reward structures
in the brain. craving is queried by asking if there has ever been a time when they had such
strong urges to take the drug th at they could not think of anythi ng else. current craving is of-
ten used as a treatment outcome measure becaus e it may be a signal of impending relapse.
soc

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Transpose the TF-IDF matrix to focus on terms
term_similarity_matrix = cosine_similarity(tfidf_matrix.T)

# Convert similarity matrix to a dictionary for building the thesaurus
thesaurus = {}
for i, term in enumerate(terms):
    similar_terms = np.argsort(-term_similarity_matrix[i])  # Sort by similarity
    thesaurus[term] = [terms[j] for j in similar_terms if terms[j] != term][:10]  # Top 10 related terms


In [32]:
import json

# Save the thesaurus
with open("/content/drive/MyDrive/Colab Notebooks/IR_Project/thesaurus.json", "w") as f:
    json.dump(thesaurus, f)


In [33]:
print(thesaurus)

Output hidden; open in https://colab.research.google.com to view.

## Bert Model

In [9]:
from collections import defaultdict
import numpy as np

def extract_word_embeddings_from_documents(documents, tokenizer, model, max_length=512):
    """
    Extract and aggregate word embeddings from multiple documents.

    Args:
        documents (dict): A dictionary where keys are document names and values are document contents.
        tokenizer: Tokenizer from a pre-trained BERT model.
        model: Pre-trained BERT model.
        max_length (int): Maximum sequence length for BERT (default: 512).

    Returns:
        dict: A dictionary mapping words to their aggregated embeddings across documents.
    """
    word_embeddings = defaultdict(list)  # To store embeddings for each word

    for doc_name, content in documents.items():
        # Tokenize and process each document
        inputs = tokenizer(
            content, return_tensors="pt", truncation=True, padding=True, max_length=max_length
        )
        outputs = model(**inputs)
        token_embeddings = outputs.last_hidden_state[0]  # Shape: (seq_len, hidden_dim)

        # Map tokens to words
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        for i, token in enumerate(tokens):
            if token not in ["[CLS]", "[SEP]", "[PAD]"]:  # Skip special tokens
                word_embeddings[token].append(token_embeddings[i].detach().numpy())

    # Aggregate embeddings by averaging across all occurrences
    aggregated_embeddings = {
        word: np.mean(embeddings, axis=0) for word, embeddings in word_embeddings.items()
    }
    return aggregated_embeddings

In [10]:
import numpy as np

def get_phrase_embedding(phrase, tokenizer, model, max_length=512):
    inputs = tokenizer(
        phrase, return_tensors="pt", truncation=True, padding=True, max_length=max_length
    )
    outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state[0]
    # Average the embeddings for the entire phrase
    phrase_embedding = torch.mean(token_embeddings, dim=0).detach().numpy()
    return phrase_embedding


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

def build_thesaurus(word_embeddings):
    thesaurus = {}
    words = list(word_embeddings.keys())
    embeddings = np.array(list(word_embeddings.values()))

    # Compute cosine similarity between all word embeddings
    similarity_matrix = cosine_similarity(embeddings)
    for i, word in enumerate(words):
        similar_indices = similarity_matrix[i].argsort()[-6:-1][::-1]  # Top 5 similar words
        similar_words = [words[j] for j in similar_indices]
        thesaurus[word] = similar_words
    return thesaurus


In [14]:
import json
from transformers import BertTokenizer, BertModel


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
word_embeddings = extract_word_embeddings_from_documents(documents, tokenizer, model)
thesaurus = build_thesaurus(word_embeddings)

# Save the thesaurus
with open("/content/drive/MyDrive/Colab Notebooks/IR_Project/thesaurus.json", "w") as f:
    json.dump(thesaurus, f)
print(thesaurus)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'280': ['286', '–', 'd', 'other', 'severe'], 'trauma': ['traumatic', 'injury', 'blast', 'accident', 'event'], '-': ['–', 'of', '##s', '##ic', '##t'], 'and': ['these', 'in', 'a', 'an', 'or'], 'stress': ['bipolar', 'traumatic', '##pressive', 'trauma', 'anxiety'], '##or': ['##ration', '##ic', 'trauma', '##t', 'medication'], 'related': ['induced', '##table', 'due', 'specific', 'causing'], 'disorders': ['disorder', 'disturbances', 'features', 'symptoms', 'deficit'], 'psychotic': ['bipolar', '##pressive', 'disorders', '##ual', 'psychological'], 'features': ['symptoms', 'disorders', 'disturbances', 'effects', 'disorder'], ';': ['and', 'of', 'due', '/', '280'], 'del': ['##um', 'hall', '##iri', 'de', 'di'], '##iri': ['del', '##um', '##uc', 'hall', '##rita'], '##um': ['del', '##bility', '##inations', '##a', '##iri'], 'substance': ['medication', '##pressive', 'bipolar', 'psychotic', 'use'], '/': ['or', 'and', '##ration', '-', ';'], 'medication': ['substance', '##or', 'psychotic', 'induced', 'dis