In [1]:
import os
import re

import faiss
import google.generativeai as genai
import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF into one string.

    Args:
        pdf_path: Path to the PDF file, passed straight to ``PdfReader``.

    Returns:
        The text of all pages in page order, each page followed by a
        newline. A document with no pages yields an empty string.
    """
    reader = PdfReader(pdf_path)
    # Join once instead of growing a string page by page. `or ''` treats a
    # page whose extraction yields nothing (empty or None) as an empty page,
    # so one such page cannot abort the whole extraction with a TypeError.
    return ''.join((page.extract_text() or '') + '\n' for page in reader.pages)

# Usage
# Forward slash instead of "data\DSM-5.pdf": "\D" is an invalid escape
# sequence (Python warns about it) and a backslash separator is Windows-only.
raw_text = extract_text_from_pdf("data/DSM-5.pdf")

# Save to file
with open("raw.txt", "w", encoding="utf-8") as f:
    f.write(raw_text)


  raw_text = extract_text_from_pdf("data\DSM-5.pdf")
unknown widths : 
[0, IndirectObject(5443, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(5447, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(5451, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(3779, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(3832, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(3782, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(3841, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(3785, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(3832, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(3788, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(3841, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(3852, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(3852, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(3832, 0, 2190842236896)]
unknown widths : 
[0, IndirectObject(3860, 0, 2190842236896)]
unknown widths : 

In [3]:
def clean_text(text):
    """Normalize extracted text with three regex passes, then trim it.

    The passes run in this order:
      1. runs of newlines become a single newline,
      2. runs of non-ASCII characters become a single space,
      3. runs of two or more whitespace characters become a single space.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    substitutions = (
        (r'\n+', '\n'),             # collapse repeated newlines
        (r'[^\x00-\x7F]+', ' '),    # replace non-ASCII runs
        (r'\s{2,}', ' '),           # collapse repeated whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Usage: reload the extracted text from disk, clean it, and persist the result.
with open("raw.txt", "r", encoding="utf-8") as raw_file:
    raw_text = raw_file.read()

cleaned_text = clean_text(raw_text)

with open("cleaned.txt", "w", encoding="utf-8") as cleaned_file:
    cleaned_file.write(cleaned_text)


In [4]:
def chunk_text(text, min_words=200, max_words=500):
    """Split text into consecutive chunks of whitespace-separated words.

    Every chunk holds ``max_words`` words except possibly the last one. A
    final chunk shorter than ``min_words`` is appended to the previous chunk
    instead of being discarded, so no words are lost. If the whole text is
    shorter than ``min_words`` it is returned as a single chunk.

    Args:
        text: The text to split.
        min_words: Smallest size allowed for a trailing chunk of its own.
        max_words: Number of words per regular chunk; must be at least 1.

    Returns:
        A list of chunk strings with words joined by single spaces. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    if max_words < 1:
        # A non-positive step would never advance through the word list.
        raise ValueError("max_words must be at least 1")

    words = text.split()
    pieces = [
        words[start:start + max_words]
        for start in range(0, len(words), max_words)
    ]

    # Only the last piece can be short; fold it into its predecessor rather
    # than dropping its words.
    if len(pieces) > 1 and len(pieces[-1]) < min_words:
        tail = pieces.pop()
        pieces[-1].extend(tail)

    return [" ".join(piece) for piece in pieces]

# Usage: split the cleaned text into chunks.
chunks = chunk_text(cleaned_text)

# Optionally dump the chunks to a text file, each under a numbered header
# (numbering starts at 1) and followed by a blank line.
with open("chunks.txt", "w", encoding="utf-8") as chunk_file:
    chunk_file.writelines(
        f"--- Chunk {i+1} ---\n{chunk}\n\n" for i, chunk in enumerate(chunks)
    )


In [5]:
import numpy as np  # repeats the import from the first cell

# Load the sentence-embedding model and encode every chunk.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embed_model.encode(
    chunks,
    show_progress_bar=True,
)

# Optional: persist the embeddings so they can be reloaded from disk.
np.save("embeddings.npy", embeddings)

Batches: 100%|██████████| 30/30 [00:18<00:00,  1.60it/s]


In [6]:
# Reload the saved embeddings from disk and cast them to float32, the dtype
# FAISS requires.
embeddings = np.array(np.load("embeddings.npy")).astype('float32')

# Build a flat index that compares vectors by L2 (Euclidean) distance; its
# dimensionality is the width of the embedding matrix.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Optional: persist the index to disk.
faiss.write_index(index, "faiss.index")

In [7]:
def retrieve_top_chunks(query, model, index, chunks, top_k=10):
    """Return the chunks whose embeddings are nearest to the query.

    Args:
        query: The query string to embed.
        model: Object whose ``encode(list_of_str)`` returns one embedding
            per input string.
        index: Object whose ``search(vectors, k)`` returns a
            ``(distances, indices)`` pair, as a FAISS index does.
        chunks: Sequence of chunk texts; position ``i`` must correspond to
            vector ``i`` in ``index``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks in the order the index returned them. Fewer
        are returned when the index holds fewer than ``top_k`` vectors.
    """
    # Encode as a one-row float32 batch, the layout index.search expects.
    query_batch = np.ascontiguousarray(model.encode([query]), dtype='float32')
    _, indices = index.search(query_batch, top_k)
    # FAISS pads missing results with -1; without the filter chunks[-1]
    # would silently return the last chunk as if it were a match.
    return [chunks[i] for i in indices[0] if i >= 0]

In [None]:
# Read the API key from the GOOGLE_API_KEY environment variable so the secret
# is never stored in the notebook; an unset variable raises KeyError here.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Make sure to use the correct model name according to the API documentation.
model = genai.GenerativeModel("gemini-2.0-flash")

def generate_with_gemini(question, context):
    """Ask the module-level ``model`` to answer a question from a context.

    The question and context are interpolated into a fixed prompt and sent
    via ``model.generate_content``.

    Args:
        question: The user question placed under "Question:" in the prompt.
        context: The reference text placed under "Context:" in the prompt.

    Returns:
        The ``text`` of the model response, or ``None`` if generating or
        reading the response raised an exception (the error is printed).
    """
    prompt = f"""You are a helpful mental health assistant.  You have access to the DSM-5, a comprehensive manual for diagnosing and classifying mental disorders.
    Use this information to provide accurate and relevant answers to questions about mental health conditions, symptoms, and treatments. 
Your responses should be informative, well fromated, if there are any comparasions make sure its in a table, reflecting the latest understanding in the field of mental health.

Answer the following question based on the context below.

Context:
{context}

Question:
{question}

Answer:"""

    # Best-effort call: any failure is reported and signalled with None.
    try:
        response = model.generate_content(prompt)
        reply = response.text
    except Exception as e:
        print(f"Error generating response: {e}")
        return None
    return reply


In [9]:
query = "what is Borderline Personality Disorder (BPD) ?"

# Retrieve the chunks closest to the query and join them, separated by blank
# lines, into the context passed to the model.
similar_chunks = retrieve_top_chunks(query, embed_model, index, chunks)
context = "\n\n".join(similar_chunks)

# generate_with_gemini returns None when the request fails.
answer = generate_with_gemini(query, context)
if answer:
    print(answer)
else:
    print("Failed to generate answer")

Borderline Personality Disorder (BPD) is a complex mental disorder characterized by a pervasive pattern of instability in interpersonal relationships, self-image, and affects, along with marked impulsivity. This pattern typically begins by early adulthood and is present across various contexts.

**Diagnostic Criteria:**

According to the DSM-5, an individual must exhibit five or more of the following criteria to be diagnosed with BPD:

1.  **Frantic efforts to avoid real or imagined abandonment:** These efforts may include impulsive actions such as self-harm or suicidal behaviors.
2.  **A pattern of unstable and intense interpersonal relationships:** Characterized by alternating between extremes of idealization and devaluation.
3.  **Identity disturbance:** Markedly and persistently unstable self-image or sense of self.
4.  **Impulsivity in at least two areas that are potentially self-damaging:** Such as spending, sex, substance abuse, reckless driving, or binge eating.
5.  **Recurrent