In [37]:
from pdfminer.high_level import extract_text
from pdfminer.pdfpage import PDFPage
from tqdm import tqdm
import re

pdf_path = "Data/Psychology2e_WEB.pdf"
page_texts = []

with open(pdf_path, "rb") as f:
    for i, page in enumerate(tqdm(PDFPage.get_pages(f), desc="Extracting pages"), 1):
        text = extract_text(pdf_path, page_numbers=[i - 1])
        text = text.replace("\n\nAccess for free at openstax.org", "")
        page_texts.append((i, text))


def convert_to_markdown_headers(text):
    text = re.sub(r"\bCHAPTER\s+(\d+)\s+(.*)", r"# \1 \2", text, flags=re.IGNORECASE)
    text = re.sub(r"\n(\d{1,2}\.\d+)\s+(.*)", r"\n## \1 \2", text)
    return text

markdown_pages = [(p, convert_to_markdown_headers(t)) for p, t in page_texts]        

Extracting pages: 753it [03:42,  3.38it/s]


In [38]:
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("#", "chapter"), ("##", "section")])
docs_with_meta = []

for page_num, md_text in markdown_pages:
    page_docs = header_splitter.split_text(md_text)
    for d in page_docs:
        d.metadata["page"] = page_num
    docs_with_meta.extend(page_docs)

chunk_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

final_chunks = []
for doc in docs_with_meta:
    chunks = chunk_splitter.create_documents([doc.page_content])
    for chunk in chunks:
        chunk.metadata.update(doc.metadata)
    final_chunks.extend(chunks)


In [51]:
for i, doc in enumerate(final_chunks[40:70]):
    print(f"📄 Page: {doc.metadata.get('page')}")
    print(f"📚 Chapter: {doc.metadata.get('chapter')}")
    print(f"🔖 Section: {doc.metadata.get('section')}")
    print(f"📝 Preview:\n{doc.page_content[:200]}\n---")


📄 Page: 8
📚 Chapter: 5 Sensation and Perception
🔖 Section: 5.6 Gestalt Principles of Perception
📝 Preview:
168  
Key Terms  
Summary  
172  
174  
Review Questions  
175  
Critical Thinking Questions  
178  
Personal Application Questions  
179
---
📄 Page: 8
📚 Chapter: 6 Learning
🔖 Section: None
📝 Preview:
Introduction  
181  
181
---
📄 Page: 8
📚 Chapter: 6 Learning
🔖 Section: 6.1 What Is Learning?
📝 Preview:
182
---
📄 Page: 8
📚 Chapter: 6 Learning
🔖 Section: 6.3 Operant Conditioning
📝 Preview:
183  
192
---
📄 Page: 8
📚 Chapter: 6 Learning
🔖 Section: 6.4 Observational Learning (Modeling)
📝 Preview:
203  
Key Terms  
Summary  
207  
208  
Review Questions  
208  
Critical Thinking Questions  
210  
Personal Application Questions  
211
---
📄 Page: 8
📚 Chapter: 7 Thinking and Intelligence
🔖 Section: None
📝 Preview:
Introduction  
213  
213
---
📄 Page: 8
📚 Chapter: 7 Thinking and Intelligence
🔖 Section: 7.1 What Is Cognition?
📝 Preview:
214
---
📄 Page: 8
📚 Chapter: 7 Thinking and Intellige

In [57]:
print(final_chunks[].page_content)


wrote The Animal Mind: A Textbook of Comparative Psychology, and it was the standard in the field for over 20  
years. In the mid 1890s, Mary Whiton Calkins completed all requirements toward the PhD in psychology, but  
Harvard University refused to award her that degree because she was a woman. She had been taught and  
mentored by William James, who tried and failed to convince Harvard to award her the doctoral degree. Her


In [40]:
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

embedder = SentenceTransformer("all-MiniLM-L6-v2")

pinecone_api_key = "api-key"
index_name = "casml-py"
pc = Pinecone(api_key=pinecone_api_key)

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(index_name)


In [39]:
# Delete the index
# if pc.has_index(index_name):
#     pc.delete_index(index_name)
#     print(f"Index '{index_name}' has been deleted.")
# else:
#     print(f"Index '{index_name}' does not exist.")

Index 'casml-py' has been deleted.


In [41]:
from tqdm import tqdm

batch_size = 100
for i in tqdm(range(0, len(final_chunks), batch_size), desc="Uploading to Pinecone"):
    batch = final_chunks[i:i + batch_size]
    texts = [doc.page_content for doc in batch]
    ids = [f"chunk-{i+j}" for j in range(len(batch))]
    vectors = embedder.encode(texts).tolist()
    to_upsert = [
        {
            "id": ids[j],
            "values": vectors[j],
            "metadata": {
                **batch[j].metadata,
                "text": texts[j]  # ✅ Now included for query results
            }
        }
        for j in range(len(batch))
    ]
    index.upsert(vectors=to_upsert)


Uploading to Pinecone: 100%|██████████| 57/57 [01:05<00:00,  1.15s/it]


In [48]:
# Your search query
query = "What is personality?"
query_vector = embedder.encode([query]).tolist()

# Query Pinecone
results = index.query(vector=query_vector[0], top_k=5, include_metadata=True)

# Display the results
for match in results["matches"]:
    print(f"\n Score: {match['score']:.4f}")
    print(f"Page: {match['metadata'].get('page')}")
    print(f"Chapter: {match['metadata'].get('chapter')}")
    print(f"Section: {match['metadata'].get('section')}")
    print(f"\nText:\n{match['metadata']['text'][:500]}...")



 Score: 0.6910
Page: 372.0
Chapter: None
Section: 11.1 What Is Personality?

Text:
LEARNING OBJECTIVES  
By the end of this section, you will be able to:  
• Define personality  
• Describe early theories about personality development  
Personality refers to the long-standing traits and patterns that propel individuals to consistently think, feel,  
and behave in specific ways. Our personality is what makes us unique individuals. Each person has an  
idiosyncratic pattern of enduring, long-term characteristics and a manner in which they interact with other...

 Score: 0.6794
Page: 594.0
Chapter: None
Section: 15.11 Personality Disorders

Text:
that are important in the etiology of both  
The term personalityrefers loosely to one’s stable, consistent, and distinctive way of thinking about, feeling,  
acting, and relating to the world. People with personality disorders exhibit a personality style that differs  
markedly from the expectations of their culture, is pervasive and inflexible

In [47]:
final_chunks[67]

Document(metadata={'chapter': '11 Personality', 'page': 9}, page_content='Introduction  \n359  \n359')