In [1]:
# Source PDF to ingest; a relative path, so it resolves against the notebook's working directory.
pdf_path = "../SmartScriblle.pdf"

In [None]:
## PDF -> Vector DB pipeline
# 1. Load the document
# 2. Chunk the document
# 3. Create embeddings
# 4. Store the embeddings in the vector DB

## Document Parsing Using PyMuPDF4LLM

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter
import pymupdf4llm

# Convert the PDF into Markdown text so its headings can drive the split below.
md_text = pymupdf4llm.to_markdown(pdf_path)

# Header markers to split on, each paired with the metadata key its title is recorded under:
# ("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3").
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 4)]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(md_text)

# Preview each section: the first 200 characters of its text, then its header metadata.
for i, section in enumerate(md_header_splits):
    print(
        f"Section {i}:",
        f"Content: {section.page_content[:200]}...",
        f"Metadata: {section.metadata}",
        "-" * 50,
        sep="\n",
    )

Consider using the pymupdf_layout package for a greatly improved page layout analysis.
Section 0:
Content: **"The Renaissance of Writing in the Age of Artificial Intelligence"**...
Metadata: {'Header 1': 'The SmartScribble AI Notebook'}
--------------------------------------------------
Section 1:
Content: In an era dominated by glowing screens, constant notifications, and digital noise, the act of deep thinking has become a  
lost art. While digital tools offer speed and connectivity, they often lack t...
Metadata: {'Header 1': 'The SmartScribble AI Notebook', 'Header 2': 'Section 1: Executive Summary & Philosophy', 'Header 3': 'The Vision'}
--------------------------------------------------
Section 2:
Content: 1. **Focus:** A distraction-free environment using E-Ink technology.  
2. **Intelligence:** On-board and Cloud-based AI that understands, organizes, and expands upon user input.  
3. **Integration:** ...
Metadata: {'Header 1': 'The SmartScribble AI Notebook', 'Header 2': 'Secti

In [23]:
# Print the full Markdown produced by pymupdf4llm to inspect the heading levels the splitter sees.
print(md_text)

# The SmartScribble AI Notebook

**"The Renaissance of Writing in the Age of Artificial Intelligence"**

## Section 1: Executive Summary & Philosophy

### The Vision


In an era dominated by glowing screens, constant notifications, and digital noise, the act of deep thinking has become a


lost art. While digital tools offer speed and connectivity, they often lack the cognitive benefits of handwriting—the tactile


connection that helps the brain synthesize information and retain memory. Conversely, traditional paper notebooks, while


conducive to focus, are disconnected 'data islands'—unsearchable, shareable only by scanning, and vulnerable to loss.


Enter the **SmartScribble AI Notebook** .


SmartScribble acts as the bridge between the analog and digital worlds. It is not a tablet trying to be a computer; it is a


notebook supercharged with the limitless potential of Artificial Intelligence. It preserves the friction and flow of pen on


paper while seamlessly integrating the con

## Document Parsing Using Azure AI Document Intelligence

In [None]:
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

# Inputs for the loader. "<endpoint>" and "<key>" are placeholders to be filled in
# before running; the same PDF as in the earlier cells is parsed.
file_path = pdf_path
endpoint = "<endpoint>"
key = "<key>"

# Build the loader against the "prebuilt-layout" model and run it.
loader = AzureAIDocumentIntelligenceLoader(
    file_path=file_path,
    api_endpoint=endpoint,
    api_key=key,
    api_model="prebuilt-layout",
)
documents = loader.load()

In [18]:
# Header markers to split on, each paired with the metadata key its title is recorded under:
# ("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3").
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 4)]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
# Split the text stored under the first loaded document's 'content' metadata key.
md_header_splits = markdown_splitter.split_text(documents[0].metadata['content'])

# Preview each section: the first 200 characters of its text, then its header metadata.
for i, section in enumerate(md_header_splits):
    print(
        f"Section {i}:",
        f"Content: {section.page_content[:200]}...",
        f"Metadata: {section.metadata}",
        "-" * 50,
        sep="\n",
    )

Section 0:
Content: "The Renaissance of Writing in the Age of Artificial Intelligence"...
Metadata: {'Header 1': 'The SmartScribble AI Notebook'}
--------------------------------------------------
Section 1:
Content: In an era dominated by glowing screens, constant notifications, and digital noise, the act of deep thinking has become a
lost art. While digital tools offer speed and connectivity, they often lack the...
Metadata: {'Header 1': 'Section 1: Executive Summary & Philosophy', 'Header 2': 'The Vision'}
--------------------------------------------------
Section 2:
Content: 1\. Focus: A distraction-free environment using E-Ink technology.  
2\. Intelligence: On-board and Cloud-based AI that understands, organizes, and expands upon user input.  
3\. Integration: An agnost...
Metadata: {'Header 1': 'Section 1: Executive Summary & Philosophy', 'Header 2': 'Core Value Proposition'}
--------------------------------------------------
Section 3:
Content: The Display: Paper Perfected  
Th

In [10]:
# Print the text stored under the 'content' metadata key of the first loaded document
# (the same string that is passed to the Markdown header splitter).
print(documents[0].metadata['content'])

# The SmartScribble AI Notebook

"The Renaissance of Writing in the Age of Artificial Intelligence"


# Section 1: Executive Summary & Philosophy


## The Vision

In an era dominated by glowing screens, constant notifications, and digital noise, the act of deep thinking has become a
lost art. While digital tools offer speed and connectivity, they often lack the cognitive benefits of handwriting-the tactile
connection that helps the brain synthesize information and retain memory. Conversely, traditional paper notebooks, while
conducive to focus, are disconnected 'data islands'-unsearchable, shareable only by scanning, and vulnerable to loss.

Enter the SmartScribble AI Notebook.

SmartScribble acts as the bridge between the analog and digital worlds. It is not a tablet trying to be a computer; it is a
notebook supercharged with the limitless potential of Artificial Intelligence. It preserves the friction and flow of pen on
paper while seamlessly integrating the content into the user's d

## Embedding Setup

In [None]:
import os

# Azure OpenAI settings, exported as environment variables.
# '<endpoint>' and '<key>' are placeholders: fill them in locally and never commit real values.
# NOTE(review): langchain_openai's Azure clients are expected to pick up the endpoint, key
# and API version from these variables — confirm against the library docs.
os.environ['AZURE_OPENAI_ENDPOINT'] = '<endpoint>'
os.environ['AZURE_OPENAI_API_EMBEDDINGS_DEPLOYMENT_NAME'] = 'text-embedding-3-large'
os.environ['AZURE_OPENAI_API_KEY'] = '<key>'
os.environ['OPENAI_API_VERSION'] = "2024-02-01"


from langchain_openai.embeddings.azure import AzureOpenAIEmbeddings

# The deployment name is read back from the variable set above so it is defined in
# exactly one place; `model` names the underlying embedding model.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.environ['AZURE_OPENAI_API_EMBEDDINGS_DEPLOYMENT_NAME'],
    model="text-embedding-3-large",
)

# Smoke test: embed one sample query and show only the vector's dimensionality
# instead of dumping every component into the cell output.
query_embedding = embeddings.embed_query("What is the capital of France?")
len(query_embedding)

In [32]:
# Display the embedding vector computed for the sample query (one float per dimension).
query_embedding

[-0.05563553050160408,
 0.040865033864974976,
 -0.01207282766699791,
 0.042547229677438736,
 -0.05645611509680748,
 -0.02736644446849823,
 0.02299683913588524,
 -0.01158047839999199,
 -0.02693563885986805,
 -0.0086827939376235,
 -0.002689973684027791,
 0.00037919910391792655,
 -0.02646380290389061,
 -0.01828874461352825,
 0.012298488058149815,
 0.0007539106882177293,
 -0.005333789624273777,
 0.016534747555851936,
 -1.1108722901553847e-05,
 0.007718609180301428,
 0.021047955378890038,
 -0.008457133546471596,
 -0.019447818398475647,
 -0.039018724113702774,
 0.0004445893282536417,
 0.008908454328775406,
 0.03854689002037048,
 -0.0090930862352252,
 0.03532610088586807,
 0.02872040681540966,
 0.07360629737377167,
 0.004159330390393734,
 0.01738610304892063,
 -0.017416875809431076,
 -0.044434573501348495,
 -0.00449012778699398,
 0.015078213065862656,
 0.01529361680150032,
 0.02523292787373066,
 0.006554407067596912,
 0.009780324064195156,
 -0.012083085253834724,
 0.007113429252058268,
 0.021

In [25]:
from langchain_postgres import PGVector

# Target database: a Postgres instance with pgvector enabled (launched via the docker
# command referenced earlier). The "+psycopg" driver suffix means psycopg3 is used.
connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"
collection_name = "smart_scribble_docs"

# Vector store bound to the collection above, embedding with the Azure OpenAI client.
vector_store = PGVector(
    connection=connection,
    collection_name=collection_name,
    embeddings=embeddings,
    use_jsonb=True,
)

In [30]:
# Number of sections produced by the header split (the documents about to be indexed).
len(md_header_splits)

22

In [31]:
# Index every section, giving each an explicit id equal to its position (0..n-1).
# `list(range(...))` replaces a comprehension that shadowed the builtin `id`.
# NOTE(review): the vector store API annotates ids as strings — confirm integer ids
# behave as intended for later lookups/deletes.
vector_store.add_documents(md_header_splits, ids=list(range(len(md_header_splits))))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [38]:
# Question to answer from the indexed document.
user_query = 'give me the technical specifications'

# Wrap the vector store as a retriever limited to 3 results, then fetch the chunks for the question.
retriever = vector_store.as_retriever(search_kwargs=dict(k=3))
chunks = retriever.invoke(user_query)

In [43]:
# Join the retrieved chunk texts with single spaces and append them to the user's question.
retrieved_context = ' '.join(chunk.page_content for chunk in chunks)
prompt = f"User Query - {user_query} context : {retrieved_context}"

In [50]:
from langchain_openai import AzureChatOpenAI
from langchain.messages import HumanMessage, SystemMessage

# API version for the Azure OpenAI client, set through the environment.
os.environ['OPENAI_API_VERSION'] = "2024-02-01"

# Chat model client. Replace the deployment/model names with the ones from your own Azure resource.
llm = AzureChatOpenAI(
    model="gpt-4.1-mini",
    azure_deployment="gpt-4.1-mini",
    max_tokens=1000,
    temperature=0.7,
)

# The system message carries the grounding instruction; the human message carries the
# question together with the retrieved context assembled in `prompt`.
system_message = SystemMessage(
    content="You are a helpful assistant that provides accurate information based on the provided context, limit yourself to only the requested user queries response"
)
human_message = HumanMessage(content=prompt)

# Send both messages in order and keep the model's reply.
ai_response = llm.invoke([system_message, human_message])

In [51]:
# Print only the text content of the model's reply.
print(ai_response.content)

Here are the technical specifications for the product:

- Display: 10.3-inch CanvasInk™ E-Ink Display (Monochrome)  
- Resolution: 1872 × 1404 (227 DPI)  
- Processor: Quad-Core 1.8GHz ARM Cortex Processor  
- Storage: 64GB Internal Storage (Non-expandable)  
- RAM: 4GB LPDDR4X  
- Connectivity: Wi-Fi (2.4GHz + 5GHz), Bluetooth 5.0, USB-C  
- Battery: 3000mAh Li-ion (Up to 2 weeks standby, 10 hours active writing)  
- Stylus: Wacom EMR Technology (Battery-free), 4096 Pressure Levels  
- Dimensions: 230mm × 190mm × 5.8mm  
- Weight: 395g  
- Operating System: SmartOS (Based on Linux Kernel)  
- File Formats Supported: PDF, EPUB, TXT, PNG, SVG  
- Chassis: Machined aerospace-grade aluminum alloy  
- Thickness: 5.8mm, thinner than a typical smartphone  
- Weight: Under 400g, comfortable for one-hand use for extended sessions
