In [1]:
# Relative path to the PDF that every parsing cell in this notebook reads.
# NOTE(review): the file name is spelled "SmartScriblle" while the document
# title in the parsed output is "SmartScribble" — confirm it matches the file on disk.
pdf_path = "../SmartScriblle.pdf"

## Document Parsing Using PyMuPDF4LLM

In [12]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters  import MarkdownHeaderTextSplitter
import pymupdf4llm

# Render the whole PDF as one Markdown string
md_text = pymupdf4llm.to_markdown(pdf_path)

# (markdown prefix, metadata key) pairs, one per heading level to split on
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(md_text)

# Show a 200-character preview and the header metadata of every section
divider = "-" * 50
for index, chunk in enumerate(md_header_splits):
    preview = chunk.page_content[:200]
    print(
        f"Section {index}:",
        f"Content: {preview}...",
        f"Metadata: {chunk.metadata}",
        divider,
        sep="\n",
    )

Consider using the pymupdf_layout package for a greatly improved page layout analysis.
Section 0:
Content: **"The Renaissance of Writing in the Age of Artificial Intelligence"**...
Metadata: {'Header 1': 'The SmartScribble AI Notebook'}
--------------------------------------------------
Section 1:
Content: In an era dominated by glowing screens, constant notifications, and digital noise, the act of deep thinking has become a  
lost art. While digital tools offer speed and connectivity, they often lack t...
Metadata: {'Header 1': 'The SmartScribble AI Notebook', 'Header 2': 'Section 1: Executive Summary & Philosophy', 'Header 3': 'The Vision'}
--------------------------------------------------
Section 2:
Content: 1. **Focus:** A distraction-free environment using E-Ink technology.  
2. **Intelligence:** On-board and Cloud-based AI that understands, organizes, and expands upon user input.  
3. **Integration:** ...
Metadata: {'Header 1': 'The SmartScribble AI Notebook', 'Header 2': 'Secti

In [13]:
# Dump the full Markdown produced by pymupdf4llm so the heading structure
# the splitter works from can be inspected by eye.
print(md_text)

# The SmartScribble AI Notebook

**"The Renaissance of Writing in the Age of Artificial Intelligence"**

## Section 1: Executive Summary & Philosophy

### The Vision


In an era dominated by glowing screens, constant notifications, and digital noise, the act of deep thinking has become a


lost art. While digital tools offer speed and connectivity, they often lack the cognitive benefits of handwriting—the tactile


connection that helps the brain synthesize information and retain memory. Conversely, traditional paper notebooks, while


conducive to focus, are disconnected 'data islands'—unsearchable, shareable only by scanning, and vulnerable to loss.


Enter the **SmartScribble AI Notebook** .


SmartScribble acts as the bridge between the analog and digital worlds. It is not a tablet trying to be a computer; it is a


notebook supercharged with the limitless potential of Artificial Intelligence. It preserves the friction and flow of pen on


paper while seamlessly integrating the con

## Document Parsing Using Azure AI Document Intelligence

In [None]:
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

import os

# Reuse the notebook-wide PDF path defined in the first cell.
file_path = pdf_path

# Read the credentials from the environment so real secrets never have to be
# typed into (and accidentally committed with) the notebook. The original
# placeholders remain as fallbacks, so behaviour is unchanged when the
# variables are not set.
endpoint = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", "<endpoint>")
key = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_KEY", "<key>")

# Analyse the PDF with the "prebuilt-layout" Document Intelligence model.
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=endpoint,
    api_key=key,
    file_path=file_path,
    api_model="prebuilt-layout",
)

# Calls the Azure service; `documents` is consumed by the cells below.
documents = loader.load()

In [18]:
# Heading levels to split on, paired with the metadata key each one fills in
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Take the Markdown from the Document's `page_content`, the standard field for
# the extracted text, instead of digging it out of the raw service response
# that some loader versions copy into `metadata`.
md_header_splits = markdown_splitter.split_text(documents[0].page_content)

# Display a 200-character preview and the header metadata of each section
for i, section in enumerate(md_header_splits):
    print(f"Section {i}:")
    print(f"Content: {section.page_content[:200]}...")
    print(f"Metadata: {section.metadata}")
    print("-" * 50)

Section 0:
Content: "The Renaissance of Writing in the Age of Artificial Intelligence"...
Metadata: {'Header 1': 'The SmartScribble AI Notebook'}
--------------------------------------------------
Section 1:
Content: In an era dominated by glowing screens, constant notifications, and digital noise, the act of deep thinking has become a
lost art. While digital tools offer speed and connectivity, they often lack the...
Metadata: {'Header 1': 'Section 1: Executive Summary & Philosophy', 'Header 2': 'The Vision'}
--------------------------------------------------
Section 2:
Content: 1\. Focus: A distraction-free environment using E-Ink technology.  
2\. Intelligence: On-board and Cloud-based AI that understands, organizes, and expands upon user input.  
3\. Integration: An agnost...
Metadata: {'Header 1': 'Section 1: Executive Summary & Philosophy', 'Header 2': 'Core Value Proposition'}
--------------------------------------------------
Section 3:
Content: The Display: Paper Perfected  
Th

In [10]:
# Dump the full Markdown stored under the 'content' key of the first loaded
# document's metadata so its heading structure can be inspected by eye.
print(documents[0].metadata['content'])

# The SmartScribble AI Notebook

"The Renaissance of Writing in the Age of Artificial Intelligence"


# Section 1: Executive Summary & Philosophy


## The Vision

In an era dominated by glowing screens, constant notifications, and digital noise, the act of deep thinking has become a
lost art. While digital tools offer speed and connectivity, they often lack the cognitive benefits of handwriting-the tactile
connection that helps the brain synthesize information and retain memory. Conversely, traditional paper notebooks, while
conducive to focus, are disconnected 'data islands'-unsearchable, shareable only by scanning, and vulnerable to loss.

Enter the SmartScribble AI Notebook.

SmartScribble acts as the bridge between the analog and digital worlds. It is not a tablet trying to be a computer; it is a
notebook supercharged with the limitless potential of Artificial Intelligence. It preserves the friction and flow of pen on
paper while seamlessly integrating the content into the user's d

## Embedding Setup

In [None]:
import os

# Azure OpenAI connection settings, exported as environment variables for this
# kernel session. The '<endpoint>' and '<key>' values appear to be redacted
# placeholders — substitute real values before running.
os.environ.update(
    {
        'AZURE_OPENAI_ENDPOINT': '<endpoint>',
        'AZURE_OPENAI_API_EMBEDDINGS_DEPLOYMENT_NAME': 'text-embedding-3-large',
        'AZURE_OPENAI_API_KEY': '<key>',
        'AZURE_OPENAI_API_VERSION': "2024-02-01",
    }
)


from langchain_openai.embeddings.azure import AzureOpenAIEmbeddings

# Build the embeddings client from the settings exported earlier in this cell.
# The deployment name and API version are passed explicitly: langchain-openai
# takes its default API version from OPENAI_API_VERSION, not from
# AZURE_OPENAI_API_VERSION, so without `api_version=` the "2024-02-01" value
# configured above would be silently ignored.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.environ['AZURE_OPENAI_API_EMBEDDINGS_DEPLOYMENT_NAME'],
    model="text-embedding-3-large",
    api_version=os.environ['AZURE_OPENAI_API_VERSION'],
)

# Smoke test: embed one query to confirm the deployment is reachable.
query_embedding = embeddings.embed_query("What is the capital of France?")

# Show the vector length and a short preview instead of dumping every value.
len(query_embedding), query_embedding[:5]