In [0]:
%run ./00_setup_environment.ipynb/


In [0]:
# Databricks notebook: Load, Chunk, and Save Manuals

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
import os

In [0]:
# Function to load and extract text from PDF
def load_pdf_pages(pdf_path):
    """Extract the text of every useful page in a PDF.

    Each page of the PDF at ``pdf_path`` is read through ``PdfReader``. A page
    is kept only when text could be extracted from it and ``is_useful_page``
    accepts that text. Every rejected page is reported on stdout together
    with its 1-based page number.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        list: Extracted text of the kept pages, in document order.
    """
    kept_texts = []
    for page_number, pdf_page in enumerate(PdfReader(pdf_path).pages, start=1):
        extracted = pdf_page.extract_text()
        # Pages with no extractable text are rejected (and reported) exactly
        # like pages that fail the usefulness heuristic.
        if not extracted or not is_useful_page(extracted):
            print(f"Skipping page {page_number}: likely TOC or not useful")
            continue
        kept_texts.append(extracted)
    return kept_texts

def is_useful_page(text):
    """Decide whether a page's text is worth keeping.

    A page is rejected when any of the following holds:
      * it contains "table of contents" or "index" (case-insensitive,
        matched as plain substrings);
      * its text, stripped of surrounding whitespace, is 400 characters
        or shorter;
      * the sequence ". . ." occurs five or more times (dot leaders, which
        often indicate a table of contents or an index).

    Args:
        text (str): Extracted text of a single page.

    Returns:
        bool: True when the page passes every check, False otherwise.
    """
    lowered = text.lower()

    # Keyword check: explicit TOC / index markers.
    if "table of contents" in lowered or "index" in lowered:
        return False

    # Length check: very short pages carry too little content.
    if len(text.strip()) <= 400:
        return False

    # Dot-leader check: many ". . ." runs look like a TOC or index listing.
    return text.count(". . .") < 5


# Function to chunk text
def chunk_useful_texts(pages, language, source, chunk_size=1000, chunk_overlap=200):
    """Split useful pages into overlapping text chunks tagged with metadata.

    Pages rejected by ``is_useful_page`` are dropped first. The remaining
    pages are split with a ``RecursiveCharacterTextSplitter``; each page is
    split on its own, so a chunk never spans two pages.

    Args:
        pages: Iterable of page texts (strings).
        language: Label stored under the ``"language"`` key of every chunk.
        source: Label stored under the ``"source"`` key of every chunk.
        chunk_size: Maximum chunk size handed to the splitter. Defaults to
            1000, the value that used to be hard-coded.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 200, the value that used to be hard-coded.

    Returns:
        list[dict]: One ``{"text", "language", "source"}`` dict per chunk, in
        page order and then chunk order. Empty when no page is useful.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    useful_pages = [page for page in pages if is_useful_page(page)]

    # create_documents processes the texts one after another, so a single call
    # yields the same documents, in the same order, as one call per page.
    docs = splitter.create_documents(useful_pages)

    return [
        {"text": doc.page_content, "language": language, "source": source}
        for doc in docs
    ]


# Locations of the source manuals.
# NOTE(review): the base directory sits inside one user's workspace folder;
# anyone running this from a different checkout has to adjust it.
manual_base = "/Workspace/Users/pbmarihal3929@gmail.com/GenAI_CarManual_Chatbot_Repo/data"
english_path = "/".join((manual_base, "a4-2025-owners-manual.pdf"))
german_path = "/".join((manual_base, "a4-2025-betriebsanleitung.pdf"))


In [0]:
# Extract the useful pages of each manual.
english_pages = load_pdf_pages(english_path)
german_pages = load_pdf_pages(german_path)

# Chunk each manual and tag every chunk with its language and source file.
# The source name is derived from the path so the two cannot drift apart.
english_chunks = chunk_useful_texts(english_pages, "English", os.path.basename(english_path))
german_chunks = chunk_useful_texts(german_pages, "German", os.path.basename(german_path))

# Combine and create Spark DataFrame
all_chunks = english_chunks + german_chunks
schema = StructType([
    StructField("text", StringType(), True),
    StructField("language", StringType(), True),
    StructField("source", StringType(), True)
])
# `spark` is not created in the cells shown here — presumably it is provided
# by the Databricks runtime or by the setup notebook run at the top; verify.
df = spark.createDataFrame(all_chunks, schema=schema)

# Save to Unity Catalog volume (or use DBFS if needed)
output_path = "/Volumes/genai_catalog/car_manuals/manual_chunks/chunks_delta"
df.write.mode("overwrite").format("delta").save(output_path)

# The DataFrame holds exactly one row per entry of all_chunks, so report the
# list length instead of launching another Spark job with df.count().
print(f"Saved {len(all_chunks)} chunks to: {output_path}")
