In [0]:
%run ./00_setup_environment.ipynb

In [0]:
# Databricks notebook: Load, Chunk, and Save Manuals

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
import os

In [0]:
# Function to load and extract text from PDF
def load_pdf_text(pdf_path):
    """Return the text of every page of a PDF as one newline-joined string.

    Args:
        pdf_path: Value handed to ``PdfReader`` to open the document.

    Returns:
        The extracted text of each page, in page order, joined with a
        newline character. Pages whose extraction yields an empty or
        otherwise falsy result are skipped.
    """
    reader = PdfReader(pdf_path)
    # Extract each page exactly once: extraction is the expensive step, and
    # calling it in both the filter and the value position doubles the work.
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(text for text in page_texts if text)

# Function to chunk text
def chunk_text(text, language, source, chunk_size=500, chunk_overlap=50):
    """Split text into chunks, each tagged with its language and source.

    Args:
        text: The text to split.
        language: Value stored under the "language" key of every chunk.
        source: Value stored under the "source" key of every chunk.
        chunk_size: ``chunk_size`` passed to RecursiveCharacterTextSplitter.
            Defaults to 500, the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to
            RecursiveCharacterTextSplitter. Defaults to 50, the value that
            was previously hard-coded.

    Returns:
        A list of dicts with the keys "text", "language" and "source", one
        per document produced by the splitter, in the splitter's order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    documents = splitter.create_documents([text])
    return [
        {"text": doc.page_content, "language": language, "source": source}
        for doc in documents
    ]

# Locations of the two source manuals inside the workspace repo
manual_base = "/Workspace/Users/pbmarihal3929@gmail.com/GenAI_CarManual_Chatbot_Repo/manuals"
english_path = "/".join((manual_base, "a4-2025-owners-manual.pdf"))
german_path = "/".join((manual_base, "a4-2025-betriebsanleitung.pdf"))


In [0]:

# Load and chunk. The "source" label is derived from each path's file name
# so it cannot drift from the file that was actually read.
english_chunks = chunk_text(load_pdf_text(english_path), "English", os.path.basename(english_path))
german_chunks = chunk_text(load_pdf_text(german_path), "German", os.path.basename(german_path))

# Combine and create Spark DataFrame
all_chunks = english_chunks + german_chunks
schema = StructType([
    StructField("text", StringType(), True),
    StructField("language", StringType(), True),
    StructField("source", StringType(), True)
])
df = spark.createDataFrame(all_chunks, schema=schema)

# Save to Unity Catalog volume (or use DBFS if needed).
# mode("overwrite") replaces whatever is already stored at output_path.
output_path = "/Volumes/genai_catalog/car_manuals/manual_chunks/chunks_delta"
df.write.mode("overwrite").format("delta").save(output_path)

# The DataFrame holds one row per entry of all_chunks, so report the count
# from the local list instead of launching another Spark job via df.count().
print(f"Saved {len(all_chunks)} chunks to: {output_path}")
