In [1]:
# Python - 3.13.7
# This file contains code for parsing, preprocessing, chunking and loading netsuite pdfs in vector databases

In [2]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_milvus import Milvus
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma

In [3]:
folder_path = "./Netsuite_pdfs/"

In [4]:
# Parsing pdfs, not needed if parsed_docs.json present

# loader = DirectoryLoader(
#     path=folder_path,
#     glob='*.pdf',
#     loader_cls=PyPDFLoader
# )

# docs = loader.lazy_load()

# pages = []
# async for page in loader.alazy_load():
#     pages.append(page)

In [5]:
# Functions for saving and loading parsed documents

import json


def save_documents(docs, filename="docs.json"):
    # Convert each Document object into a dictionary
    # containing its text content and metadata
    data = [{"page_content": d.page_content, "metadata": d.metadata}
            for d in docs]

    # Save the list of dictionaries into a JSON file
    with open(filename, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps Unicode characters readable
        # indent=2 makes the JSON human-readable
        json.dump(data, f, ensure_ascii=False, indent=2)


def load_documents(filename="docs.json"):
    from langchain.schema import Document
    import json

    # Load the JSON file back into a list of dictionaries
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Convert dictionaries back into Document objects
    return [Document(page_content=d["page_content"], metadata=d["metadata"]) for d in data]

In [6]:
# not needed if parsed_docs.json present

# save_documents(pages, "parsed_docs.json")

In [7]:
pages = load_documents("parsed_docs.json")

## Preprocessing

In [8]:
import re
import ftfy
from cleantext import clean
from typing import List, Dict
from langchain.schema import Document
import hashlib

In [9]:
def preprocess_text(text: str) -> str:
    """
    Clean and normalize PDF text.

    Steps:
    - Fix broken Unicode characters (using ftfy).
    - Clean text using the `clean` function:
        * Normalize Unicode
        * Remove URLs, emails, phone numbers
        * Keep line breaks
        * Replace sensitive info with placeholders
    - Fix words split across lines with hyphen + newline.
    - Collapse multiple spaces/tabs into a single space.
    - Limit multiple newlines to at most 2 (preserve paragraphs).
    - Normalize quotes and dashes.

    Args:
        text (str): Raw extracted PDF text.

    Returns:
        tuple:
            str: Cleaned text.
            bool: True if Unicode was fixed, False otherwise.
    """
    fixed = ftfy.fix_text(text)  # fix broken unicode

    # General cleaning (remove URLs, emails, phone numbers, etc.)
    text = clean(
        fixed,
        fix_unicode=True,
        to_ascii=False,
        lower=False,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_line_breaks=False,            # preserve line breaks
        replace_with_email="<EMAIL>",    # replace emails with placeholder
        replace_with_phone_number="<PHONE>",  # replace phone numbers
        replace_with_url="<URL>",        # replace URLs
        lang="en"
    )

    # Fix words split across lines with a hyphen + newline
    text = re.sub(r'-\s*\n\s*', '-', text)

    # Collapse multiple spaces/tabs into one space
    text = re.sub(r'[ \t]+', ' ', text)

    # Normalize multiple newlines -> at most 2 (preserve paragraphs)
    text = re.sub(r'\n\s*\n+', '\n\n', text)

    # Normalize quotes and dashes for consistency
    text = text.replace("“", '"').replace(
        "”", '"').replace("’", "'").replace("–", "-")

    return text.strip(), (fixed != text)  # return also if unicode was fixed

In [10]:
def get_fingerprint(text: str) -> str:
    """
    Generate a fingerprint for a given text using SHA256 hashing.
    This ensures duplicates are detected across full content.
    """
    # Normalize by lowercasing
    normalized = text.lower().encode("utf-8")
    # Hash entire content
    return hashlib.sha256(normalized).hexdigest()

In [None]:
def clean_documents(pages: List[Document]):
    """
    Clean and validate a list of Document objects.

    Steps performed:
    - Track statistics (original count, short docs, duplicates, unicode fixes, 
      character counts before/after cleaning).
    - Preprocess each document's text (normalize, clean, fix unicode).
    - Count short documents (<50 chars).
    - Skip duplicates (based on fingerprint).
    - Count how many documents had Unicode fixes.
    - Return cleaned documents with updated metadata and stats.

    Args:
        pages (List[Document]): List of Document objects to clean.

    Returns:
        tuple:
            - List[Document]: Cleaned documents with updated metadata.
            - dict: Statistics about the cleaning process.
    """
    stats = {
        "original_docs": len(pages),  # total input documents
        "short_docs": 0,              # count of docs with < 50 characters
        "duplicates": 0,              # count of removed duplicate docs
        "unicode_fixes": 0,           # how many docs had unicode issues fixed
        "chars_before": 0,            # total characters before cleaning
        "chars_after": 0,             # total characters after cleaning
    }

    seen = set()         # store fingerprints of processed documents
    cleaned_pages = []   # store cleaned Document objects

    for page in pages:
        orig = page.page_content
        stats["chars_before"] += len(orig)  # count original characters

        # Clean and normalize the text
        cleaned, unicode_fixed = preprocess_text(orig)
        stats["chars_after"] += len(cleaned)  # count cleaned characters

        # Validation: check if the document is too short
        is_short = len(cleaned) < 50
        if is_short:
            stats["short_docs"] += 1

        # Check for duplicates using fingerprint
        fp = get_fingerprint(cleaned)
        is_duplicate = fp in seen
        if is_duplicate:
            stats["duplicates"] += 1
            continue
        else:
            seen.add(fp)

        # Count Unicode fixes
        if unicode_fixed:
            stats["unicode_fixes"] += 1

        # Create cleaned Document object with updated metadata
        cleaned_page = Document(
            page_content=cleaned,
            metadata={**page.metadata,
                      # "is_short": is_short,        # optional flags
                      # "is_duplicate": is_duplicate,
                      "had_unicode_fix": unicode_fixed}
        )
        cleaned_pages.append(cleaned_page)

    return cleaned_pages, stats

In [12]:
def report(stats: Dict, pages: List[Document]):
    """
    Print a summary report of the text preprocessing process.

    Displays:
    - Total number of original documents
    - Number of short documents (<50 chars)
    - Number of duplicates removed
    - Number of documents with Unicode fixes
    - Characters reduced (absolute and percentage)
    - A sample cleaned text and metadata (if available)

    Args:
        stats (Dict): Dictionary containing preprocessing statistics 
                      (produced by `clean_documents`).
        pages (List[Document]): List of cleaned Document objects.
    """
    # Header section
    print("\n" + "="*40)
    print(" TEXT PREPROCESSING REPORT ")
    print("="*40)

    # Core statistics
    print(f"Original documents : {stats['original_docs']:,}")
    print(f"Short docs present : {stats['short_docs']:,}")
    print(f"Duplicates removed : {stats['duplicates']:,}")
    print(f"Unicode fixes      : {stats['unicode_fixes']:,}")

    # Calculate and display character reduction
    chars_removed = stats["chars_before"] - stats["chars_after"]
    reduction_pct = (
        chars_removed / stats["chars_before"] * 100) if stats["chars_before"] > 0 else 0
    print(f"Characters reduced : {chars_removed:,} ({reduction_pct:.1f}%)")

    # Show a sample cleaned document (hardcoded index 62 for debugging/demo)
    if pages:
        print("\nSample cleaned text (with tags):")
        print(f"\nText: \n{pages[62].page_content}")
        print(f"\nMetadata: {pages[62].metadata}")

    # Footer
    print("="*40)

In [13]:
cleaned_pages, stats = clean_documents(pages)

In [15]:
report(stats, cleaned_pages)


 TEXT PREPROCESSING REPORT 
Original documents : 32,388
Short docs present : 170
Duplicates removed : 3,533
Unicode fixes      : 28,638
Characters reduced : 1,172,109 (1.6%)

Sample cleaned text (with tags):

Text: 
Analytics Features
32
Feature Description
KPI Scorecards Add the ability to display a portlet on your dashboard that shows the results of
multiple KPIs for multiple date or period ranges. For more information, see the help
topic KPI Scorecards
Connectivity
SuiteAnalytics Connect Enable the SuiteAnalytics Connect feature to access and query your NetSuite data
using SQL through database standards such as ODBC, JDBC, and ADO.NET. For
more information, see the help topic SuiteAnalytics Connect.
NetSuite Analytics Warehouse Configure and transfer data to the NetSuite Analytics Warehouse. You can use any
data transferred with Oracle Analytics for applications.
Third-party Analytics Integration
Tableau®Workbook Export Enable users to export saved search and report results as Tabl

## Chunking and embedding

In [16]:
# Chunk size set due to sentence-transformers/all-mpnet-base-v2 constraints

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1350, chunk_overlap=150)
texts = text_splitter.split_documents(cleaned_pages)

In [17]:
len(texts)

63103

In [18]:
def merge_short_chunks(chunks, min_length=50):
    """
    Merge chunks shorter than min_length with previous or next chunk
    from the same source PDF.
    """
    if not chunks:
        return []

    merged = []
    i = 0

    while i < len(chunks):
        chunk = chunks[i]

        if len(chunk.page_content.strip()) < min_length:
            merged_flag = False

            # Try merge with previous if same PDF
            if merged and chunk.metadata.get("source") == merged[-1].metadata.get("source"):
                merged[-1].page_content += " " + chunk.page_content.strip()
                merged_flag = True

            # Else, try merge with next if same PDF
            elif i + 1 < len(chunks) and chunk.metadata.get("source") == chunks[i + 1].metadata.get("source"):
                chunks[i + 1].page_content = chunk.page_content.strip() + \
                    " " + chunks[i + 1].page_content
                merged_flag = True

            # If merged with next, skip adding current
            if merged_flag:
                i += 1
                continue

        # Add current chunk if not merged
        merged.append(chunk)
        i += 1

    return merged

In [19]:
texts = merge_short_chunks(texts, min_length=50)

In [20]:
len(texts)

62940

In [2]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2")

  from .autonotebook import tqdm as notebook_tqdm


## Qdrant

In [None]:
# Initialize a local Qdrant client (stored in ./tmp/langchain_qdrant)
client = QdrantClient(path="./tmp/langchain_qdrant")

# Create a new collection in Qdrant to store embeddings
client.create_collection(
    collection_name="demo_collection",          # name of the collection
    vectors_config=VectorParams(
        # embedding vector size (depends on model)
        size=768,
        # similarity metric (COSINE = angular similarity)
        distance=Distance.COSINE
    ),
)

# Wrap the Qdrant client in a LangChain VectorStore
qdrant_vector_store = QdrantVectorStore(
    client=client,
    collection_name="demo_collection",          # reference the created collection
    embedding=embeddings,                       # embedding function for text
)

# Add processed text documents to the Qdrant collection
qdrant_vector_store.add_documents(texts)

True

In [None]:
# Loading saved database

# client = QdrantClient(path="./tmp/langchain_qdrant")


# qdrant_vector_store = QdrantVectorStore(
#     client=client,
#     collection_name="demo_collection",
#     embedding=embeddings,
# )

In [None]:
# Create a retriever using Maximal Marginal Relevance (MMR) search
retriever = qdrant_vector_store.as_retriever(
    search_type="mmr",                         # search strategy = diversity + relevance
    search_kwargs={
        "k": 10,                               # number of results to return
        # diversity vs relevance tradeoff (closer to 1 = more diverse)
        "lambda_mult": 0.8
    }
)

In [None]:
# Query the retriever to fetch the most relevant document chunks
retriever.invoke(
    "How do I set up commission calculations for sales reps?")

## Faiss

In [None]:
# Create a faiss vector store
faiss_vector_store = FAISS.from_documents(texts, embeddings)

In [None]:
# Save the vectors
faiss_vector_store.save_local("faiss_index")

In [None]:
# Create a retriever using Maximal Marginal Relevance (MMR) search
retriever = faiss_vector_store.as_retriever(
    search_type="mmr",                         # search strategy = diversity + relevance
    search_kwargs={
        "k": 10,                               # number of results to return
        # diversity vs relevance tradeoff (closer to 1 = more diverse)
        "lambda_mult": 0.8
    }
)

In [None]:
# Query the retriever to fetch the most relevant document chunks
retriever.invoke(
    "How do I set up commission calculations for sales reps?")

## Chroma

In [None]:
# Create a chroma vector store
chroma_vector_store = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

In [None]:
# Create a retriever using Maximal Marginal Relevance (MMR) search
retriever = chroma_vector_store.as_retriever(
    search_type="mmr",                         # search strategy = diversity + relevance
    search_kwargs={
        "k": 10,                               # number of results to return
        # diversity vs relevance tradeoff (closer to 1 = more diverse)
        "lambda_mult": 0.8
    }
)

In [None]:
# Query the retriever to fetch the most relevant document chunks
retriever.invoke(
    "How do I set up commission calculations for sales reps?")

## Milvus DB

In [None]:
# Define the URI for storing Milvus data (local SQLite-backed Milvus instance)
URI = "./milvus_example.db"

# Create a Milvus vector store
milvus_vector_store = Milvus(
    # function to embed documents into vectors
    embedding_function=embeddings,
    connection_args={"uri": URI},              # connection settings for Milvus
    index_params={                             # index configuration
        "index_type": "FLAT",                  # FLAT = brute-force search
        "metric_type": "L2"                    # L2 = Euclidean distance
    },
)

In [None]:
# Add processed text documents to the Milvus vector database
milvus_vector_store.add_documents(texts)

In [None]:
# Create a retriever using Maximal Marginal Relevance (MMR) search
milvus_retriever = milvus_vector_store.as_retriever(
    search_type="mmr",                         # search strategy = diversity + relevance
    search_kwargs={
        "k": 10,                               # number of results to return
        # diversity vs relevance tradeoff (closer to 1 = more diverse)
        "lambda_mult": 0.8
    }
)

In [None]:
# Query the retriever to fetch the most relevant document chunks
milvus_retriever.invoke(
    "What are the standard and specialized NetSuite centers?"
)