In [8]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting spacy
  Downloading spacy-3.8.7-cp310-cp310-macosx_10_9_x86_64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.13-cp310-cp310-macosx_10_9_x86_64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp310-cp310-macosx_10_9_x86_64.whl.metadata (8.5 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.10-cp310-cp310-macosx_10_9_x86_64.whl.metadata (2.4 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.6-cp310-cp310-macosx_10_9_x86_64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
 

# Cleaning the data and saving the chunks as JSON

In [1]:
import fitz  # PyMuPDF

def extract_clean_text(pdf_path):
    """Return the text of every page of a PDF as one string.

    Despite the name, no cleaning happens here: the result is the raw output
    of ``page.get_text()`` for each page, concatenated in page order. Pass it
    to ``clean_text`` to normalise whitespace.

    Args:
        pdf_path: Location of the PDF, handed unchanged to ``fitz.open``.

    Returns:
        str: The concatenated page texts (empty string for a PDF with no pages).
    """
    # The context manager closes the document, and its underlying file handle,
    # even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the growing string once per page.
        return "".join(page.get_text() for page in doc)


In [2]:
import re

def clean_text(text):
    """Strip page-number lines and normalise whitespace.

    Args:
        text: Raw text, e.g. as extracted from a PDF.

    Returns:
        str: ``text`` with every ``"\\nPage <digits>\\n"`` marker replaced by a
        space, every run of whitespace collapsed to a single space, and
        leading/trailing whitespace removed.
    """
    # Step 1: drop repetitive footers such as a "Page 12" line of its own.
    without_page_markers = re.sub(r'\nPage \d+\n', ' ', text)

    # Step 2: squeeze spaces, tabs and newlines into single spaces.
    single_spaced = re.sub(r'\s+', ' ', without_page_markers)

    return single_spaced.strip()


In [3]:
# Read the raw text of the source PDF, then normalise it with clean_text
# (page-number lines removed, whitespace runs collapsed to single spaces).
text1 = extract_clean_text('initial_data/AI Training Document.pdf')
text2 = clean_text(text1)

In [7]:
# Sanity check: preview the first 500 characters of the cleaned text.
print(text2[:500])

User Agreement 1. Introduction This User Agreement, the Mobile Application Terms of Use, and all policies and additional terms posted on and in our sites, applications, tools, and services (collectively "Services") set out the terms on which eBay offers you access to and use of our Services. You can find an overview of our policies here. The Mobile Application Terms of Use, all policies, and additional terms posted on and in our Services are incorporated into this User Agreement. You agree to co


## Sentence-aware chunking

In [None]:
# import nltk
# from nltk.tokenize import sent_tokenize
# nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ishantkamboj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:

# def chunk_text(text, min_words=100, max_words=300):
#     sentences = sent_tokenize(text)
#     chunks = []
#     current_chunk = ""
#     current_words = 0

#     for sentence in sentences:
#         word_count = len(sentence.split())
#         if current_words + word_count <= max_words:
#             current_chunk += " " + sentence
#             current_words += word_count
#         else:
#             if current_words >= min_words:
#                 chunks.append(current_chunk.strip())
#                 current_chunk = sentence
#                 current_words = word_count
#             else:
#                 current_chunk += " " + sentence
#                 current_words += word_count

#     if current_chunk:
#         chunks.append(current_chunk.strip())

#     return chunks


In [9]:
import spacy

# Load spaCy's small English pipeline ("en_core_web_sm").
# The resulting module-level `nlp` object is what chunk_text_spacy calls to
# split text into sentences.
nlp = spacy.load("en_core_web_sm")

def chunk_text_spacy(text, min_words=100, max_words=300, nlp_model=None):
    """Split text into sentence-aligned chunks of roughly min..max words.

    Sentences are never cut: whole sentences are appended to the current chunk
    until the next one would push it past ``max_words``. At that point the
    chunk is closed only if it already holds at least ``min_words`` words;
    otherwise the sentence is appended anyway, so a chunk may exceed
    ``max_words`` rather than fall below ``min_words``. The final chunk is
    emitted whatever its size and may therefore be shorter than ``min_words``.

    Words are counted by splitting each sentence on whitespace.

    Args:
        text: The text to chunk.
        min_words: Minimum size a chunk must reach before it may be closed.
        max_words: Size beyond which a sufficiently large chunk is closed.
        nlp_model: Optional callable that turns ``text`` into an object with a
            ``sents`` iterable whose items expose ``.text`` (e.g. a spaCy
            pipeline). Defaults to the module-level ``nlp``.

    Returns:
        list[str]: The chunks, each made of sentences joined by single
        spaces. Empty or whitespace-only input yields an empty list.
    """
    pipeline = nlp if nlp_model is None else nlp_model
    doc = pipeline(text)

    chunks = []
    current_sents = []  # sentences of the chunk being built
    current_words = 0

    for sent in doc.sents:
        sent_text = sent.text.strip()
        if not sent_text:
            # Whitespace-only sentences would add stray spaces or empty chunks.
            continue
        word_count = len(sent_text.split())

        overflows = current_words + word_count > max_words
        # Close the chunk only when it has content and is already big enough;
        # the `current_sents` check prevents emitting "" when min_words == 0.
        if overflows and current_sents and current_words >= min_words:
            chunks.append(" ".join(current_sents))
            current_sents = []
            current_words = 0

        current_sents.append(sent_text)
        current_words += word_count

    if current_sents:
        chunks.append(" ".join(current_sents))

    return chunks


In [10]:
# Build the chunks that will be loaded into the vector DB and persist them.
import json

# Extract -> clean -> chunk, starting again from the source PDF.
pdf_text = extract_clean_text("initial_data/AI Training Document.pdf")
chunks = chunk_text_spacy(clean_text(pdf_text))

# Write the list of chunk strings as pretty-printed JSON (ready to embed later).
serialized_chunks = json.dumps(chunks, indent=2)
with open("chunks.json", "w") as out_file:
    out_file.write(serialized_chunks)
