In [None]:
#%pip install langchain llama-index langchain_text_splitters llama-index-core "unstructured[pdf]" pytesseract


In [None]:
#to handle warning later on
import warnings
warnings.filterwarnings("ignore", message="Core Pydantic V1 functionality")
#to allow pretty printing of chunks
from pprint import pprint

In [None]:
#sample passage used by the character-level splitting cells
#(adjacent literals are concatenated; note the double space before "Sparks" is part of the text)
text = (
    "This facility is crude, but it should be adequate to freeze Skywalker "
    "for his journey to the Emperor. "
    "Vader lunges at him and Luke immediately raises his lit sword to meet Vader's.  "
    "Sparks fly as they duel, Vader gradually forcing Luke backward toward the gantry."
)
print(text)

Level 1: Character Splitting

In [None]:
#character splitting by hand: cut the text into consecutive fixed-width slices
chunk_size = 25
#each slice starts at a multiple of chunk_size; the last slice may be shorter
chunks = [
    text[start:start + chunk_size]
    for start in range(0, len(text), chunk_size)
]
print(chunks)

In [None]:
#langchain character splitter - plain split that returns a list of strings
from langchain_text_splitters import CharacterTextSplitter

#split on single spaces, 25-character chunks, no overlap between neighbours
text_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=25,
    chunk_overlap=0,
    strip_whitespace=True,
)
chunks = text_splitter.split_text(text)
print(chunks)


In [None]:
#langchain character splitter - wrap the chunks in document objects
from langchain_text_splitters import CharacterTextSplitter

#same 25-character chunks as before, but neighbouring chunks now overlap by 10
text_splitter = CharacterTextSplitter(
    separator=' ',
    chunk_size=25,
    chunk_overlap=10,
    strip_whitespace=True,
)
#create_documents takes a list of texts, hence the single-item list
chunks = text_splitter.create_documents([text])
print(chunks)

In [None]:
#llama index splitting - its default splitter is the sentence splitter
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(
    chunk_size=25,
    chunk_overlap=10,
)
#split the sample passage and pretty print the resulting list of chunks
chunks = splitter.split_text(text)
pprint(chunks)

In [None]:
#llama index: load a file, split it and build nodes from the result
from llama_index.core import SimpleDirectoryReader

splitter = SentenceSplitter(
    chunk_size=100,
    chunk_overlap=10,
)

#load document
script_files = ["./data_input/script_1.txt"]
document = SimpleDirectoryReader(input_files=script_files).load_data()

#create nodes
nodes = splitter.get_nodes_from_documents(document)

#inspect the second node as a plain dict
#(other options: look at nodes[0] directly, or pprint(nodes[0].__dict__))
second_node = nodes[1]
pprint(second_node.model_dump())

Level 2: Recursive Character Text Splitting

In [None]:
#read the longer sample text that the recursive splitting cells work on
long_text_path = "./data_input/long_text.txt"
with open(long_text_path, mode="r", encoding="utf-8") as text_file:
    long_text = text_file.read()
pprint(long_text)

In [None]:
#langchain recursive character splitting of the long text
from langchain_text_splitters import RecursiveCharacterTextSplitter

#150-character chunks, no overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
)
#last expression of the cell, so the documents are shown as the cell output
text_splitter.create_documents([long_text])

In [None]:
#splitting using llama index - default splitter is sentence splitter - kinda equivalent to langchain recursive character splitter
#imported here so the cell does not rely on an import made in an earlier cell
from llama_index.core.node_parser import SentenceSplitter

#create text splitter
#fix: this cell used to build a SentenceWindowNodeParser with splitter-style arguments and then
#call the unrelated `splitter` left over from an earlier cell (chunk_size=100, chunk_overlap=10),
#so the 150/0 settings below were never applied to long_text
#NOTE(review): SentenceSplitter sizes are documented as token counts, not characters - confirm 150 is the intended size
text_splitter = SentenceSplitter(chunk_size=150, chunk_overlap=0)

#split with the splitter configured in this cell and pretty print the chunks
chunks = text_splitter.split_text(long_text)
pprint(chunks)

Level 3: Document Specific Splitting
The Markdown, Python, and JS splitters work much like the recursive character splitter, but each uses separators specific to its format.

In [None]:
#markdown
from langchain_text_splitters import MarkdownTextSplitter

#load markdown text
markdown_path = "./data_input/markdown_text.txt"
with open(markdown_path, mode="r", encoding="utf-8") as markdown_file:
    markdown_text = markdown_file.read()
print(markdown_text)

#create markdown text splitter and split text into documents
text_splitter = MarkdownTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
)
#last expression of the cell, so the documents are shown as the cell output
text_splitter.create_documents([markdown_text])

In [None]:
#python
from langchain_text_splitters import PythonCodeTextSplitter

#load python text
python_path = "./data_input/python_text.txt"
with open(python_path, mode="r", encoding="utf-8") as python_file:
    python_text = python_file.read()
print(python_text)

#create python text splitter and split text into documents
text_splitter = PythonCodeTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
)
#last expression of the cell, so the documents are shown as the cell output
text_splitter.create_documents([python_text])

In [None]:
#javascript
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language

#load js text
js_path = "./data_input/js_text.txt"
with open(js_path, mode="r", encoding="utf-8") as js_file:
    js_text = js_file.read()
print(js_text)

#create javascript text splitter using recursive character text splitter and language specification
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.JS,
    chunk_size=150,
    chunk_overlap=0,
)
#last expression of the cell, so the documents are shown as the cell output
text_splitter.create_documents([js_text])

In [None]:
#pdfs with tables
#unstructured io
import os
import shutil
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json
#tesseract for ocr
import pytesseract

#locate the tesseract binary instead of relying only on one hard-coded machine-specific path:
#an explicit TESSERACT_CMD environment override wins, then whatever is on PATH,
#then the default windows install location that was hard-coded here before
DEFAULT_TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
tesseract_cmd = os.environ.get("TESSERACT_CMD") or shutil.which("tesseract") or DEFAULT_TESSERACT_CMD
pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

#also put the install folder on PATH for this kernel, so code that launches "tesseract" by name can find it
#NOTE(review): unstructured may call tesseract through its own wrapper rather than the pytesseract
#module configured above, which would explain a "not in your PATH" error despite the line above - confirm
tesseract_dir = os.path.dirname(tesseract_cmd)
if tesseract_dir and tesseract_dir not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = tesseract_dir + os.pathsep + os.environ.get("PATH", "")


In [None]:
#hi_res strategy: this method attempts to download a model from huggingface - won't work on vpn
pdf_file = "./data_input/pdf_form.pdf"
#extract pdf elements
pdf_elements = partition_pdf(
    filename=pdf_file,
    strategy='hi_res',
    infer_table_structure=True,
    model_name='yolox',
    languages=['eng'],
)

TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.

In [None]:
#fast strategy: this method is simpler and runs locally
pdf_file = "./data_input/pdf_form.pdf"
#extract pdf elements
pdf_elements = partition_pdf(
    filename=pdf_file,
    strategy='fast',
    infer_table_structure=True,
    languages=['eng'],
)
#last expression of the cell, so the extracted elements are shown as the cell output
pdf_elements

In [None]:
#show the text_as_html metadata field of the first extracted pdf element
#NOTE(review): presumably only filled in for table elements when table structure was inferred - confirm
pdf_elements[0].metadata.text_as_html

In [None]:
#sanity check: ask pytesseract for the version of the tesseract binary it is configured to use
import os
import shutil
import pytesseract

#resolve the binary instead of relying only on one hard-coded machine-specific path:
#an explicit TESSERACT_CMD environment override wins, then whatever is on PATH,
#then the default windows install location that was hard-coded here before
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get("TESSERACT_CMD")
    or shutil.which("tesseract")
    or r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)
print(pytesseract.get_tesseract_version())

In [19]:
#list the packages installed through chocolatey and print what the command wrote to stdout
import subprocess

choco_command = ["choco", "list"]
#stdout and stderr are both captured as text; only stdout is printed
result = subprocess.run(
    choco_command,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
print(result.stdout)

Chocolatey v2.3.0

 - A pending system reboot request has been detected, however, this is
   being ignored due to the current command 'list' being used.
   It is recommended that you reboot at your earliest convenience.

chocolatey 2.3.0
chocolatey-compatibility.extension 1.0.0
chocolatey-core.extension 1.4.0
chocolatey-windowsupdate.extension 1.0.5
KB2919355 1.0.20160915
KB2919442 1.0.20160915
KB2999226 1.0.20181019
KB3033929 1.0.5
KB3035131 1.0.3
python314 3.14.3
speedtest 1.2.0
tesseract 5.5.0.20241111
vcredist140 14.50.35719
vcredist2015 14.0.24215.20170201
14 packages installed.

Did you know Pro / Business automatically syncs with Programs and
 Features? Learn more about Package Synchronizer at
 https://chocolatey.org/compare

