# Text Splitting from document

## RecursiveCharacterTextSplitter

In [19]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

docs = PyPDFLoader('data/Rishikesh Patil Resume.pdf').load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 50)
text_document = text_splitter.split_documents(docs)
text_document

[Document(metadata={'source': 'data/Rishikesh Patil Resume.pdf', 'page': 0}, page_content='July 2024 - Aug 2024\nAug 2023 - Sept 2023\nAug 2021 - Dec 2021\nAspiring Data Scientist with expertise in Python, SQL, Java, Machine Learning, Deep Neural Networks, and\nNatural Language Processing (NLP). Strong problem-solving skills with over 200+ DSA problems solved.\nSeeking opportunities to leverage my skills in data science and AI to contribute to impactful projects.\nSUMMARY\nCompare Me E-commerce Website\nGrocery Store Android Application\nJava Spring Boot, MySQL, HTML, CSS'),
 Document(metadata={'source': 'data/Rishikesh Patil Resume.pdf', 'page': 0}, page_content='Java Spring Boot, MySQL, HTML, CSS\nJava, SqLite, Android Studio\nBuilt a Website that provides collected product information; allowing users to compare product prices and\nspecifications in one place. Project involves scraping data from popular websites like Flipkart, Amazon,\nCroma, and Jiomart. Users can quickly compare pr

In [20]:
intro = ""
with open('data/intro.txt') as docs:
    intro = docs.read()

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 100, chunk_overlap = 20)
text_document = text_splitter.create_documents([intro])
text_document

[Document(metadata={}, page_content='First of all, Thank you for giving me this opportunity.'),
 Document(metadata={}, page_content="I'm Rishikesh Krishna Patil, from New Mumbai."),
 Document(metadata={}, page_content='I recently completed my Post Graduation Diploma in Artificial Intelligence from CDAC pune. and I'),
 Document(metadata={}, page_content='CDAC pune. and I have completed my graduation from Dy Patil RAIT, Navi Mumbai in year 2023.'),
 Document(metadata={}, page_content='Through out this CDAC course, I learned new technologies such as Machine Learning, Deep Neural'),
 Document(metadata={}, page_content='Deep Neural Network, and Natural Language Processing.'),
 Document(metadata={}, page_content='Also I have good understanding of software development, with expertise in programming languages'),
 Document(metadata={}, page_content='languages such as Python, SQL and Java. Also I have build knowledge in Data Structures and'),
 Document(metadata={}, page_content='Data Structures 

## CharacterTextSplitter

In [28]:
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader

text_document = TextLoader('data/intro.txt').load()
text_splitter = CharacterTextSplitter(separator = "\n", chunk_size = 100, chunk_overlap = 20)
text_document = text_splitter.split_documents(text_document)
text_document

Created a chunk of size 171, which is longer than the specified 100
Created a chunk of size 136, which is longer than the specified 100
Created a chunk of size 285, which is longer than the specified 100
Created a chunk of size 166, which is longer than the specified 100
Created a chunk of size 181, which is longer than the specified 100
Created a chunk of size 142, which is longer than the specified 100


[Document(metadata={'source': 'data/intro.txt'}, page_content='First of all, Thank you for giving me this opportunity.'),
 Document(metadata={'source': 'data/intro.txt'}, page_content="I'm Rishikesh Krishna Patil, from New Mumbai."),
 Document(metadata={'source': 'data/intro.txt'}, page_content='I recently completed my Post Graduation Diploma in Artificial Intelligence from CDAC pune. and I have completed my graduation from Dy Patil RAIT, Navi Mumbai in year 2023.'),
 Document(metadata={'source': 'data/intro.txt'}, page_content='Through out this CDAC course, I learned new technologies such as Machine Learning, Deep Neural Network, and Natural Language Processing.'),
 Document(metadata={'source': 'data/intro.txt'}, page_content='Also I have good understanding of software development, with expertise in programming languages such as Python, SQL and Java. Also I have build knowledge in Data Structures and Algorithms and have solved DSA problems on websites like LeetCode, HackerRank, Geeksf

In [29]:
intro = ""
with open('data/intro.txt') as docs:
    intro = docs.read()

text_splitter = CharacterTextSplitter(chunk_size = 100, chunk_overlap = 20)
text_document = text_splitter.create_documents([intro])
text_document

[Document(metadata={}, page_content="First of all, Thank you for giving me this opportunity.\nI'm Rishikesh Krishna Patil, from New Mumbai.\nI recently completed my Post Graduation Diploma in Artificial Intelligence from CDAC pune. and I have completed my graduation from Dy Patil RAIT, Navi Mumbai in year 2023.\nThrough out this CDAC course, I learned new technologies such as Machine Learning, Deep Neural Network, and Natural Language Processing.\nAlso I have good understanding of software development, with expertise in programming languages such as Python, SQL and Java. Also I have build knowledge in Data Structures and Algorithms and have solved DSA problems on websites like LeetCode, HackerRank, GeeksforGeeks and AlgoExpert.\nBased on all this knowledge, I recently created a project named code mixed text translation using python and NLP libraries like PyTorch, NumPy and regular expression.\nAlso I have published a paper for one of my projects, which involved creating an application 

## HTMLTextSplitter

In [33]:
from langchain_text_splitters import HTMLHeaderTextSplitter

html_string = open('data/simpleWeb.html', 'r').read()
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3")
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on)
html_header_splits = html_splitter.split_text(html_string)
html_header_splits

[Document(metadata={'Header 1': 'Welcome to My Website'}, page_content='About Services Contact'),
 Document(metadata={'Header 1': 'Welcome to My Website', 'Header 2': 'About'}, page_content='This is a simple HTML page created as an example.'),
 Document(metadata={'Header 1': 'Welcome to My Website', 'Header 2': 'Services'}, page_content='We provide web development and design services.'),
 Document(metadata={'Header 1': 'Welcome to My Website', 'Header 2': 'Contact'}, page_content='Email: example@example.com'),
 Document(metadata={'Header 1': 'Welcome to My Website'}, page_content='© 2024 My Website')]

## JSONSplitter

In [40]:
import json

with open("data/jsonData.json", "r") as file:
    json_data = json.load(file)

In [43]:
from langchain_text_splitters import RecursiveJsonSplitter

json_splitter = RecursiveJsonSplitter(max_chunk_size = 300)
json_chunks = json_splitter.split_json(json_data)
json_chunks

[{'name': 'John Doe',
  'age': 30,
  'isStudent': False,
  'skills': ['HTML', 'CSS', 'JavaScript'],
  'address': {'street': '123 Main St', 'city': 'New York', 'zipCode': '10001'},
  'projects': [{'name': 'Portfolio Website', 'completed': True},
   {'name': 'Mobile App', 'completed': False}]}]

In [48]:
# output in document
docs = json_splitter.create_documents(texts = [json_data])

# output in text
docs = json_splitter.split_text(json_data)
docs

['{"name": "John Doe", "age": 30, "isStudent": false, "skills": ["HTML", "CSS", "JavaScript"], "address": {"street": "123 Main St", "city": "New York", "zipCode": "10001"}, "projects": [{"name": "Portfolio Website", "completed": true}, {"name": "Mobile App", "completed": false}]}']