# Chunking

Types:

1. Paragraph based (No overlap)

2. Recursive splitting

3. Semantic chunking

## Data loading

In [1]:
# PDF to load and chunk.
file_path = "2019_001_annual_en.pdf"

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(file_path)
pages = []
# Collect every item yielded by the loader's async lazy iterator.
# NOTE(review): a top-level `async for` only runs where top-level await is
# supported (e.g. a notebook cell) - confirm before moving this into a script.
async for page in loader.alazy_load():
    pages.append(page)

In [3]:
# Merge the text of every loaded page into one string. Pages are joined with
# no separator (same result as appending them one by one), but the string is
# built in a single pass instead of being re-copied on every `+=`.
page_str = "".join(page.page_content for page in pages)

In [4]:
# Bare expression: in a notebook cell this displays the concatenated document text.
page_str



## Paragraph based (No overlap)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [18]:
def paragraph_based_chunking(text, chunk_size=300):
    """
    Split a document into non-overlapping chunks along newline boundaries.

    Parameters:
    text (str): The document text to be chunked.
    chunk_size (int): Target upper bound on the size of each chunk
        (default 300 characters).

    Returns:
    list: The chunks produced by the splitter, in document order.
    """
    # Try blank-line paragraph breaks first, then fall back to single newlines.
    newline_separators = ["\n\n", "\n"]
    paragraph_splitter = RecursiveCharacterTextSplitter(
        separators=newline_separators,
        chunk_size=chunk_size,
        chunk_overlap=0,  # "no overlap": consecutive chunks share no text
    )
    return paragraph_splitter.split_text(text)


In [19]:
# Chunk the whole document text with the default chunk_size and print every chunk.
paragraph_chunks = paragraph_based_chunking(page_str)
print("Paragraph-Based Chunks:", paragraph_chunks)



In [22]:
# Report the container type and the number of paragraph chunks, one per line.
print(type(paragraph_chunks), len(paragraph_chunks), sep="\n")

<class 'list'>
940


## Recursive text splitting

In [41]:
def recursive_splitting(text, chunk_size=500, chunk_overlap=80):
    """
    Split a document into overlapping chunks at paragraph and sentence boundaries.

    Parameters:
    text (str): The document text to be chunked.
    chunk_size (int): Target maximum size of each chunk (default 500 characters).
    chunk_overlap (int): Number of characters to overlap between chunks (default 80 characters).

    Returns:
    list: List of recursive chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Prefer paragraph breaks, then fall back to sentence-ending punctuation.
        separators=["\n\n", ".", "!", "?"],
    )
    return splitter.split_text(text)

In [42]:
# Chunk the whole document text with the default size/overlap and print every chunk.
recursive_chunks = recursive_splitting(page_str)
print("Recursive Splitting Chunks:", recursive_chunks)



In [43]:
# Report the container type and the number of recursive chunks, one per line.
print(type(recursive_chunks), len(recursive_chunks), sep="\n")

<class 'list'>
563


## Semantic Splitting

In [25]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

In [26]:
def semantic_chunking(sentences):
    """
    Split a document into chunks with LangChain's SemanticChunker.

    The splitter is built on OpenAI 'text-embedding-3-large' embeddings and
    is otherwise used with its default settings.

    Parameters:
    sentences (str): The document text to be chunked. Despite the name, the
        value is passed straight to split_text(); the notebook calls this
        function with the whole document string.

    Returns:
    list: List of text chunks produced by the splitter.
    """
    embeddings = OpenAIEmbeddings(model='text-embedding-3-large')
    splitter = SemanticChunker(embeddings)
    return splitter.split_text(sentences)

In [27]:
# Chunk the whole document text semantically and print every chunk.
semantic_chunks = semantic_chunking(page_str)
print("Semantic Chunks:", semantic_chunks)



In [29]:
# Report the container type and the number of semantic chunks, one per line.
print(type(semantic_chunks), len(semantic_chunks), sep="\n")

<class 'list'>
63


In [30]:
%pip install faiss-cpu

Note: you may need to restart the kernel to use updated packages.


DEPRECATION: colorama-lpa 0.4.4b1.0 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of colorama-lpa or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### Storing chunks in VDB and retrieving similar chunks

#### Paragraph chunking

In [31]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Embedding model used both to index the paragraph chunks and to embed queries.
openai_embeddings = OpenAIEmbeddings(model='text-embedding-3-large')

# Embed a throwaway string once to learn the vector length the index needs.
embedding_dim = len(openai_embeddings.embed_query("text"))
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty store: empty docstore and no index-to-id mapping yet.
para_vector_store = FAISS(
    index=index,
    embedding_function=openai_embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

In [32]:
from uuid import uuid4

# One random UUID string per paragraph chunk, used as the document ids.
uuids = [str(uuid4()) for _ in paragraph_chunks]

In [33]:
# Add every paragraph chunk to the store under its pre-generated UUID; the
# returned list of ids is what the cell displays.
para_vector_store.add_texts(texts=paragraph_chunks, ids=uuids)

['ea9e9a99-01aa-47ec-858b-6ce8a38f9b3a',
 'e52ca1d9-a419-492e-897d-6e1126028bf9',
 '533e06b5-e752-48f9-a1f3-55de703f2478',
 'ba05b361-9fb1-453b-8be0-267c451d96ac',
 '0fded12f-ec1f-4afc-85f7-137cb69fef73',
 '40f3fe12-ff3d-465a-92ef-c927f4955900',
 'c648ab1b-213f-412d-93d0-975a10baf87d',
 'b74b43e1-81f4-4cd5-83f5-bea800c0e78b',
 'ec804a3a-17b0-43a4-903d-4a6340f51b18',
 '4d3e3a11-dd9d-4499-ac87-abd61d0476af',
 'f9d8297f-8644-414b-b23d-bff152183e77',
 '8e221176-d634-4435-b386-62568178829d',
 '2aeeee7c-81cf-47b6-b780-cc712494b0b4',
 '3b92783f-6a6d-4937-aa6c-3c69f1cfc962',
 '5ac8bb61-f408-420c-9850-14ac412110b1',
 '25a1699b-fa4f-44b1-8a6d-f3988422d2fc',
 '0bdc6674-6c29-4aca-b8b1-5929d40e95d2',
 '4cc56bc7-2826-40d9-bacd-62c1a4ea5c89',
 'aae0d724-e09a-4c9a-a8a4-ea194bba6f50',
 'd63a55be-1049-4c2c-a8cf-376beed87da2',
 '67e6e677-4c4c-4af9-9720-d9c49c50ed87',
 '3a88a9e6-c5aa-489a-986c-078dcc521c12',
 '387ab6c4-19a0-4e63-9a22-27509e0505d4',
 '55825e7b-5a5a-42fa-94cf-e5161e968c25',
 'ce221503-b068-

In [35]:
# Fetch the top-3 matches for the query from the paragraph-chunk store and
# print each chunk with its metadata, followed by a divider line.
query = "Toyota Environmental Challenge 2050"
divider = '----' * 25
results = para_vector_store.similarity_search(query, k=3)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")
    print(divider)

* Customer First and Quality First Working to Better the World around Us
Toyota Environmental Challenge 2050 and the 2030 Milestone
Toyota Environmental Challenge 2050
Considering environmental issues to be of paramount 
importance, we have formulated the Toyota Earth [{}]
----------------------------------------------------------------------------------------------------
* (CO 2 emissions) certiﬁed by the respective national authorities2020 target: 22%14.9%14.9%
30
2014 ’16 ’15 ’17 ’18 (Year)© Ridho Hafizh Zainur Ridha/WWF-IndonesiaToyota Environmental Challenge 2050 and the 2030 Milestone
Key Fiscal 2019 Initiatives under the Toyota Environmental Challenge 2050 [{}]
----------------------------------------------------------------------------------------------------
* CHALLENGENew Vehicle Zero CO 2 Emissions Challenge
Reduce global average CO 2 emissions during operation from 
new vehicles by 90% from Toyota’s 2010 global levelAccelerate widespread use of next-generation vehicles to s

#### Recursive splitting

In [44]:
# Embedding model used both to index the recursive chunks and to embed queries.
openai_embeddings = OpenAIEmbeddings(model='text-embedding-3-large')

# Embed a throwaway string once to learn the vector length the index needs.
embedding_dim = len(openai_embeddings.embed_query("recur"))
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty store: empty docstore and no index-to-id mapping yet.
recursive_vector_store = FAISS(
    index=index,
    embedding_function=openai_embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

In [45]:
from uuid import uuid4

# One random UUID string per recursive chunk, used as the document ids.
uuids = [str(uuid4()) for _ in recursive_chunks]

In [46]:
# Add every recursive chunk to the store under its pre-generated UUID; the
# returned list of ids is what the cell displays.
recursive_vector_store.add_texts(texts=recursive_chunks, ids=uuids)

['21ef4cba-6496-4d83-9018-e6512a4eebca',
 'a90b0992-1d06-4802-a019-136291a5334d',
 '593e75b8-6b5e-48c8-ab9d-c2e3ef01915c',
 '70747c1a-a590-486f-8217-a42e68f87384',
 'b1efcaf1-a4f3-44d4-a805-c1c1bdca093f',
 'ad91ba6e-921d-4eb9-be3e-17ba4601f362',
 '338c7acd-8f03-42d0-82b1-20476d4ad377',
 '3da792ca-c6b0-4cdb-8aca-77d12bf430e9',
 'c656221f-6c52-49de-9bfd-aa2e4501e898',
 '898a1e6f-5857-4e11-8f52-cf164c2c5e50',
 'b18eaf45-ce6a-424f-bb54-06fc155339c6',
 '152ecded-1eee-4844-bf9b-d6b1054847ff',
 'b95ec8de-4d66-40e0-8693-5d6c0f218654',
 '948cb868-bda7-4273-87d4-5c4835395533',
 '1206a85e-10d5-4f29-bafc-21e5c02444f1',
 'bb0a0d21-d4e6-4dc8-a20c-da151cf5234f',
 '3eea5170-4467-4632-936d-90b92d395005',
 'c20d7350-2e61-436f-be1e-530aeedd68e1',
 '7f279eb0-5222-4ce3-a3f0-5af281c52e7e',
 '16e92c37-d877-4619-8e8b-ccab04b30c04',
 'ee4c00ba-b974-4ea1-ae9b-1934e17197e3',
 'a73242aa-239c-4f2c-89dc-b02677332874',
 'ea786331-9aa2-4160-8628-70791ce1816d',
 '036ae71a-a6d8-4ee1-a712-767221fc5b31',
 'ccefff01-7a71-

In [47]:
# Fetch the top-3 matches for the query from the recursive-chunk store and
# print each chunk with its metadata, followed by a divider line.
query = "Toyota Environmental Challenge 2050"
divider = '----' * 25
results = recursive_vector_store.similarity_search(query, k=3)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")
    print(divider)

* . 
Toyota Environmental Challenge 2050 2030 Milestone
CHALLENGENew Vehicle Zero CO 2 Emissions Challenge
Reduce global average CO 2 emissions during operation from 
new vehicles by 90% from Toyota’s 2010 global levelAccelerate widespread use of next-generation vehicles to save energy and respond to diverse fuels
•  Accelerate global expansion of electrified vehicles
•  Jointly develop electrified vehicles and establish networks to encourage their widespread 
adoption•  Make annual global sales of more than 5 [{}]
----------------------------------------------------------------------------------------------------
* . As part of the 
Toyota Environmental Challenge 2050, launched in 
2015, we set for ourselves the New Vehicle Zero CO 2 
Emissions Challenge, under which we aim to reduce 
by 90% Toyota’s global average new vehicle CO 2 
emissions during operation by 2050, compared with 
the 2010 level [{}]
-----------------------------------------------------------------------------------

#### Semantic Chunking

In [48]:
# Embedding model used both to index the semantic chunks and to embed queries.
openai_embeddings = OpenAIEmbeddings(model='text-embedding-3-large')

# Embed a throwaway string once to learn the vector length the index needs.
embedding_dim = len(openai_embeddings.embed_query("semantic"))
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty store: empty docstore and no index-to-id mapping yet.
semantic_vector_store = FAISS(
    index=index,
    embedding_function=openai_embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

In [49]:
from uuid import uuid4

# One random UUID string per semantic chunk, used as the document ids.
uuids = [str(uuid4()) for _ in semantic_chunks]

In [51]:
# Add every semantic chunk to the store under its pre-generated UUID. The
# keyword must be `ids` (as in the paragraph and recursive cells); `id` is not
# the id parameter, so the generated UUIDs would not be applied.
semantic_vector_store.add_texts(texts=semantic_chunks, ids=uuids)

['a84a4ee9-adf8-4a71-a775-eb2983fd93a7',
 'd8b8368c-0c3d-400f-ae7c-f1f3d6b992e0',
 '0cb5e1a6-94b4-46df-93fe-14a6954ed47e',
 '615ceeda-8710-4383-9f99-190fda0d720a',
 'ca08d9d6-c9bf-4e3b-95ce-3307f28f8b08',
 '71071976-306c-4a55-b8db-5c6d7764f34e',
 'd2a9f818-a3bb-496c-876e-7e9740e9239b',
 'b771e010-04e3-4acb-840b-3d7d60584feb',
 'fd980b37-24d4-4f32-931e-ce5a16b8820f',
 'b68d23c6-be13-4601-914c-2caf07fd8fc1',
 'b788eb52-e34a-4f2a-b94e-45d30e69b0e2',
 '229b45fa-fa14-4af8-a90c-15beb3e6fb18',
 '5a03122d-fdb5-4dbe-94c3-358df887e269',
 '12b38f77-53b3-4c75-828c-6c17a89c1e71',
 'ece7a95e-28f6-40cb-be6f-707291437d03',
 '508cf0fc-02c5-4eb4-8f16-a747c7b323ae',
 '3fe2c657-9f5f-45a4-83fb-91910c124cc5',
 'f38404cc-89ef-4d82-a47b-57aaba4992fc',
 '2fafc9ab-2a13-488e-9368-bd7d502ecee9',
 '5e6e590e-43fb-4296-b998-cf48e48eb3c5',
 '1c01a588-c5e9-41b7-92ea-1da6a7ccd62b',
 '74beb659-d79f-40ca-b9ae-ce0f63509117',
 '32d23349-0c9b-4702-b10c-65c7bd3f66f3',
 '97293930-b42b-4f92-b2c0-3f249cbf0adb',
 '40780a43-d7bf-

In [52]:
# Fetch the top-3 matches for the query from the semantic-chunk store and
# print each chunk with its metadata, followed by a divider line.
query = "Toyota Environmental Challenge 2050"
divider = '----' * 25
results = semantic_vector_store.similarity_search(query, k=3)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")
    print(divider)

* 1972 Joined TMC
Jun. 2007 President of Toyota Motor Corporation Australia Ltd. May 2014 Chairman of Toyota Motor Corporation Australia Ltd. Dec. 2017 Retired as Chairman of Toyota Motor Corporation Australia Ltd. Jun. 2018 Audit & Supervisory Board Member of TMC (to present)
Apr. 1985 Joined TMC 
Jan. 2015 General Manager of Affiliated Companies Finance Dept. of TMC 
Jan. 2018 General Manager of Audit & Supervisory Board Office of TMC 
(to present)
Jun. 2019 Audit & Supervisory Board Member of TMC (to present)Corporate Governance Initiatives for Sustainable Growth  Corporate Philosophy Corporate Governance  Messages from the Outside Directors Toyota Environmental Challenge Respect for Human Rights and SCM Employees Risk Management Compliance  
Customer First and Quality First Working to Better the World around Us
Message from the President Transforming into a Mobility Company Initiatives for Sustainable Growth Corporate Data Table of Contents
31
TOYOTA Annual Report 2019 Initiatives 