In [8]:
file_path = "2019_001_annual_en.pdf"

In [9]:
from langchain.document_loaders import PyMuPDFLoader

# Function to extract and time PDF loading using PyMuPDFLoader
def extract_with_pymupdf(pdf_path):
    loader = PyMuPDFLoader(pdf_path)
    
    documents = loader.load()  # Load PDF
    
    documents_str=""
    for doc in documents:
        documents_str += doc.page_content
        
    return documents, documents_str

In [10]:
documents, docs_str = extract_with_pymupdf(file_path)

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=80,
        separators=["\n\n", ".", "!", "?"]  # Split by sentences
    )
chunks = splitter.split_text(docs_str)

In [12]:
chunks

['Annual \nReport \nAnnual Report 2019\nFiscal year ended March 31, 2019\n201\n9\nTable of Contents\n1 Table of Contents\n2 Message from the President\n5 Transforming into a Mobility Company\n5 Recent Initiatives\n6 Organization\n7 Making Ever-better Cars: Continuing to Hone Competitiveness in \nthe Real World of Car Making\n9 Reinforcing Competitiveness and Being the “Best in Town” around \nthe World\n11 Taking Ever-better Cars Further through Motor Sports: GAZOO Racing\n12 Speeding the Popularization of Electrified Vehicles for Our Home Planet\n15 Toward a World with No Traffic Casualties—Active Safety and \nAutomated Driving Research and Development\n18 Initiatives in Connected Cars and MaaS in Toyota’s Transformation \ninto a Mobility Company\n21 Enabling Active Participation in Society for All with Partner Robots\n22 Developing People in a Once-in-a-Century Transformational Period\n23 Message from the CFO\n24 Capital Policy\n25 Initiatives for Sustainable Growth\n26 Corporate Phil

In [13]:
%pip install faiss-cpu

Note: you may need to restart the kernel to use updated packages.


DEPRECATION: colorama-lpa 0.4.4b1.0 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of colorama-lpa or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OpenAIEmbeddings

openai_embeddings = OpenAIEmbeddings(model='text-embedding-3-large')
indexOAI = faiss.IndexFlatL2(len(openai_embeddings.embed_query("semantic")))
vector_store_OAI = FAISS(
    embedding_function=openai_embeddings,
    index=indexOAI,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

  openai_embeddings = OpenAIEmbeddings(model='text-embedding-3-large')


In [15]:
from uuid import uuid4
uuids = [str(uuid4()) for _ in range(len(chunks))]


vector_store_OAI.add_texts(texts=chunks, ids=uuids)

['4234677f-95ca-49ea-9d60-32e6d9fc1a49',
 'ef9fac41-3a9c-4daa-8fa5-4ce11d438726',
 '65e7796e-82bf-41d0-bc2d-14c0ecd21670',
 '936cbb12-77bf-45e6-bc60-e9ec1128aefa',
 'ea3a1665-4a0e-4ff5-bc93-664ed9ddb1b5',
 'df05e21f-719a-43f1-b2c6-b3dc7eaeaa80',
 'fb371d3c-b9b0-4481-8368-b54afc8f81c2',
 '7e361a10-f570-4c3f-a9ba-0caa4aaf5be0',
 '99f5c745-8cfc-450a-8392-62440388b2d5',
 '1e09d5b4-a02a-449c-902f-369caaeea38e',
 '3890f47e-bfcb-43f8-8697-0d82efdc3f93',
 '90d2e71d-67de-4b60-bbc7-d14f9bebb8db',
 '133d2f80-59c2-4a45-8e3b-4714a8b775e3',
 '0a0bb2b8-eedf-478c-8b23-b90f36e6b961',
 '59ba36b1-811a-4557-864d-303f1396ce4f',
 '678b3e33-8f16-4c0d-86fb-e9436d01c7b8',
 'cc52867c-4e7e-401c-a988-ae2982d2c82f',
 '943f56bb-be12-40b9-9a6f-8c40c0b5cf89',
 'fe705e19-4662-4633-b2f6-c0134d1dc082',
 '305d9a9c-5cf7-4fb7-a27a-6aaab089ebda',
 '01706f69-3649-4d9c-93b2-c49dc5330cfa',
 '6fffe363-540a-48b4-9ae9-182b01ce86fd',
 '915af2ad-9fe2-4ea4-9a78-987874bdf6a3',
 'd4de9fce-55be-46d4-9dfd-447050b4a736',
 '6835a5c7-4192-

In [16]:
results = vector_store_OAI.similarity_search(
    "Toyota Environmental Challenge 2050",
    k=3
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]", end="----"*20)

* . 
Toyota Environmental Challenge 2050
2030 Milestone
CHALLENGE
New Vehicle Zero CO2 Emissions Challenge
Reduce global average CO2 emissions during operation from 
new vehicles by 90% from Toyota’s 2010 global level
Accelerate widespread use of next-generation vehicles to save energy and respond to diverse fuels
• 
Accelerate global expansion of electrified vehicles
• 
Jointly develop electrified vehicles and establish networks to encourage their widespread 
adoption
• 
Make annual global sales of more than 5 [{}]--------------------------------------------------------------------------------* . As part of the 
Toyota Environmental Challenge 2050, launched in 
2015, we set for ourselves the New Vehicle Zero CO2 
Emissions Challenge, under which we aim to reduce 
by 90% Toyota’s global average new vehicle CO2 
emissions during operation by 2050, compared with 
the 2010 level [{}]--------------------------------------------------------------------------------* . 
  Toyota regards ad

#### Vector store with HF embeddings

In [17]:
%pip install --upgrade --quiet  langchain sentence_transformers

Note: you may need to restart the kernel to use updated packages.


DEPRECATION: colorama-lpa 0.4.4b1.0 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of colorama-lpa or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", 
                                   model_kwargs={'device': 'cpu'})



In [19]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

index = faiss.IndexFlatL2(len(embeddings.embed_query("hf")))
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [20]:
from uuid import uuid4
uuids = [str(uuid4()) for _ in range(len(chunks))]


vector_store.add_texts(texts=chunks, ids=uuids)

['1ccbe776-6098-49ac-8e76-42844b1aaccb',
 '0eb7d0d4-e8ef-421a-887e-97b304419405',
 '9177b2a1-d3b9-4b00-b0ff-b9dc71e18abf',
 'd49bb4b7-c083-4686-83e5-b7e9225d1b68',
 '5a1af48d-56ae-49e2-80d2-4e84b22c79d8',
 'd966157e-1c2c-455d-a268-8dfd35da81e0',
 '9f5067bc-e1d9-47e8-83cd-398c9062437d',
 '4a774930-dd48-42aa-ad80-3004a1b30d91',
 'e48e4f10-d278-4d7b-bdf5-6e7d560f5ead',
 '9f653e20-2b4e-4044-8527-196fc9651144',
 '3c363585-82ab-4011-b121-e84fe0b503ef',
 'f40957b3-62e2-4746-b34f-2d3198e554ba',
 '59f5b728-dded-4abc-866e-d9d3d8eafdac',
 '9ea88421-0a13-4887-bcd9-1251e143a7eb',
 '1c826325-724f-4699-b334-41151d55d136',
 'e01a6488-a632-43d6-a94c-d743c6a10082',
 '9b067286-f492-499a-95f3-b443f25dd372',
 '80885224-f1f6-440a-bdf9-6062b6d9bcce',
 '742477dc-825e-42f2-90e1-9d31b073b095',
 'fa999443-8985-4332-8695-2a1c0bc2daca',
 'd7eadb6b-ba3a-43bd-8a87-1afcfe6791cf',
 '0dd2e36c-475f-403e-a8d3-0b322a9331ee',
 'c964d420-30c5-4ecd-ae70-c5f46495277f',
 '689aaa02-2045-48fe-bb16-8ca5cc06149a',
 'a7fb9a25-7612-

In [21]:
results = vector_store.similarity_search(
    "Toyota Environmental Challenge 2050",
    k=3
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]", end="----"*20)

* . 
Toyota Environmental Challenge 2050
2030 Milestone
CHALLENGE
New Vehicle Zero CO2 Emissions Challenge
Reduce global average CO2 emissions during operation from 
new vehicles by 90% from Toyota’s 2010 global level
Accelerate widespread use of next-generation vehicles to save energy and respond to diverse fuels
• 
Accelerate global expansion of electrified vehicles
• 
Jointly develop electrified vehicles and establish networks to encourage their widespread 
adoption
• 
Make annual global sales of more than 5 [{}]--------------------------------------------------------------------------------* . 
  Toyota regards addressing environmental issues as 
an especially key aspect of sustainability. To help 
achieve the Paris Agreement goal of keeping global 
warming below 2°C,* we are promoting initiatives 
under the Toyota Environmental Challenge 2050 [{}]--------------------------------------------------------------------------------* .9%
14.9%
30
2014
’16
’15
’17
’18
(Year)
© Ridho Ha