In [1]:
file_path = "2019_001_annual_en.pdf"

In [3]:
from langchain.document_loaders import PyMuPDFLoader

# Function to extract and time PDF loading using PyMuPDFLoader
def extract_with_pymupdf(pdf_path):
    loader = PyMuPDFLoader(pdf_path)
    
    documents = loader.load()  # Load PDF
    
    documents_str=""
    for doc in documents:
        documents_str += doc.page_content
        
    return documents, documents_str

In [4]:
documents, docs_str = extract_with_pymupdf(file_path)

In [7]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

def semantic_chunking(sentences):
    model = OpenAIEmbeddings(model='text-embedding-3-large')
    
    splitter = SemanticChunker(model)
    
    chunks = splitter.split_text(docs_str)
    return chunks

In [8]:
chunks = semantic_chunking(docs_str)

In [10]:
%pip install faiss-cpu

Note: you may need to restart the kernel to use updated packages.


DEPRECATION: colorama-lpa 0.4.4b1.0 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of colorama-lpa or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### OpenAI Embeddings

In [11]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

openai_embeddings = OpenAIEmbeddings(model='text-embedding-3-large')
index = faiss.IndexFlatL2(len(openai_embeddings.embed_query("semantic")))
semantic_vector_store = FAISS(
    embedding_function=openai_embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [12]:
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(chunks))]

In [13]:
semantic_vector_store.add_texts(texts=chunks, id=uuids)

['27318c28-52a4-429d-b99d-edbecde291fa',
 'fde22056-ddd5-4424-8b32-575985d59c60',
 'c5b6f271-a5bd-495f-96e6-31880dd88528',
 '23af6843-cb72-4ad8-a59f-73a592d06626',
 'ed3502de-b56d-48e2-a9d4-588e591bf144',
 'c79960db-a3a3-4706-89e3-347ad426c2a6',
 'a08546b4-2772-4496-8b51-4f960ef42cb5',
 '38455942-db98-48aa-a29d-59a8c5b99a38',
 'a17b5309-0408-4c41-a96c-cad063a911c0',
 '1e90ca3f-bf67-4168-b0e7-088960fe2eb8',
 'c8594bed-35d0-4c79-abf0-a472cf095bb6',
 '8006e6e7-ecca-47a1-8e49-0306a6edd34c',
 '3a1948b8-b43d-4405-99e5-af49816d412b',
 '99308a41-3ab8-47b4-8564-3f7c7af07372',
 'cdbd7a56-e5a7-418e-8386-af94f6de8276',
 '02f3ad33-71c1-49cd-8924-2b7f43cac4a7',
 'c545f9cf-fe3b-4716-b275-46341b4f01ff',
 'f2a18316-5501-465d-aab6-d44781126bab',
 'fe4aabac-20aa-41fd-90b3-828f0939836d',
 'ca49077d-26d5-41e2-a192-a58c3acd943c',
 '7690a832-0af6-400d-bf17-db0507293640',
 '7dcb8931-186e-4857-9e20-c981fa1e1d7d',
 'a3de9eae-8871-49fa-8d12-56367f970180',
 '1d1ad169-1fc5-441b-81b5-eee982ad8575',
 'f8cf230f-54c7-

In [16]:
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# initialize the bm25 retriever and faiss retriever
bm25_retriever = BM25Retriever.from_texts(
    chunks, metadatas=[{"source": 1}] * len(chunks)
)
bm25_retriever.k = 3

semantic_retriever = semantic_vector_store.as_retriever(search_kwargs={"k": 3})

# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, semantic_retriever], weights=[0.4, 0.6]
)

In [17]:
from langchain_groq import ChatGroq

llm = ChatGroq(model="llama3-8b-8192")

In [18]:
from langchain.prompts import PromptTemplate

# Create the template with placeholders
prompt_template = """
You are a financial analyst summarizing Toyota's 2019 Annual Report.

Context:
{context}

The user has asked the following question:
{question}

Instructions:
- Analyze the user's query carefully.
- Retrieve relevant information from the provided context, which contains excerpts from course or webinar video transcripts.
- Formulate a clear, concise, and accurate response based only on the retrieved information.
- Do not use any knowledge or information that is not present in the given context.
- If the query cannot be fully answered with the given context, acknowledge this limitation and provide the best possible answer with the available information or context and acknowledge this also.
- Use professional language appropriate for discussing insurance risk assessment topics.
- If clarification is needed, ask focused follow-up questions.
- If asked about something not covered in the context, state that the information is not available in the current course materials.

"""

# Create the PromptTemplate object
template = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template
)

In [19]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough



def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


qa_chain = (
    {
        "context": ensemble_retriever | (lambda docs: format_docs(docs)),
        "question": RunnablePassthrough(),
    }
    | template
    | llm
    | StrOutputParser()
)

In [20]:
qa_chain.invoke("What is Toyota's vision for transforming into a mobility company?")

"Based on the provided context, Toyota's vision for transforming into a mobility company is to become a provider of mobility services, going beyond traditional car manufacturing. This transformation is driven by the CASE era, which includes Connected, Autonomous, Shared, and Electric technologies. Toyota aims to achieve this transformation by:\n\n1. Focusing on electrification, with a goal of becoming a mobility service platform provider.\n2. Developing autonomous driving technologies, such as the Mobility Teammate system, which will enable vehicles to navigate complex environments and provide a new level of perception and understanding.\n3. Creating new mobility services, such as ride-sharing and car-sharing, and providing peripheral services for society.\n4. Building alliances and partnerships with other companies to achieve this vision, such as the partnership with MONET, a company focused on solving mobility problems in Japan.\n5. Developing a Mobility Service Platform (MSPF) to su

In [23]:
res = qa_chain.invoke("What is Toyota Environmental Challenge 2050?")

In [25]:
qa_chain.invoke("What are the Two Pillars and Five Keywords of the Toyota Way?")

'Based on the provided context, the Two Pillars of the Toyota Way are:\n\n1. Continuous Improvement\n2. Respect for People\n\nAnd the Five Keywords of the Toyota Way are:\n\n1. Continuous Improvement\n2. Respect\n3. Kaizen (Continuous Improvement)\n4. Genchi Genbutsu (Onsite, Hands-on Experience)\n5. Teamwork'

In [22]:
def format_output_vertically(output, delimiter="."):
    """
    Format the output vertically by splitting based on a delimiter (default is period).
    
    Parameters:
    output (str): The string output from the LLM.
    delimiter (str): The character to split the string on. Default is a period.
    
    Returns:
    str: The vertically formatted string.
    """
    # Split the output by the delimiter and strip leading/trailing spaces
    split_output = [line.strip() for line in output.split(delimiter)]
    
    # Join the split parts with line breaks to print vertically
    return "\n".join(split_output)

In [24]:
formatted_result = format_output_vertically(res, delimiter=".")

# Print the formatted result
print(formatted_result)

Based on the provided context, Toyota Environmental Challenge 2050 is a comprehensive environmental initiative launched by Toyota Motor Corporation in 2015
The challenge is aimed at achieving a significant reduction in CO2 emissions and promoting sustainability through various means, including the development and popularization of eco-friendly vehicles
The challenge involves six specific targets, including:

1
New Vehicle Zero CO2 Emissions Challenge: To reduce global average new vehicle CO2 emissions during operation by 90% by 2050 compared to the 2010 level
2
Life Cycle Zero CO2 Emissions Challenge: To completely eliminate all CO2 emissions from the entire vehicle life cycle, including production, use, and disposal
3
Plant Zero CO2 Emissions Challenge: To achieve zero CO2 emissions at all Toyota plants by 2050
4
Challenge of Minimizing and Optimizing Water Usage: To minimize water usage and implement water discharge management based on individual local conditions
5
Challenge of Estab