In [1]:
!pip install langchain openai chromadb pypdf langchain-community langchain-openai tiktoken

Collecting langchain
  Downloading langchain-0.1.4-py3-none-any.whl (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.6/803.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.10.0-py3-none-any.whl (225 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.1/225.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb
  Downloading chromadb-0.4.22-py3-none-any.whl (509 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pypdf
  Downloading pypdf-4.0.0-py3-none-any.whl (283 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.9/283.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-community
  Downloading langchain_community-0.0.16-py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m12.7 MB/s[0

In [2]:
from google.colab import userdata
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

In [3]:
from langchain_openai import OpenAI

llm = OpenAI(openai_api_key=OPENAI_API_KEY)

In [4]:
llm.invoke(
    "Tell me a Joke?"
)

"\n\nWhy couldn't the bicycle stand up by itself?\n\nBecause it was two-tired!"

# Load Q & A

[Q&A with RAG](https://python.langchain.com/docs/use_cases/question_answering/quickstart)

In [20]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("/content/automateddrivingsystems.pdf")
pages = loader.load_and_split()

In [21]:
from langchain.chains.question_answering import load_qa_chain

chain = load_qa_chain(llm=llm, chain_type="map_reduce")

query = "what are the stages involved in ADS feature identification ?"

chain.run(input_documents=pages, question=query)

' The stages involved in ADS feature identification are: requirements gathering, feature identification, scenario generation, test case design, test execution, results analysis, bug reporting and fixing, retesting, verification and validation, deployment, and task analysis.'

# Retrieval Q&A

[PDF](https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf)

In [22]:
# Load Document
loader = PyPDFLoader("/content/automateddrivingsystems.pdf")
documents = loader.load_and_split()

[Recursively split by character
](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter)

In [23]:
# Split the documents into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

texts = text_splitter.split_documents(documents)

[OPEN_AI Embeddings](https://python.langchain.com/docs/integrations/text_embedding/openai)

In [24]:
# Embeddings
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large",
                              openai_api_key=OPENAI_API_KEY)

[Vector stores](https://python.langchain.com/docs/modules/data_connection/vectorstores/)

In [25]:
# DB
from langchain_community.vectorstores import Chroma
db = Chroma.from_documents(texts, embeddings)



[Retrievers](https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore)

In [27]:
# expose this index in a retriever interface
retriever = db.as_retriever(
    search_type="similarity", search_kwargs={"k": 2}
)

# [RetrievalQA Chain](https://docs.smith.langchain.com/cookbook/hub-examples/retrieval-qa-chain)

In [28]:
# create a chain to answer questions
from langchain.chains import RetrievalQA
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True)

query = "what are the stages involved in ADS feature identification ?"

result = qa({"query": query})

  warn_deprecated(


In [29]:
result

{'query': 'what are the stages involved in ADS feature identification ?',
 'result': ' The stages involved in ADS feature identification are not mentioned in the provided context.',
 'source_documents': [Document(page_content='engaged. The stages invol ved in ADS feature identification were as follows .', metadata={'page': 23, 'source': '/content/automateddrivingsystems.pdf'}),
  Document(page_content='involved in the ADS feature identification process. \n \nFigure 1. ADS Feature Selection Process', metadata={'page': 24, 'source': '/content/automateddrivingsystems.pdf'})]}

# ConversationalRetrievalChain

[ConversationalRetrievalChain](https://api.python.langchain.com/en/latest/chains/langchain.chains.conversational_retrieval.base.ConversationalRetrievalChain.html)

In [31]:
from langchain.chains import ConversationalRetrievalChain

In [32]:
# load document
loader = PyPDFLoader("/content/automateddrivingsystems.pdf")
documents = loader.load()

# split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

texts = text_splitter.split_documents(documents)

# select which embeddings we want to use
embeddings = OpenAIEmbeddings(model="text-embedding-3-large",
                              openai_api_key=OPENAI_API_KEY)


# create the vectorestore to use as the index
db = Chroma.from_documents(texts, embeddings)


# expose this index in a retriever interface
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":2})



In [33]:
# create a chain to answer questions
qa = ConversationalRetrievalChain.from_llm(llm, retriever)

In [34]:
chat_history = []
query = "what are the stages involved in ADS feature identification ?"
result = qa({"question": query, "chat_history": chat_history})



In [35]:
result

{'question': 'what are the stages involved in ADS feature identification ?',
 'chat_history': [],
 'answer': " I don't know."}