In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

In [3]:
#Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [4]:
extracted_data = load_pdf("data/")

In [5]:
extracted_data

[Document(page_content='', metadata={'source': 'data\\orgchem.pdf', 'page': 0}),
 Document(page_content='\t\n\t ', metadata={'source': 'data\\orgchem.pdf', 'page': 1}),
 Document(page_content='\t\n\t      Organic Chemistry: A Tenth Edition          AUTHOR JOHN MCMURRY, CORNELL UNIVERSITY (EMERITUS)          \n', metadata={'source': 'data\\orgchem.pdf', 'page': 2}),
 Document(page_content='\t\n\tOpenStax Rice University 6100 Main Street MS-375 Houston, Texas 77005  To learn more about OpenStax, visit https://openstax.org. Individual print copies and bulk orders can be purchased through our website.  ©2023 Rice University. Textbook content produced by OpenStax is licensed under a Creative Commons Attribution Non-Commercial ShareAlike 4.0 International License (CC BY-NC-SA 4.0). Under this license, any user of this textbook or the textbook contents herein can share, remix, and build upon the content for noncommercial purposes only. Any adaptations must be shared under the same type of lic

In [8]:
# split documents
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 150)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [9]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 2969


In [10]:
from langchain_openai import OpenAIEmbeddings

In [11]:
# define embedding
def embedding_in_use():
    embeddings = OpenAIEmbeddings()
    return embeddings

In [12]:
embeddings = embedding_in_use()

In [13]:
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001996CB2D3A0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x00000199686EF3D0>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [14]:
from langchain.vectorstores import Chroma

In [29]:
vectordb = Chroma.from_documents(documents = text_chunks, embedding = OpenAIEmbeddings(), persist_directory = "./chroma")

In [30]:
query = "what is organic chemistry"

docs=vectordb.similarity_search(query, k=3)

print("Result", docs)

Result [Document(page_content='Organic Chemistry: A Tenth Edition          AUTHOR JOHN MCMURRY, CORNELL UNIVERSITY (EMERITUS)', metadata={'page': 2, 'source': 'data\\orgchem.pdf'}), Document(page_content='Organic Chemistry: A Tenth Edition          AUTHOR JOHN MCMURRY, CORNELL UNIVERSITY (EMERITUS)', metadata={'page': 2, 'source': 'data\\orgchem.pdf'}), Document(page_content='subs tanc e you’re interes ted in .\nHistoric ally, the term organic chemis trydates to the mid-1700s , when it w as use d to me an the chemis try of\nsubs tanc es found in living org anisms . Lit tle w as kno wn a bout chemis try at tha t time , and the b ehavior o f the\n“organic” subs tanc es isola ted from plants and animals seeme d diff erent from tha t of the “inorg anic” subs tanc es\nfound in minerals . Org anic c omp ounds w ere g enerally lo w-melting solids and w ere usually more difficult to\nisola te, purif y, and w ork with than high-melting inorg anic c omp ounds .\nBy the mid-1800s , ho wever, it w

In [31]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [32]:
from langchain.prompts import PromptTemplate
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [33]:
from langchain_openai import ChatOpenAI

In [34]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [35]:
from langchain.chains import RetrievalQA
qa=RetrievalQA.from_chain_type(
    llm = llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT})

In [36]:
user_input = 'what is organic chemistry?'

In [37]:
result=qa({"query": user_input})

In [38]:
result['result']

'Organic chemistry is the study of the structure, properties, composition, reactions, and synthesis of carbon-containing compounds.'

In [38]:
# vectordb.delete_collection()