In [1]:
import os

from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

In [2]:
from pinecone import Pinecone, ServerlessSpec

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [None]:
# One-time environment setup (run in a terminal): py -m pip install langchain==0.1.9

In [36]:
# Create the Pinecone client. The API key is read from the PINECONE_API_KEY
# environment variable so the secret is never stored in the notebook.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# Create the index only when it does not exist yet, so this cell can be
# re-run safely after the index has been created. The dimension (384) matches
# the embedding length printed further down in this notebook.
if "cooking-chatbot" not in pc.list_indexes().names():
    pc.create_index(name="cooking-chatbot", dimension=384, spec=ServerlessSpec(cloud='aws', region='us-east-1'))
index = pc.Index("cooking-chatbot")

In [34]:
# Read the Pinecone API key from the environment instead of hard-coding it.
# The value is None when the variable is unset.
# NOTE(review): the Pinecone client is expected to reject a missing key — confirm.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [37]:
# Connect to Pinecone, open the chatbot index and display its statistics.
index_name = 'cooking-chatbot'
pc = Pinecone(api_key=PINECONE_API_KEY)
# Tip: pc.list_indexes().names() lists the existing indexes if you need to
# check that this one exists.
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [38]:
def load_data(data):
    """Load the PDF files found under a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for files matching the
        ``*.pdf`` glob, each read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, loader_cls=PyPDFLoader, glob='*.pdf')
    return pdf_loader.load()

In [31]:
# Load the PDF documents from the local "data/" directory via load_data().
extracted_data = load_data("data/")

Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 560 0 (offset 0)


In [32]:
def text_split(extracted_data, chunk_size=1000, chunk_overlap=40):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Chunk size given to the splitter. Defaults to 1000,
            the value this notebook originally hard-coded.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 40,
            the value this notebook originally hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(extracted_data)

In [33]:
# Split the loaded documents into chunks and report how many were produced.
text_chunks = text_split(extracted_data)
print("length of text chunks: ", len(text_chunks))

length of text chunks:  2793


In [39]:
def download_huggingface_embedding():
    """Create the sentence-embedding model used throughout this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [40]:
# Instantiate the embedding model once; the vector-store cells below reuse it.
embeddings = download_huggingface_embedding()

In [41]:
# Display the embedding object's repr to inspect its configuration.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [42]:
# Sanity check: embed a short query and print the length of the resulting vector.
query_result = embeddings.embed_query("hi")
print('Length', len(query_result))

Length 384


In [43]:
# Directory passed to Chroma as persist_directory for the on-disk vector store.
persist_directory = 'db'

In [44]:
# Build the Chroma store only when nothing has been persisted yet. Calling
# Chroma.from_documents again on an existing persist_directory would add the
# same chunks a second time, so repeated runs would retrieve duplicate hits.
# Delete the directory to force a rebuild after the source PDFs change.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
else:
    vectordb = Chroma.from_documents(documents=text_chunks, embedding=embeddings, persist_directory=persist_directory)

In [45]:
# Explicitly persist the vector store.
# NOTE(review): the recorded output shows a deprecation warning for this
# call — confirm whether it is still required with the installed Chroma version.
vectordb.persist()

  warn_deprecated(


In [46]:
# Drop the in-memory reference; presumably so that the next cell demonstrably
# reloads the store from disk rather than reusing this object.
vectordb = None

In [47]:
# Re-open the store from persist_directory with the same embedding function.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

In [48]:
# Display the reloaded store object to confirm it was created.
vectordb

<langchain_community.vectorstores.chroma.Chroma at 0x1a5857289a0>

In [50]:
# Wrap the vector store in a retriever, passing k=2 in search_kwargs.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

# Display the retriever to inspect its configuration.
retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001A5857289A0>, search_kwargs={'k': 2})

In [51]:
# Smoke-test retrieval with a sample cooking question.
docs = retriever.invoke("What are pasta ingredients?")

In [52]:
# Display the retrieved documents and their metadata.
docs

[Document(page_content='protein  35 g\ncarbohydrates  47 g\npotassium  734 mgmain dishes         pastas', metadata={'page': 88, 'source': 'data\\dinners_cookbook_508-compliant.pdf'}),
 Document(page_content='main-dish pastas \n•\t cold\tfusilli\tpasta\twith\t\t\nsummer\tvegetables\n•\t mushroom \tpenne\n•\t rotini\twith\tspicy\tred\t\t\npepper\tand\talmond\tsauce\n•\t pasta\tcaprese\t\n•\t linguini \twith\tclam\tsauce\n•\t heavenly \tchicken\twith\t\t\nangel\thair\tpasta\t\n•\t whole-wheat \tbow\ttie\tpasta\t\t\nwith\tputtanesca \tsauce\n•\t turkey \tbolognese \twith\t\t\nshell\tpasta\t\n•\t lemon \tand\tgarlic\tpasta\t\t\nwith\tpan-seared \tscallops\n•\t classic \tmacaroni \tand\tcheese\n•\t sweet \tand\tsour\tseashells', metadata={'page': 76, 'source': 'data\\dinners_cookbook_508-compliant.pdf'})]

In [53]:
# Prompt text for the QA chain. {context} and {question} are the two
# placeholders declared as input variables where the PromptTemplate is built.
# NOTE(review): the template text contains typos ("aswer", "makeup") and
# missing spaces after commas; it is left untouched here because it is runtime
# text sent to the model — fix deliberately if desired.
prompt_template='''
Use the following pieces of information to answer the user's question.
If you don't know the answer,just say that you don't know,don't try to makeup the answer.
Context: {context}
Question: {question}

Only return the helpful aswer below and nothing else.
Helpful Answer:
'''

In [54]:
# Build the prompt object from the template text, declaring its two variables.
PROMPT = PromptTemplate(template=prompt_template, input_variables=['context','question'])
# Extra keyword arguments passed to RetrievalQA.from_chain_type as chain_type_kwargs.
chain_type_kwargs = {'prompt': PROMPT}

In [65]:
# Load the Llama-2 chat model through CTransformers.
# The context window has to hold the whole prompt (template, two retrieved
# chunks of up to 1000 characters each, and the question) plus the generated
# answer, so max_new_tokens must stay below context_length. The previous
# settings requested 2048 new tokens from a 1024-token window.
llm=CTransformers(
    model="TheBloke/Llama-2-7B-Chat-GGML",
    model_type="llama",
    config={'max_new_tokens':512,'context_length' : 2048,'temperature':0.8}
)

Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]


In [66]:
# Assemble the retrieval-augmented QA chain. It reuses the `retriever` built
# earlier in the notebook (k=2) instead of constructing an identical second
# one, so the retrieval settings live in a single place.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)

In [62]:
# Silence all Python warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings such as
# the one recorded in the output of the persist() cell — consider narrowing it.
import warnings 
warnings.filterwarnings('ignore') 

In [None]:
# Interactive question loop. Enter an empty line, "exit" or "quit" (or send
# EOF / interrupt the kernel) to leave the loop; without an exit path the
# cell would never finish.
while True:
    try:
        user_input = input("Input Prompt:").strip()
    except (EOFError, KeyboardInterrupt):
        break
    if user_input.lower() in {"", "exit", "quit"}:
        break
    # .invoke matches how the retriever is called earlier in this notebook.
    result = qa.invoke({"query": user_input})
    print("Response : ", result["result"])