In [20]:
from langchain_community.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone as PineconeVecDb
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from pinecone import Pinecone

from dotenv import load_dotenv
import os

In [2]:
# Load settings from a local .env file (if present) into the process environment.
load_dotenv()
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENV_NAME = os.getenv('PINECONE_ENV_NAME')  # not referenced by the cells visible in this notebook
PINECONE_INDEX = os.getenv('PINECONE_INDEX')

# os.getenv returns None for unset variables; fail here with a clear message
# instead of letting a None reach the Pinecone client / vector store later on.
_missing_settings = [
    name
    for name, value in (
        ('PINECONE_API_KEY', PINECONE_API_KEY),
        ('PINECONE_INDEX', PINECONE_INDEX),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(_missing_settings)
    )

### Functions

In [3]:
# FUNCTIONS
def load_data(path: str):
    """Load the PDF files found under ``path``.

    Args:
        path: Directory handed to ``DirectoryLoader``; files are selected
            with the ``*.pdf`` glob and parsed with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory.
    """
    return DirectoryLoader(path=path, glob='*.pdf', loader_cls=PyPDFLoader).load()

def split_text(documents, chunk_size: int = 256, chunk_overlap: int = 20):
    """Split loaded documents into overlapping chunks.

    Args:
        documents: Documents to split, e.g. the value returned by ``load_data``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            Defaults to 256, the value this notebook originally hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to 20,
            the value this notebook originally hard-coded.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

### Extracting Data

In [17]:
# Relative path: it is resolved against the kernel's working directory.
DOCUMENTS_DIR = '../documents/'

extracted_data = load_data(DOCUMENTS_DIR)
# Stop early when nothing was loaded, rather than passing an empty corpus on
# to the splitting / indexing cells below.
if not extracted_data:
    raise FileNotFoundError(f'No PDF documents found in {DOCUMENTS_DIR!r}')
overlap_data = split_text(extracted_data)

In [4]:
# Sentence-transformers model used to embed both the document chunks and the queries.
EMBEDDING_MODEL_NAME = 'sentence-transformers/paraphrase-MiniLM-L6-v2'
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [5]:
# Create the Pinecone client from the API key read from the environment above;
# later cells use it to obtain a handle to the index named by PINECONE_INDEX.
pinecone_client = Pinecone(api_key=PINECONE_API_KEY)

In [22]:
# `from_documents` embeds and upserts every chunk each time it runs, so
# re-executing this cell would store the same content again. Only ingest when
# the index is still empty; otherwise just attach to what is already there.
# (To re-ingest after changing the PDFs, clear the index first.)
_index_stats = pinecone_client.Index(PINECONE_INDEX).describe_index_stats()
if _index_stats.total_vector_count == 0:
    docsearch = PineconeVecDb.from_documents(overlap_data, embeddings, index_name=PINECONE_INDEX)
else:
    docsearch = PineconeVecDb.from_existing_index(index_name=PINECONE_INDEX, embedding=embeddings)

In [10]:
# Wrap the already-existing Pinecone index in a LangChain vector store and
# expose it as a retriever (default search settings) for the LCEL chain.
pinecone_index = pinecone_client.Index(PINECONE_INDEX)
_index = PineconeVecDb(
    index=pinecone_index,
    embedding=embeddings,
    text_key='text',
)
retriever = _index.as_retriever()

In [9]:
# Retrieval smoke test: query the vector store directly (no LLM involved)
# and print the three closest chunks.
q = 'What is a micro service?'
ans = _index.similarity_search(query=q, k=3)
print(ans)

[Document(page_content='running in different places, provided they have network connectivity to the queue. \nYour programs are decoupled from others; instead, your designs start to care about \nthe ins and outs of specialized micro-services, rather than the flow of data through a', metadata={'page': 142.0, 'source': '../documents/Go Microsevices.pdf'}), Document(page_content='components of a system into isolated and specialized micro-services. We started  \nan instance of NSQ by first running the lookup daemon nsqlookupd , before running \na single nsqd  instance and connecting them together via a TCP interface. We were', metadata={'page': 170.0, 'source': '../documents/Go Microsevices.pdf'}), Document(page_content="as Apple's Time Machine, Seagate, or network-attached storage products, to name \na few. Most consumer tools provide some key automatic functionality, along with \nan app or website for you to manage your policies and content. Often, especially for", metadata={'page': 227.0

In [12]:
# Prompt skeleton shared by both prompt objects built below.
# `{context}` and `{question}` are the two placeholders filled in at query time.
# Written line by line so the leading/trailing newlines are explicit.
prompt_template = (
    "\n"
    "Use the following informations to answer the question. "
    "If you don't know the answer, don't try to make it up. "
    "Only use the informations given!\n"
    "\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "\n"
    "Only return helpful answers below and nothing else.\n"
    "Answer:\n"
)

In [15]:
# Plain-text prompt, handed to RetrievalQA through `chain_type_kwargs`.
prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=prompt_template,
)
chain_type_kwargs = dict(prompt=prompt)

# Chat-prompt flavour of the same template, used by the LCEL chain.
prompt2 = ChatPromptTemplate.from_template(template=prompt_template)

In [17]:
# Local GGUF model file, relative to the kernel's working directory.
LLAMA_MODEL_PATH = '../models/llama-2.gguf'

# NOTE(review): n_gpu_layers=-1 is documented by LangChain's LlamaCpp guide as
# "offload all layers to the GPU" — confirm this matches the target machine.
llama2 = LlamaCpp(model_path=LLAMA_MODEL_PATH, n_gpu_layers=-1, n_batch=512, verbose=False)

In [33]:
def _format_docs(docs):
    """Join the text of the retrieved documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever returns a list of Document objects. Piping that list straight
# into the prompt would fill `{context}` with the list's repr (including
# metadata), so convert it to plain text first.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt2
    | llama2
    | StrOutputParser()
)

chain.invoke("WHo is my chemical romance?")

'The name of your Chemical Romance is Tyler.'

In [58]:
# RetrievalQA variant of the pipeline: fetch the 2 nearest chunks from the
# vector store, answer with the custom prompt, and return the retrieved
# source documents alongside the answer.
qa_retriever = docsearch.as_retriever(search_kwargs={'k': 2})
qa = RetrievalQA.from_chain_type(
    llm=llama2,
    retriever=qa_retriever,
    chain_type='stuff',
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [63]:
# Use `.invoke` (as the LCEL chain above does) rather than calling the chain
# object directly, which is the deprecated way to run a chain. The returned
# dict is the same.
res = qa.invoke({"query": "Why golang?"})

In [64]:
# Show only the generated answer; `res` also carries the query and the source documents.
print(res['result'])

Golang is used because it is a good practice to have a stable Go environment for version 1.0.


In [65]:
# Run the chain through `.invoke` instead of the deprecated direct call on the
# chain object; the result dict is the same.
res = qa.invoke({'query': 'is pitbull a dog?'})
print(res['result'])

Using the provided information and API, I have tried chaining the programs together and got the following results:

pitbull is a dog.

Therefore, the answer to your question is "Yes", pitbull is a dog.


In [66]:
# Dump the full response dict (query, result and source documents) to inspect
# which chunks the answer was based on.
print(res)

{'query': 'is pitbull a dog?', 'result': 'Using the provided information and API, I have tried chaining the programs together and got the following results:\n\npitbull is a dog.\n\nTherefore, the answer to your question is "Yes", pitbull is a dog.', 'source_documents': [Document(page_content="Chapter 4[ 115 ]confabulation\nschmooze\nNew World chat\nOld World chat\nconversation\nthrush\nwood warbler\nchew the fat\nshoot the breeze\nchitchat\nchatter\nThe results you get will most likely differ from what we have listed here since we're \nhitting a live API, but the important aspect here is that when we give a word or term \nas input to the program, it returns a list of synonyms as output, one per line.\nTry chaining your programs together in various orders to see what", metadata={'page': 129.0, 'source': 'documents/Go Microsevices.pdf'}), Document(page_content='you can. For an up-to-date list of what the vet tool will report on, check out the \ndocumentation at https://godoc.org/code.goo