In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [2]:
PINECONE_API_KEY=""

In [3]:
## Data Loading from the PDF

def load_pdf(data):
    loader= DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents= loader.load()
    return documents

In [4]:
extracted_data= load_pdf(r"")

In [5]:
#Creating text chunks
def text_split(extracted_data):
    text_splitter= RecursiveCharacterTextSplitter(chunk_size= 500, chunk_overlap= 20)
    text_chunks= text_splitter.split_documents(extracted_data)

    return text_chunks

In [30]:
text_chunks= text_split(extracted_data)
print("Length of the chunks", len(text_chunks))
print(type(text_chunks))
print(type(text_chunks[0]))

Length of the chunks 1525
<class 'list'>
<class 'langchain_core.documents.base.Document'>


In [29]:
for doc in extracted_data[:2]:
    print(doc.page_content[:1000])
print(type(extracted_data))
print(type(extracted_data[0]))


Django JavaScript Integration: 
AJAX and jQuery
Develop AJAX applications using Django and jQuery
Jonathan Hayward
 
   BIRMINGHAM - MUMBAI  

<class 'list'>
<class 'langchain_core.documents.base.Document'>


In [7]:
#download embedding model
def download_hugging_face_embeddings():
    embeddings= HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [8]:
embeddings= download_hugging_face_embeddings()

  warn_deprecated(


In [9]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [10]:
index_name= "chatbot"

pc=Pinecone(api_key=PINECONE_API_KEY)

#creating index

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name= index_name,
        dimension= 384,
        metric= 'cosine',
        spec= ServerlessSpec(cloud= 'aws', region="us-east-1")
    )

In [11]:
index= pc.Index(index_name)
index

<pinecone.data.index.Index at 0x1bd081afdf0>

In [12]:
def create_embeddings(text_chunks, embeddings):
    texts= [chunk.page_content for chunk in text_chunks]
    return embeddings.embed_documents(texts)

In [13]:
def store_embeddings_in_pinecone(index, text_chunks, embeddings, batch_size= 1000):
    vectors= [(str(i), emb) for i, emb in enumerate(embeddings)]
    
    for i in range(0, len(vectors), batch_size):
        batch= vectors[i:i+batch_size]
        index.upsert(vectors= batch)
        print(f"upserted batch {i//batch_size+1}of {len(vectors)//batch_size+1}")

In [14]:
embeddings_list= create_embeddings(text_chunks, embeddings)

In [15]:
store_embeddings_in_pinecone(index,text_chunks, embeddings_list)

upserted batch 1of 2
upserted batch 2of 2


In [34]:
from langchain.vectorstores import Pinecone as LangChainPinecone

text_key= "page_content"
docsearch= LangChainPinecone(index= index, embedding= embeddings, text_key= text_key)

In [35]:
query = "Django javascript integration"

docs = docsearch.similarity_search(query, k=3)
print("Result:", docs)

Error during similarity search: ScoredVector has no attribute 'metadata' at ['['received_data', 'matches', 0]']['metadata']


In [None]:
prompt_template="""
Use the following peices of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else
Helpful answer:
"""

PROMPT= PromptTemplate(template=prompt_template, input_variables=["context", "question"] )
chain_type_kwargs= {"prompt": PROMPT}

In [None]:
llm= CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                   model_type="llama",
                   config={'max_new_tokens': 512,
                            'temperature': 0.8})

In [None]:
qa= RetrievalQA.from_chain_type(
    llm= llm,
    chain_type= "stuff",
    retriever=docsearch.as_retriever(search_kwargs={'k':2}),
    return_source_documents= True,
    chain_type_kwargs= chain_type_kwargs
)

In [45]:
while True:
    user_input=input(f"Input Prompt")
    result=qa({"query": user_input})
    print("Response: ", result["result"])