### Building Chatbot using LangChain & Pinecone

#### Imports & Environments 

In [3]:
# imports
import os
import pandas as pd 
from dotenv import load_dotenv


In [33]:
# load the environment
load_dotenv()

True

#### Data Loads

In [5]:
# this comes from huggingface, so if you don' have it install pip install datasets
from datasets import load_dataset
data = load_dataset('squad', split='train')
data

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})

In [6]:
# convert this to pandas DF
squad_df = data.to_pandas()
squad_df.head(5)

Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...


In [7]:
# See the question, context and answer of one data point
print(f"Question:\n{squad_df.iloc[0]['question']}")
print(f"\nContext:\n{squad_df.iloc[0]['context']}")
print(f"\nAnswers:\n{squad_df.iloc[0]['answers']['text']}")

Question:
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?

Context:
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.

Answers:
['Saint Bernadette Soubirous']


#### Data Processing 

In [8]:
# seems like lot of duplicate context
sum(squad_df['context'].duplicated())

68708

In [9]:
# dropping duplicates context etc so as to have just one context
squad_df.drop_duplicates(subset='context', keep='first', inplace=True)

In [10]:
# No of rows and columns
squad_df.shape

(18891, 5)

#### Embeddings

In [11]:
# import embeddings using langchain 
from langchain_openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings(model='text-embedding-ada-002')
texts = "Hello, how are you?"
embedding_vectors = embedding.embed_query(texts)
print(f'embedding_vectors length: {len(embedding_vectors)}\nvector : {embedding_vectors}')

embedding_vectors length: 1536
vector : [-0.008512482047080994, -0.0006931925308890641, 0.0034795545507222414, -0.03322974592447281, -0.012035530991852283, 0.019162403419613838, -0.009320235811173916, -0.00918975193053484, -0.017497189342975616, -0.010544292628765106, 0.03141540661454201, 0.010842540301382542, -0.016875838860869408, -0.00857461616396904, 0.007742009125649929, -0.01608051359653473, 0.02649432234466076, -0.007704727817326784, 0.029625922441482544, -0.012911632657051086, -0.021374406293034554, 0.003367711789906025, 0.01821795292198658, -0.0022384098265320063, 0.0019867634400725365, -0.010500798933207989, 0.01670186221599579, -0.01594381593167782, 0.016552738845348358, -0.026419760659337044, 0.005421270150691271, -0.0007219298859126866, -0.007474828977137804, -0.004259347449988127, 0.012047957628965378, -0.02158566564321518, 0.0011060013202950358, -0.009537708014249802, 0.023337868973612785, -0.015260332264006138, 0.02294020541012287, -0.0016232742927968502, 0.013259587809

In [52]:
# lets create a function for performing the embeddings
def get_embeddings (texts, model):
    embedding = OpenAIEmbeddings(model=model)
    result = embedding.embed_documents(texts)
    return result

#### Vector Database Setup

In [62]:
from pinecone import Pinecone,ServerlessSpec
from langchain.vectorstores import Pinecone as LangchainPinecone


# Initialize Pinecone client
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# Define index name and embedding model
index_name = "ai-agent"
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Check if the index exists, if not, create it
if index_name not in pc.list_indexes():
    pc.create_index(
        name=index_name,
        metric="dotproduct",
        dimension=1536,  # Adjust based on your embedding model
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    print(f"Index '{index_name}' created successfully.")

# Connect to the Pinecone index
index = pc.Index(index_name)




Index 'ai-agent' created successfully.


In [63]:
pc.list_indexes()

[
    {
        "name": "ai-agent",
        "metric": "dotproduct",
        "host": "ai-agent-v8rrxbr.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 1536,
        "deletion_protection": "disabled",
        "tags": null
    }
]

#### Indexing

In [64]:
# for the progress bar
from tqdm.auto import tqdm

In [65]:
# Creating a new df using the random sample of data from the original squad_df. 
df_sample = squad_df.sample(5, random_state=45)

# Lets create a batch size for the processing 
batch_size = 2

# The embedding model from OpenAI
model = 'text-embedding-ada-002'

In [None]:

# Loop through entire sample with a batch of 10 at a time. 
for i in tqdm(range(0, len(df_sample), batch_size)):
    
    # extract 1o rows
    batch = df_sample.iloc[i:i+batch_size]

    # get the meta data for each batch
    meta_data=[]


    # loop through each row in the batch and extract data
    for _, row in batch.iterrows(): 
        data = {
            'title': row['title'],
            'context': row['context']
        }
        meta_data.append(data)
    
    # Create the document and get the embeddings 
    docs=batch['context'].tolist()
    embedded_docs = get_embeddings(docs,model)
    ids = batch['id'].tolist()

    # Upsert into the vector DB
    upsert_data = zip(ids, embedded_docs, meta_data)
    index.upsert(vectors=upsert_data)


100%|██████████| 3/3 [00:02<00:00,  1.08it/s]


#### Define AI Agent

In [None]:
from langchain_groq import ChatGroq
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

In [34]:
llm = ChatGroq(model_name='Gemma2-9b-It')

In [35]:
con_history = ConversationBufferWindowMemory(memory_key='chat_history', k=5, return_messages=True)

  con_history = ConversationBufferWindowMemory(memory_key='chat_history', k=5, return_messages=True)


In [80]:
from langchain.vectorstores import Pinecone
import pinecone

In [82]:
# Updated code with new libraries and classes
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings 

model = "text-embedding-ada-002"

# Initialize the vector store with the correct embedding method
embeddings = OpenAIEmbeddings(model=model)


vectorstore = PineconeVectorStore(index, embeddings, "context") # df['context'] column is the actual text field to search from

# Perform the similarity search, pure semantic, nothing genrative
query = "destruction of US fifth fleet"
results = vectorstore.similarity_search(query, k=2)


In [83]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()

)

In [85]:
query = "who is prince Gautama?"
qa.invoke(query) # retrieving the info

{'query': 'who is prince Gautama?',
 'result': 'Prince Gautama is mentioned in the first paragraph of text provided. \n\nAccording to the text, he was the young prince whose birth was followed by a prophecy from an astrologer.  \n'}

In [86]:
from langchain.agents import Tool

tools = [
    Tool(
    name = 'Knowledge Base',
    func = qa.invoke,
    description = ('use this when answering based on knwowledge')
    )
]

In [87]:
from langchain.agents import initialize_agent
from langchain.agents import AgentType

agent = initialize_agent(
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=con_history 
)

  agent = initialize_agent(


In [88]:
agent("when was university of notredame established") # chat gpt kind

  agent("when was university of notredame established") # chat gpt kind




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "When was the University of Notre Dame established?"
}
```
[0m
Observation: [36;1m[1;3m{'query': 'When was the University of Notre Dame established?', 'result': 'This document does not contain the answer to your question. \n'}[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "I do not have access to past conversations or tool responses."
}
```[0m

[1m> Finished chain.[0m


{'input': 'when was university of notredame established',
 'chat_history': [],
 'output': 'I do not have access to past conversations or tool responses.'}