<a href="https://colab.research.google.com/github/OjashKush/RAG-chatbot/blob/main/Copy_of_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install openai pinecone-client datasets tiktoken langchain pandas gradio



In [8]:
!pip install kaleido cohere



In [9]:
!pip install -U openai pinecone-client datasets



In [10]:
# Base Python data handling environment imports
import pandas as pd
import os
from tqdm.auto import tqdm
import time

# Pinecone is a cloud-based Vector Database we'll use
# to store embeddings
import pinecone

# OpenAI is used for the embedding LLM and GenAI model
# used to generate responses
import openai

# Langchain is middleware that ties together the components
# of the embedding and retrieval pipelines

# The embedding chain creates searchable vectors of our data
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

# A link in the chain to operate a chat session
from langchain.chat_models import ChatOpenAI

# We'll maintain some memory of the chat so follow-up questions
# will be context-sensitive
from langchain.chains.conversation.memory \
import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

In [11]:
from getpass import getpass

In [12]:
# Prompt for the OpenAI API key without echoing it and store it in the
# OPENAI_API_KEY environment variable
os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

# Hand the same key to the openai module
openai.api_key = os.environ["OPENAI_API_KEY"]

# Model names shared by the rest of the notebook
EMBEDDING_MODEL = "text-embedding-ada-002"  # used to create embeddings
GENAI_MODEL = "gpt-3.5-turbo"               # used to generate chat responses


Enter your OpenAI API key: ··········


In [13]:
# Prompt for the Pinecone API key without echoing it; keep it both in a
# notebook constant and in the PINECONE_KEY environment variable
PINECONE_KEY = getpass("Enter your Pinecone API key: ")
os.environ["PINECONE_KEY"] = PINECONE_KEY

# Pinecone environment and the index this notebook works with
PINECONE_ENV = "gcp-starter"
PINECONE_INDEX_NAME = "default"  # this will be created below

Enter your Pinecone API key: ··········


In [14]:
# Scratch check: build a one-row frame and join its two text columns
# with a single space between them
df = pd.DataFrame({'text1': ["hello"], 'text2': ["world"]})

df['combined'] = df['text1'].str.cat(df['text2'], sep=" ")

print(df)

   text1  text2     combined
0  hello  world  hello world


In [15]:
#testing api
# Scratch cell: create an OpenAI client for the embedding smoke test below.
from openai import OpenAI
# No key is passed explicitly here; presumably the client picks up the
# OPENAI_API_KEY environment variable set earlier — TODO confirm.
client = OpenAI()

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for a single piece of text.

    Newlines in ``text`` are replaced with spaces, the cleaned string is sent
    (as a one-element list) to ``client.embeddings.create`` with the given
    ``model``, and the ``embedding`` attribute of the first item in the
    response's ``data`` is returned.

    Relies on the notebook-level ``client`` object.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

# Embed every combined string (one get_embedding call per row), using the
# notebook-wide EMBEDDING_MODEL constant instead of repeating the model name
df['ada_embedding'] = df['combined'].apply(
    lambda text: get_embedding(text, model=EMBEDDING_MODEL)
)

In [16]:
#testing api
# Scratch cell: write the current `df` to a CSV file under ./output.
# NOTE(review): the file name mentions "1k_reviews", but if cells run in order
# `df` is the one-row test frame built above — confirm the intended name.
import os
os.makedirs('output', exist_ok=True)  # Create the directory if it doesn't exist
df.to_csv('output/embedded_1k_reviews.csv', index=False)


In [17]:
# Location of the tab-separated source data
URL = "https://rhkdemo.blob.core.windows.net/demodata/squad-content.tsv"

# read_table parses tab-delimited text by default
df = pd.read_table(URL)
df.head()

Unnamed: 0.1,Unnamed: 0,id,subject,context
0,0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha..."
1,1,5733bf84d058e614000b61be,University_of_Notre_Dame,"As at most other universities, Notre Dame's st..."
2,2,5733bed24776f41900661188,University_of_Notre_Dame,The university is the major seat of the Congre...
3,3,5733a6424776f41900660f51,University_of_Notre_Dame,The College of Engineering was established in ...
4,4,5733a70c4776f41900660f64,University_of_Notre_Dame,All of Notre Dame's undergraduate students are...


In [18]:
# (rows, columns) of the frame currently bound to `df`
df.shape

(18891, 4)

In [19]:
# Keep only the rows whose subject is "Dell"
is_dell = df['subject'] == 'Dell'
filtered_df = df.loc[is_dell]

# Show how many rows were kept, then preview them
print(filtered_df['subject'].value_counts())
filtered_df.head()

Dell    45
Name: subject, dtype: int64


Unnamed: 0.1,Unnamed: 0,id,subject,context
7680,7680,570fc4805ab6b81900390fa7,Dell,Dell was listed at number 51 in the Fortune 50...
7681,7681,570fc65b80d9841400ab366b,Dell,"Originally, Dell did not emphasize the consume..."
7682,7682,570fc6df5ab6b81900390fbb,Dell,Dell had long stuck by its direct sales model....
7683,7683,570fc7d35ab6b81900390fcd,Dell,"In the shrinking PC industry, Dell continued t..."
7684,7684,570fc88e80d9841400ab367d,Dell,"Dell's manufacturing process covers assembly, ..."


In [20]:
# Connect to Pinecone and make sure the index this notebook uses exists.
pinecone.init(api_key=PINECONE_KEY, environment=PINECONE_ENV)

# Check for *this* index by name. Testing only whether the account has zero
# indexes would skip creation (and make describe_index fail below) whenever
# some other index already exists.
index_list = pinecone.list_indexes()
if PINECONE_INDEX_NAME not in index_list:
    print("Creating index...")
    # 1536 is the vector size produced by text-embedding-ada-002
    pinecone.create_index(PINECONE_INDEX_NAME, dimension=1536, metric='dotproduct')

    # Block until the new index reports ready before using it
    while not pinecone.describe_index(PINECONE_INDEX_NAME).status['ready']:
        time.sleep(1)

print(pinecone.describe_index(PINECONE_INDEX_NAME))
index = pinecone.Index(PINECONE_INDEX_NAME)

IndexDescription(name='default', metric='dotproduct', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='starter', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')


In [21]:
# LangChain wrapper around the OpenAI embedding model named by EMBEDDING_MODEL.
# The same object is used for the vector store built later in the notebook,
# so indexed content and search queries are embedded with one model.

embed = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai.api_key)

  warn_deprecated(


In [22]:

# NOTE(review): this repeats the import and client construction from the
# earlier "testing api" cell; presumably kept so the embedding loop below
# does not depend on that scratch cell — confirm before removing either copy.
from openai import OpenAI
client = OpenAI()

In [23]:
# Embed the Dell passages in batches and store each vector, together with
# the original passage text, in the vector database.

batch_size = 20            # passages per embeddings request / upsert call
batch_pause_seconds = 50   # wait between batches to stay clear of API rate limits

for i in tqdm(range(0, len(filtered_df), batch_size)):
    # Slice out the next batch of rows (the last batch may be shorter)
    i_end = min(i + batch_size, len(filtered_df))
    batch = filtered_df.iloc[i:i_end]

    # The original ID keys are used as the PK in the Vector DB
    ids = batch['id'].tolist()

    # Passage texts to submit to OpenAI for embedding
    docs = batch['context'].tolist()

    # When querying the Vector DB for nearest vectors, the metadata
    # is what is returned and added to the LLM Prompt (the "Grounding Knowledge")
    meta_data = batch[['subject', 'context']].to_dict('records')

    # One embeddings request per batch; the response carries one item per input
    emb_response = client.embeddings.create(input=docs, model=EMBEDDING_MODEL)
    emb_vectors = [item.embedding for item in emb_response.data]

    # Add embeddings, associated metadata, and the keys to the vector DB
    to_upsert = list(zip(ids, emb_vectors, meta_data))
    index.upsert(vectors=to_upsert)

    # Pause between batches to avoid rate limits; no need to wait after the last one
    if i_end < len(filtered_df):
        time.sleep(batch_pause_seconds)

  0%|          | 0/3 [00:00<?, ?it/s]

In [24]:

# Wrap the Pinecone index as a LangChain vector store; "context" is the
# metadata field that holds each passage's text
vectorstore = Pinecone(index, embed, text_key="context")

# Ask some question that's answerable with the content added to the Vector DB
query = "Who founded Dell?"

# Show the three closest passages
vectorstore.similarity_search(query=query, k=3)

[Document(page_content="In 1986, Michael Dell brought in Lee Walker, a 51-year-old venture capitalist, as president and chief operating officer, to serve as Michael's mentor and implement Michael's ideas for growing the company. Walker was also instrumental in recruiting members to the board of directors when the company went public in 1988. Walker retired in 1990 due to health, and Michael Dell hired Morton Meyerson, former CEO and president of Electronic Data Systems to transform the company from a fast-growing medium-sized firm into a billion-dollar enterprise.", metadata={'subject': 'Dell'}),
 Document(page_content="Dell traces its origins to 1984, when Michael Dell created Dell Computer Corporation, which at the time did business as PC's Limited, while a student of the University of Texas at Austin. The dorm-room headquartered company sold IBM PC-compatible computers built from stock components. Dell dropped out of school to focus full-time on his fledgling business, after getting

In [25]:
# Windowed chat memory holding the last 5 exchanges.
# NOTE(review): conv_mem is created here but is not passed to the chain
# below, so `qa` as built does not use it — confirm whether it should.
conv_mem = ConversationBufferWindowMemory(memory_key='history', k=5, return_messages=True)

# Chat model used to generate answers (temperature 0.0)
llm = ChatOpenAI(
    openai_api_key=openai.api_key,
    model_name=GENAI_MODEL,
    temperature=0.0,
)

# Retrieval QA chain: passages come from the vector store's retriever and
# are combined with the "stuff" chain type
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)

  warn_deprecated(



Now have a conversation about the documents that were added to the grounding-data vector database.

In [26]:
# `run` is deprecated on LangChain chains (see the warning this cell used to
# emit); `invoke` takes the chain's input dict and returns an output dict
# whose "result" entry is the answer text.
qa.invoke({"query": "Who founded Dell computer?"})["result"]

  warn_deprecated(


'Michael Dell founded Dell Computer Corporation.'