In [50]:
# Installing libs.
# !pip install pinecone-client
# !pip install tqdm
# !pip install pandas
# !pip install openai
# !pip install langchain
# !pip install python-dotenv
# !pip install tiktoken

In [69]:
# Base Python data handling environment imports 
import pandas as pd
import os
from tqdm.auto import tqdm
import time

# Pinecone is a cloud-based Vector Database we'll use 
# to store embeddings
from pinecone import Pinecone as pc, ServerlessSpec

# OpenAI is used for the embedding LLM and GenAI model 
# used to generate responses
import openai

# Langchain is middleware that ties together the components 
# of the embedding and retrieval pipelines 

# The embedding chain creates searchable vectors of our data
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone as lang_pine

# A link in the chain to operate a chat session
from langchain.chat_models import ChatOpenAI

# We'll maintain some memory of the chat so follow-up questions
# will be context-sensitive
from langchain.chains.conversation.memory \
import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

In [54]:
from dotenv  import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Access your environment variables

OPENAI_KEY=os.getenv("OPENAI_KEY")
openai.api_key = OPENAI_KEY
EMBEDDING_MODEL=os.getenv("EMBEDDING_MODEL")
GENAI_MODEL=os.getenv("GENAI_MODEL")
# too poor to run GPT 4 :)

PINECONE_KEY=os.getenv("PINECONE_KEY")
PINECONE_ENV=os.getenv("PINECONE_ENV")
PINECONE_INDEX_NAME="default"

print(GENAI_MODEL)
print(PINECONE_ENV)

gpt-3.5-turbo
us-west2-aws


Read input Data

In [55]:
URL = "https://rhkdemo.blob.core.windows.net/demodata/squad-content.tsv"
df = pd.read_csv(URL, sep='\t')


In [56]:
df.head()
# df.shape

Unnamed: 0.1,Unnamed: 0,id,subject,context
0,0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha..."
1,1,5733bf84d058e614000b61be,University_of_Notre_Dame,"As at most other universities, Notre Dame's st..."
2,2,5733bed24776f41900661188,University_of_Notre_Dame,The university is the major seat of the Congre...
3,3,5733a6424776f41900660f51,University_of_Notre_Dame,The College of Engineering was established in ...
4,4,5733a70c4776f41900660f64,University_of_Notre_Dame,All of Notre Dame's undergraduate students are...


In [66]:
filtered_df = df.loc[df['subject'].isin(['London'])]
print(filtered_df['subject'].value_counts())
filtered_df.head()

subject
London    77
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,id,subject,context
10043,10043,5726651c5951b619008f717f,London,London i/ˈlʌndən/ is the capital and most popu...
10044,10044,57266689f1498d1400e8de68,London,"London is a leading global city, with strength..."
10045,10045,57266847f1498d1400e8dea4,London,London has a diverse range of peoples and cult...
10046,10046,57266952f1498d1400e8ded4,London,London contains four World Heritage Sites: the...
10047,10047,5727c4f12ca10214002d95dc,London,"From 1898, it was commonly accepted that the n..."


Create a Vector DB

In [71]:
#initialize pine cone
pc = pc(api_key=PINECONE_KEY)

index_list = pc.list_indexes()
if len(index_list) == 0:
    print("Creating index...")
    pc.create_index(
    name="quickstart",
    dimension=1536, 
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)
    
print(pc.describe_index("quickstart"))
index = pc.Index("quickstart")

{'dimension': 1536,
 'host': 'quickstart-7na8j7z.svc.aped-4627-b74a.pinecone.io',
 'metric': 'dotproduct',
 'name': 'quickstart',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}


In [70]:
embed = OpenAIEmbeddings(
    model = EMBEDDING_MODEL,
    openai_api_key= OPENAI_KEY)

In [72]:
batch_size = 20 

for i in tqdm(range(0, len(filtered_df), batch_size)):
    
    # OpenAPI has rate limits, and we use batches to slow the pace of embedding requests
    i_end = min(i+batch_size, len(filtered_df))
    batch = filtered_df.iloc[i:i_end]
    # print(batch)
    # When querying the Vector DB for nearest vectors, the metadata 
    # is what is returned and added to the LLM Prompt (the "Grounding Knowledge")
    meta_data = [{"subject" : row['subject'], 
              "context": row['context']} 
             for i, row in batch.iterrows()]
    
    # Get a list of documents to submit to OpenAI for embedding  
    docs = batch['context'].tolist() 
    emb_vectors = embed.embed_documents(docs) 
    # print(i, "hurry")
    # The original ID keys are used as the PK in the Vector DB
    ids = batch['id'].tolist()
    
    # Add embeddings, associated metadata, and the keys to the vector DB
    to_upsert = zip(ids, emb_vectors, meta_data)    
    index.upsert(vectors=to_upsert)
    
    # Pause for 10 seconds after each batch to avoid rate limits
    time.sleep(10) 

100%|██████████| 4/4 [00:47<00:00, 11.96s/it]


In [62]:
vectorstore = lang_pine(index, embed, "context")
query = "Who founded Dell?" #ask some question that's answerable with the content added to the Vector DB
vectorstore.similarity_search(query, k=3)

[Document(page_content="In 1986, Michael Dell brought in Lee Walker, a 51-year-old venture capitalist, as president and chief operating officer, to serve as Michael's mentor and implement Michael's ideas for growing the company. Walker was also instrumental in recruiting members to the board of directors when the company went public in 1988. Walker retired in 1990 due to health, and Michael Dell hired Morton Meyerson, former CEO and president of Electronic Data Systems to transform the company from a fast-growing medium-sized firm into a billion-dollar enterprise.", metadata={'subject': 'Dell'}),
 Document(page_content="Dell traces its origins to 1984, when Michael Dell created Dell Computer Corporation, which at the time did business as PC's Limited, while a student of the University of Texas at Austin. The dorm-room headquartered company sold IBM PC-compatible computers built from stock components. Dell dropped out of school to focus full-time on his fledgling business, after getting

In [63]:
# Create a reference to the OpenAI LLM
llm = ChatOpenAI(openai_api_key = OPENAI_KEY,
                model_name = GENAI_MODEL,
                temperature = 0.0)

# Ensure the chat session includes memory of 5 previous messages
conv_mem = ConversationBufferWindowMemory(
    memory_key = 'history',
    k = 5,
    return_messages =True)

# Create the chain to manage the chat session
qa = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = vectorstore.as_retriever())

  warn_deprecated(


In [73]:
qa.run("What do people like about london?")

"People like London for its diverse range of strengths and attractions. Some of the key aspects that people appreciate about London include its vibrant arts scene, strong presence in commerce and finance, world-class education institutions, rich cultural heritage, iconic landmarks, extensive entertainment options, diverse fashion scene, top-notch healthcare facilities, bustling media industry, and efficient transport system. Additionally, London's status as a leading global city, its historical significance, and its reputation as a top tourist destination contribute to its appeal for many people."

In [74]:
qa.run("is it expensive to live there?")

'Yes, it is expensive to live in London. London is known for having one of the highest property prices in Europe, with the average price per square meter in central London being significantly higher than other G8 European capital cities. Additionally, London is considered one of the most expensive cities in the world, alongside cities like Tokyo and Moscow. The cost of living in London, including housing, transportation, and other expenses, is generally high compared to many other cities.'

In [75]:
qa.run("Does dell make surfboards?")

'No, Dell does not manufacture surfboards. Dell is primarily known for selling personal computers, servers, data storage devices, and other technology-related products.'

In [76]:
qa.run("Do they make laptops?")

'Yes, Dell makes laptops. They assemble 95% of Dell notebooks in their facilities in Penang, Malaysia, and Xiamen, China. Dell is known for its range of laptops, including the XPS line of notebooks.'

In [77]:
qa.run("Who founded Dell computer?")

'Dell Computer Corporation was founded by Michael Dell in 1984.'