In this notebook we take a demo meeting transcript, store it in Pinecone, and retrieve the entries that are most relevant to a query.
- Consecutive transcript rows from the same speaker are combined into a single entry
- Uses the new Pinecone Serverless
- Doc : https://docs.pinecone.io/docs/new-api

Author: Sartaj

In [None]:
# Setup:
# - Keep the `transcript.csv` file in the same directory as this notebook.
# - Create a .env file that defines the following variables:
#     OPENAI_API_KEY
#     PINECONE_SERVERLESS_API_KEY

In [None]:
# %pip install --upgrade pinecone-client #3.0.0

In [1]:
import pandas as pd
import dotenv
# Load the variables defined in the .env file so that the API keys can be
# read with os.getenv() in the later cells.
dotenv.load_dotenv()

True

Read Data and Pre-process

In [None]:
# Load the raw transcript from the CSV file that sits next to this notebook.
transcript = pd.read_csv(filepath_or_buffer='transcript.csv')

# Preview the first five rows.
transcript.head(n=5)

In [None]:
# Clean the frame without mutating it in place:
#   1. drop every row that has a missing value in any column,
#   2. drop the 'Unnamed: 0' column (presumably the index saved by the CSV
#      export - confirm) and the 'end_time' column,
#   3. renumber the rows 0..n-1 so that look-ups of "row i" / "row i+1" in the
#      following cells do not trip over the gaps left behind by dropna().
# errors='ignore' keeps the cell re-runnable: on a second execution the two
# columns are already gone, which made the previous in-place drop raise a
# KeyError.
transcript = (
    transcript
    .dropna()
    .drop(columns=['Unnamed: 0', 'end_time'], errors='ignore')
    .reset_index(drop=True)
)
transcript.head()

In [None]:
# Combine consecutive rows spoken by the same speaker into a single row.
#
# The previous row-by-row loop appended row i+1 to row i and then deleted
# row i+1. For three or more consecutive rows by one speaker this lost text:
# the third row was appended to the second, which was itself deleted. It also
# relied on chained assignment (transcript['text'][i] = ...) and hid the
# resulting pandas warning by silencing *all* warnings for the whole notebook.
import warnings  # import kept; the blanket warnings.filterwarnings("ignore") is no longer needed

# A new run starts wherever the speaker differs from the previous row; the
# cumulative sum of those change points gives each run of consecutive rows by
# one speaker its own id.
speaker_changed = transcript['speaker_label'] != transcript['speaker_label'].shift()
run_id = speaker_changed.cumsum().rename('run_id')

# Within each run keep the first row's value for every column (so the merged
# row keeps the values of the first utterance) and join all of the run's
# text pieces with a single space, in their original order.
aggregations = {column: 'first' for column in transcript.columns}
aggregations['text'] = ' '.join

transcript = (
    transcript
    .groupby(run_id, sort=False)
    .agg(aggregations)
    .reset_index(drop=True)
)

In [None]:
# Preview the first five rows after combining the speakers' rows.
transcript.head(n=5)

Creating Embeddings

In [None]:
import os
import openai
from langchain.embeddings.openai import OpenAIEmbeddings

# Read the OpenAI key from the environment; when the variable is unset or
# empty, fall back to the literal placeholder string 'OPENAI_API_KEY'
# (a placeholder, not a usable key).
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or 'OPENAI_API_KEY'

In [None]:
# Name of the OpenAI embedding model handed to the embedding client.
model_name = 'text-embedding-ada-002'

# Embedding client used by every cell below that turns text into vectors.
embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

In [None]:
# Two throw-away sentences used to smoke-test the embedding client.
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here',
]

# embed_documents is given the list of texts; `res` is reused further down
# (len(res[0])) to set the dimension of the Pinecone index.
res = embed.embed_documents(texts)

# (number of vectors, length of the first vector) - expected (x, 1536),
# 1536 being the embedding size.
(len(res), len(res[0]))

In [None]:
def create_embedding(text):
    """Return the embedding vector for a single piece of text.

    The text is wrapped in a one-element list, passed to
    ``embed.embed_documents`` and the first element of the result is returned.
    """
    vectors = embed.embed_documents([text])
    return vectors[0]

# Smoke test: embed the text of the first transcript row.
test_embedding_function = create_embedding(transcript['text'].iloc[0])

# Length of the returned vector - expected to be 1536.
len(test_embedding_function)

Pinecone

In [None]:
from pinecone import Pinecone, ServerlessSpec
import time

# Read the Pinecone key from the environment; when the variable is unset or
# empty, fall back to the literal placeholder string (not a usable key).
PINECONE_API_KEY = os.environ.get('PINECONE_SERVERLESS_API_KEY') or 'PINECONE_SERVERLESS_API_KEY'

# Names shared by the cells below.
index_name = 'langchain-retrieval-transcript'  # index that holds the transcript vectors
namespace = 'new_namespace_2'                  # namespace passed to upsert / query / fetch

# Client object used for the index management calls (list / delete / create / describe).
pinecone = Pinecone(api_key=PINECONE_API_KEY)

In [None]:
# Start from a clean slate by removing this notebook's index if it exists.
#
# Iterating over list_indexes() yields index descriptions rather than plain
# names, so .names() is used to get the names that delete_index() expects.
#
# NOTE: the previous version looped over *every* index visible to the API key
# and deleted it. Deletion is now restricted to `index_name`, so running this
# notebook cannot wipe unrelated indexes in the same project.
existing_index_names = pinecone.list_indexes().names()
print(existing_index_names)
if index_name in existing_index_names:
    pinecone.delete_index(index_name)

In [None]:
# Create the serverless index only when it does not exist yet.
#
# list_indexes() returns index descriptions, so the membership test has to be
# made against .names(). Testing the bare string against the list itself never
# matches, which made create_index() run (and fail) when the index was already
# there.
if index_name not in pinecone.list_indexes().names():
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        # Vector length taken from the sample embeddings computed earlier
        # (1536 for text-embedding-ada-002).
        dimension=len(res[0]),
        spec=ServerlessSpec(
            cloud='aws',
            region='us-west-2',
        ),
    )

# Poll once per second until the index reports that it is ready.
while not pinecone.describe_index(index_name).status['ready']:
    time.sleep(1)


In [None]:
# Handle to the index named `index_name`; used below for upsert, query and fetch.
index = pinecone.Index(index_name)
# Show the index statistics at this point.
index.describe_index_stats()

In [None]:
# index.delete(delete_all=True, namespace=namespace)

Inserting data into Pinecone

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 90   # number of records embedded and upserted per request
texts = []         # texts collected for the current batch
metadatas = []     # metadata dicts, aligned one-to-one with `texts`
meeting_id = 1     # stored in each record's metadata; incremented after every full batch
start_id = 0       # number of records already uploaded; record ids are 1-based strings


def _upsert_batch(batch_texts, batch_metadatas, first_id):
    """Embed one batch of texts and upsert it into `namespace` of `index`.

    Args:
        batch_texts: list of strings to embed.
        batch_metadatas: list of metadata dicts, one per text, in the same order.
        first_id: number of records uploaded before this batch; the batch gets
            the ids str(first_id + 1) ... str(first_id + len(batch_texts)).
    """
    ids = [str(first_id + offset + 1) for offset in range(len(batch_texts))]
    embeds = embed.embed_documents(batch_texts)
    # Materialise the (id, vector, metadata) tuples as a list rather than
    # handing the client a one-shot zip iterator.
    index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)), namespace=namespace)


for _row_label, record in tqdm(transcript.iterrows(), total=len(transcript)):
    # Metadata stored next to the vector for this record.
    metadatas.append({
        'speaker': record['speaker_label'],
        'start_time': round(record['start_time'], 4),  # limit to 4 decimal places
        'meeting_id': meeting_id,
        # The text itself is kept in the metadata so that query / fetch results
        # can show it; it cannot be recovered from the vector.
        'text': record['text'],
    })
    texts.append(record['text'])

    # Once the batch limit is reached, upload the batch and start a new one.
    if len(texts) >= batch_limit:
        _upsert_batch(texts, metadatas, start_id)
        start_id += len(texts)
        texts = []
        metadatas = []
        meeting_id += 1

# Upload whatever is left over after the last full batch. This goes into the
# same namespace as the full batches; previously the namespace argument was
# missing here, so the leftover records landed in the default namespace and
# could not be found by the namespaced query / fetch calls below.
if texts:
    _upsert_batch(texts, metadatas, start_id)

# Short pause before the index statistics are inspected in the next cell.
time.sleep(5)

In [None]:
# Show the index statistics after the upload.
index.describe_index_stats()   

Querying Pinecone DB

In [None]:
# TODO: look into using LangChain for retrieval

In [None]:
# Natural-language question to search the transcript for.
query  = "What was talked regarding United States Congress?"

# Embed the question and request the 10 best matches from `namespace`,
# including the metadata stored with each match.
downstr_response = index.query(
    namespace=namespace,
    vector=embed.embed_documents([query])[0],
    top_k=10,
    include_metadata=True,
    # Optional metadata filter (disabled):
    # filter={
    #     "meeting_id": {"$in":[1, 2]}
    # },
)
downstr_response

In [None]:
# Fetch the records whose ids surround a chosen record id.
delta = 5   # number of neighbouring ids taken on each side
id = 60     # centre id of the window (note: this name shadows the builtin `id`)

# String ids for every number from id - delta up to and including id + delta.
window = list(map(str, range(id - delta, id + delta + 1)))

fetch_response = index.fetch(namespace=namespace, ids=window)
fetch_response