In this notebook we use a demo transcript and store it into Pinecone and retrieve data according to a query.
- Combined Speaker in Transcript

Author: Sartaj

In [109]:
# # Have the `transcript.csv` file in the same directory as this notebook
# # Create a .env file with the following variables:
# OPENAI_API_KEY
# PINECONE_API_KEY
# PINECONE_ENVIRONMENT

In [1]:
import pandas as pd
import dotenv
dotenv.load_dotenv()

True

Read Data and Pre-process

In [2]:
transcript = pd.read_csv('transcript.csv')
transcript.head()

Unnamed: 0.1,Unnamed: 0,start_time,end_time,speaker_label,text
0,0,0.00015,0.0835,Speaker 1,How do I defend myself and my Children against...
1,1,0.08365,0.128983,Speaker 1,And you said to yourself that the police may n...
2,2,0.129167,0.135,Speaker 1,"Tell me,"
3,3,0.13515,0.189983,Speaker 2,you know what the best chance you got in a sit...
4,4,0.19015,0.203983,Speaker 2,She just stated that


In [3]:
transcript.dropna(inplace=True)
transcript.drop(['Unnamed: 0', 'end_time' ], axis=1, inplace=True)
transcript.head()

Unnamed: 0,start_time,speaker_label,text
0,0.00015,Speaker 1,How do I defend myself and my Children against...
1,0.08365,Speaker 1,And you said to yourself that the police may n...
2,0.129167,Speaker 1,"Tell me,"
3,0.13515,Speaker 2,you know what the best chance you got in a sit...
4,0.19015,Speaker 2,She just stated that


In [6]:
# Combining speakers's rows
import warnings
warnings.filterwarnings("ignore")
to_remove=[]
for i, record in transcript.iterrows():
    if i < len(transcript)-1:
        if transcript.speaker_label[i]== transcript.speaker_label[i+1]:
            transcript['text'][i]=transcript['text'][i]+" "+transcript['text'][i+1]
            to_remove.append(i+1)

transcript = transcript.drop(to_remove).reset_index(drop=True)

In [9]:
transcript.head()

Unnamed: 0,start_time,speaker_label,text
0,0.00015,Speaker 1,How do I defend myself and my Children against...
1,0.13515,Speaker 2,you know what the best chance you got in a sit...
2,0.48465,Narrator,Thank you all for coming today. My name is Kee...
3,1.232333,Speaker 1,"Fun is,"
4,1.241667,Narrator,is pretty much what I'm thinking about and wha...


Creating Embeddings

In [10]:
import os
import openai
from langchain.embeddings.openai import OpenAIEmbeddings

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or 'OPENAI_API_KEY'

In [11]:
model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)

In [12]:
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

res = embed.embed_documents(texts)
len(res), len(res[0]) # (x,1536) 1536 is the embedding size

(2, 1536)

In [13]:
def create_embedding(text):
    return embed.embed_documents([text])[0]

# get first row of transcript
test_embedding_function = create_embedding(transcript.iloc[0]['text'])
len(test_embedding_function) # 1 X 1536

1536

Pinecone

In [14]:
import pinecone
import time

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY') or 'PINECONE_API_KEY'
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT') or 'PINECONE_ENVIRONMENT'
index_name = 'langchain-retrieval-transcript'
namespace = 'meeting_topic'

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT)

In [22]:
print(pinecone.list_indexes())
for index in pinecone.list_indexes():
    print(pinecone.delete_index(index))

[]


In [23]:
# create index
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=len(res[0])  # model_name = 'text-embedding-ada-002'; 1536 dim of text-embedding-ada-002
    )   
    
while not pinecone.describe_index(index_name).status['ready']:
    time.sleep(1)


In [24]:
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
# index.delete(delete_all=True, namespace=namespace)

Inserting data into Pinecone

In [25]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 50
texts = []
metadatas = []
meeting_id = 1

for i, record in tqdm(transcript.iterrows()):
    # first get metadata fields for this record
    metadata = {
        'speaker': record['speaker_label'],
        'start_time': round(record['start_time'], 4), # limit to 4 decimal places 
        'meeting_id': meeting_id,
        'text': record['text'], # Storing the text in the metadata for now, later we'd need to decode it from vectors
    }

    record_texts = record['text']

    texts.append(record_texts)
    metadatas.append(metadata)

    # print(texts)
    # print(metadatas)
    
    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas), namespace=namespace)
        texts = []
        metadatas = []
        meeting_id += 1

# add any remaining texts
if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embed.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))
    
time.sleep(5)    

142it [00:01, 71.23it/s]


In [26]:
index.describe_index_stats()   

{'dimension': 1536,
 'index_fullness': 0.00142,
 'namespaces': {'': {'vector_count': 42},
                'meeting_topic': {'vector_count': 100}},
 'total_vector_count': 142}

Querying Pinecone DB

In [None]:
# Need to look up about using LangChain for retrieval

In [27]:
query  = "What was talked regarding United States Congress?"
downstr_response = index.query(
    vector= embed.embed_documents([query])[0],
    # filter={
    #     "meeting_id": {"$in":[1, 2]}
    # },
    namespace=namespace, 
    top_k=10,
    include_metadata=True,
)
downstr_response

{'matches': [{'id': '87d2d060-00b0-4efe-9639-91a584ac25d4',
              'metadata': {'meeting_id': 1.0,
                           'speaker': 'Narrator',
                           'start_time': 4.0588,
                           'text': 'view gun ownership as a right in this '
                                   'country?'},
              'score': 0.788970232,
              'values': []},
             {'id': 'c90e7f0e-d3fb-489d-a5d4-4823a576363e',
              'metadata': {'meeting_id': 1.0,
                           'speaker': 'Speaker 2',
                           'start_time': 4.9689,
                           'text': "I can't help but to think about how many "
                                   'people have been lost because of this '
                                   'perceived right or this outdated '
                                   'document.'},
              'score': 0.788026214,
              'values': []},
             {'id': '517181e5-3433-40a3-a3cf-3f5c54f8d056',
