In [2]:
import pandas as pd
import dotenv
dotenv.load_dotenv()
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
import time

  from tqdm.autonotebook import tqdm


In [3]:
# Connection settings. Keys are read from the environment; when a variable is
# unset or empty the placeholder string with the variable's own name is used,
# so a missing key only surfaces later as an authentication failure.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or 'OPENAI_API_KEY'
PINECONE_API_KEY = (
    os.environ.get('PINECONE_SERVERLESS_API_KEY')
    or 'PINECONE_SERVERLESS_API_KEY'
)

# Index and namespace shared by every read and write in this notebook.
index_name = 'langchain-retrieval-transcript'
namespace = 'langchain'

In [4]:
# Pinecone client for this notebook; print the indexes visible to the API key.
pinecone = Pinecone(api_key=PINECONE_API_KEY)
print(pinecone.list_indexes())

# Embedding model name; the index-creation cell uses dimension 1536 for it.
model_name = 'text-embedding-ada-002'

# `embed` is used to embed transcript rows (embed_documents) and queries (embed_query).
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)


{'indexes': [{'dimension': 1536,
              'host': 'langchain-retrieval-transcript-kp69ciw.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'cosine',
              'name': 'langchain-retrieval-transcript',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
              'status': {'ready': True, 'state': 'Ready'}}]}


  warn_deprecated(


In [None]:
# Create the serverless index on first run, then wait until it is ready.
#
# `list_indexes()` returns a list of index *descriptions* (see the printed
# output of the client cell), not a list of names. Testing the bare name
# against it never matches, so the cell would try to create an index that
# already exists. Membership is therefore checked against `.names()`.
if index_name not in pinecone.list_indexes().names():
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=1536,  # output dimension of text-embedding-ada-002
        spec=ServerlessSpec(
            cloud='aws',
            region='us-west-2',
        ),
    )

# Poll once a second until the index reports itself ready.
while not pinecone.describe_index(index_name).status['ready']:
    time.sleep(1)


In [5]:
# Handle to the index named by `index_name`; the stats call reports the
# dimension and the vector count per namespace.
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'langchain': {'vector_count': 144}},
 'total_vector_count': 144}

In [None]:
# Load the parsed transcript. The upsert loop reads its `speaker_label`,
# `start_time` and `text` columns.
transcript = pd.read_csv('aws_parsed_transcript.csv')
print(transcript.shape)
transcript.head()

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 90  # number of rows embedded and upserted per request
texts = []
metadatas = []
meeting_id = 1
start_id = 0  # number of vectors written so far; ids continue from here


def _upsert_batch(batch_texts, batch_metadatas, first_id):
    """Embed one batch of texts and upsert it into `index` under `namespace`.

    Ids are the decimal strings ``first_id + 1`` .. ``first_id + len(batch_texts)``,
    so re-running the cell overwrites the same vectors instead of adding new ones.

    Args:
        batch_texts: Texts to embed, in row order.
        batch_metadatas: One metadata dict per text, in the same order.
        first_id: Number of vectors written before this batch.

    Returns:
        The number of vectors written; 0 for an empty batch, in which case no
        embedding or upsert request is made.
    """
    if not batch_texts:
        return 0
    ids = [str(first_id + offset) for offset in range(1, len(batch_texts) + 1)]
    embeds = embed.embed_documents(batch_texts)
    # Materialise the zip so the client receives a concrete list of
    # (id, values, metadata) tuples rather than a one-shot iterator.
    vectors = list(zip(ids, embeds, batch_metadatas))
    index.upsert(vectors=vectors, namespace=namespace)
    return len(batch_texts)


# `iterrows()` has no length, so pass the row count for a real progress bar.
for row_label, record in tqdm(transcript.iterrows(), total=len(transcript)):
    # metadata stored alongside the vector for this row
    metadata = {
        'speaker': record['speaker_label'],
        'start_time': round(record['start_time'], 4),  # limit to 4 decimal places
        'meeting_id': meeting_id,
        # The raw text is kept in the metadata; the vector store cell reads it
        # back through its text field.
        'text': record['text'],
    }

    texts.append(record['text'])
    metadatas.append(metadata)

    # flush once the batch is full
    if len(texts) >= batch_limit:
        start_id += _upsert_batch(texts, metadatas, start_id)
        texts = []
        metadatas = []
        # NOTE(review): meeting_id advances once per flushed batch, so it labels
        # each run of `batch_limit` rows rather than a real meeting boundary.
        # Confirm this is intended.
        meeting_id += 1

# flush whatever is left over after the last full batch
start_id += _upsert_batch(texts, metadatas, start_id)

# Short pause after the last write; presumably gives the index time to reflect
# the upserts before stats are read (TODO confirm).
time.sleep(5)

In [46]:
# Pause, then re-read the index stats. The wait is presumably there so recent
# upserts show up in the counts (TODO confirm it is needed).
time.sleep(10)
index.describe_index_stats()   

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'langchain': {'vector_count': 144}},
 'total_vector_count': 144}

In [6]:
# Import the LangChain vector store under an alias. A plain
# `from langchain.vectorstores import Pinecone` would rebind the `Pinecone`
# client class imported from the `pinecone` package in the first cell, and
# re-running the client cell (`Pinecone(api_key=...)`) would then fail.
from langchain.vectorstores import Pinecone as PineconeVectorStore
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain.chat_models import ChatOpenAI

# Metadata field that holds the chunk text; it becomes each Document's page_content.
text_field = 'text'

# Retrieval wrapper over the existing index: queries are embedded with
# `embed.embed_query` and searched within `namespace`.
vector_store = PineconeVectorStore(
    index, embed.embed_query, text_field, namespace=namespace)



In [9]:
# Sanity-check retrieval: fetch the five stored chunks most similar to a sample question.
query = 'I want to talk about the future of the company'
vector_store.similarity_search(query, k=5, namespace=namespace)


[Document(page_content='Aside, aside from the goofy name, I thought that, um, and of course we cant talk about any of it on youtube. But man, that was like a really awesome look back at like heres some wins and heres some exciting things that weve done. Um And so I think we just need to get better at this as a company. I think this exercise is an opportunity that um I think we do a little bit of that, but this is doing it more. So, uh Sonia, you have some questions here for elitists.', metadata={'meeting_id': 1.0, 'speaker': 'spk_0', 'start_time': 22.5338}),
 Document(page_content='to come back and we talk about it on Monday in our little social.', metadata={'meeting_id': 2.0, 'speaker': 'spk_3', 'start_time': 41.7078}),
 Document(page_content='uh cool. Well, were almost up on time but I did wanna share one thing that kind of came in like basically Friday, it was like late Thursday for me. So this uh this is another thing, a lot, a lot of these things, I dont have like a ton of context

In [10]:
def augment_prompt(query: str, k: int = 3) -> str:
    """Build a retrieval-augmented prompt for ``query``.

    Runs a similarity search on the notebook-level ``vector_store`` within
    ``namespace`` and places the text of the hits, one per line, ahead of the
    query.

    Args:
        query: The user's question; used for retrieval and echoed in the prompt.
        k: Number of chunks to retrieve. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        The prompt string containing the instruction, the retrieved contexts
        and the query.
    """
    results = vector_store.similarity_search(query, k=k, namespace=namespace)
    source_knowledge = "\n".join(doc.page_content for doc in results)

    # The leading spaces inside the template are part of the string the model
    # receives; they are kept as-is so prompts match earlier runs.
    augmented_prompt = f"""Using the contexts below, answer the query in as much detail as possible.

    Contexts:
    {source_knowledge}

    Query: {query}"""
    return augmented_prompt

In [11]:
# Preview the augmented prompt built for the current `query`.
print(augment_prompt(query))

Using the contexts below, answer the query in as much detail as possible.

    Contexts:
    the door for NASCAR Partnership. Hey NASCAR goes fast and turns left because it get loud.
Its something Ricky Bobby would have said if anyone hasnt seen Talladega Nights,  they should, thats
 the homework for this weekend
more speed. The Ricky Bobby and me really did like the no trade offs there. Parker.

    Query: What was discussed about NASCAR?


In [12]:
query = 'What was discussed about NASCAR?'

# Chat model that answers the question; the key is read straight from the environment.
chat = ChatOpenAI(
    model='gpt-3.5-turbo',
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

# Short seed conversation that precedes the real question.
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
]

# Wrap the retrieval-augmented prompt as the next user turn and queue it.
prompt = HumanMessage(content=augment_prompt(query))
messages.append(prompt)

# Send the whole conversation and show only the text of the reply.
res = chat(messages)
print(res.content)

From the given contexts, it appears that there was a discussion about NASCAR and its partnership. The mention of NASCAR going fast and turning left suggests the nature of NASCAR racing. It is also mentioned that NASCAR is loud. There is a reference to the movie "Talladega Nights" and the character Ricky Bobby, who is known for his love of NASCAR. The mention of "no trade offs" implies that there is a positive opinion or sentiment towards NASCAR and its qualities. Overall, the discussion seems to revolve around the excitement, speed, and enjoyment associated with NASCAR racing.


In [13]:
# Inspect the top five matches for `query`; `augment_prompt` uses the top three by default.
vector_store.similarity_search(query, k=5, namespace=namespace)

[Document(page_content='the door for NASCAR Partnership. Hey NASCAR goes fast and turns left because it get loud.', metadata={'meeting_id': 2.0, 'speaker': 'spk_3', 'start_time': 39.9427}),
 Document(page_content='Its something Ricky Bobby would have said if anyone hasnt seen Talladega Nights,  they should, thats\r\n the homework for this weekend', metadata={'meeting_id': 2.0, 'speaker': 'spk_4', 'start_time': 41.6003}),
 Document(page_content='more speed. The Ricky Bobby and me really did like the no trade offs there. Parker.', metadata={'meeting_id': 2.0, 'speaker': 'spk_4', 'start_time': 41.4151}),
 Document(page_content='What about like a New and Christies uh product keynote at commit?  Are we that tailing this with\r\n that?', metadata={'meeting_id': 1.0, 'speaker': 'spk_5', 'start_time': 14.7357}),
 Document(page_content='Thats where we were last I heard.', metadata={'meeting_id': 1.0, 'speaker': 'spk_3', 'start_time': 3.0255})]