In [5]:
!pip install -qU \
    openai==0.27.7 \
    "pinecone-client[grpc]"==2.2.1 \
    pinecone-datasets==0.5.1 \
    langchain==0.0.162 \
    tiktoken==0.4.0

In [6]:
from datasets import load_dataset
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import time
from tqdm.auto import tqdm
from uuid import uuid4
from sentence_transformers.util import cos_sim
import pinecone
import os
from langchain.vectorstores import Pinecone



In [14]:
#Accessing the data

dataset = pd.read_csv('transcript.csv')
dataset.head()

Unnamed: 0.1,Unnamed: 0,start_time,end_time,speaker_label,text
0,0,0.00015,0.0835,Speaker 1,How do I defend myself and my Children against...
1,1,0.08365,0.128983,Speaker 1,And you said to yourself that the police may n...
2,2,0.129167,0.135,Speaker 1,"Tell me,"
3,3,0.13515,0.189983,Speaker 2,you know what the best chance you got in a sit...
4,4,0.19015,0.203983,Speaker 2,She just stated that


In [15]:
#Vector Embedding using SentenceTransformer, either we can use from hugging face or below one
mpnet = SentenceTransformer('all-mpnet-base-v2')

mpnet

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [17]:
#dropping unimportant columns and merging adjacent speaker together
dataset.drop(['Unnamed: 0', 'end_time' ], axis=1, inplace=True)
import warnings
warnings.filterwarnings("ignore")
to_remove=[]
for i, record in dataset.iterrows():
    if i < len(dataset)-1:
        if dataset.speaker_label[i]== dataset.speaker_label[i+1]:
            dataset['text'][i]=dataset['text'][i]+" "+dataset['text'][i+1]
            to_remove.append(i+1)

dataset = dataset.drop(to_remove).reset_index(drop=True)
            
             
dataset.head()


Unnamed: 0,start_time,speaker_label,text
0,0.00015,Speaker 1,How do I defend myself and my Children against...
1,0.13515,Speaker 2,you know what the best chance you got in a sit...
2,0.48465,Narrator,Thank you all for coming today. My name is Kee...
3,1.232333,Speaker 1,"Fun is,"
4,1.241667,Narrator,is pretty much what I'm thinking about and wha...


In [19]:
#Convert text into embedding
def create_embedding(text):
    return mpnet.encode([text])[0]

# get first row of transcript
test_embedding_function = create_embedding(dataset.iloc[0]['text'])
len(test_embedding_function)  #maximum dimension size is 768

768

PINECONE

In [20]:

# Load Pinecone API key
api_key = os.environ.get('PINECONE_API_KEY') or 'API_key'
# Set Pinecone environment. Find next to API key in console
env = os.environ.get('PINECONE_ENVIRONMENT') or "gcp-starter"

pinecone.init(api_key=api_key, environment=env)

index_name = "hello-pinecone"
# Delete the index, if an index of the same name already exists
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

In [21]:

if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# we create a new index
pinecone.create_index(
    name=index_name,
     metric='cosine',# this vector embedding best work with cosine
    dimension=768  #max dimension is 768 
)

# wait for index to be initialized
while not pinecone.describe_index(index_name).status['ready']:
    time.sleep(1)

#checking once
index = pinecone.GRPCIndex(index_name)
index.describe_index_stats()

#loading data into pinecone 
batch_limit = 90
texts = []
metadatas = []
meeting_id = 1
namespace = 'meeting_gun_law'
row_number=1 #to extract the delta
for i, record in tqdm(dataset.iterrows()):
    # First get metadata fields for this record
    metadata = {
        'speaker': record['speaker_label'],
        'start_time': round(record['start_time'], 4),
        'meeting_id': meeting_id,
        'text': record['text'],
        'row_number':row_number
    }

    record_texts = record['text']
    texts.append(record_texts)
    metadatas.append(metadata)

    if len(texts) >= batch_limit:
        # Update Pinecone vector database with the batch
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = mpnet.encode(texts)
        
        index.upsert(vectors=zip(ids, embeds, metadatas), namespace=namespace)
        #print(metadatas)
        texts = []
        metadatas = []

    meeting_id += 1
    row_number +=1

# Update Pinecone vector database with the remaining records (if any)
if texts:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = mpnet.encode(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas), namespace=namespace)
    #print(metadatas)

time.sleep(5)

0it [00:00, ?it/s]

In [22]:
index.describe_index_stats() 

{'dimension': 768,
 'index_fullness': 0.00142,
 'namespaces': {'': {'vector_count': 0},
                'meeting_gun_law': {'vector_count': 142}},
 'total_vector_count': 142}

In [116]:

embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
text_field = "text"

# switch back to normal index for langchain
index = pinecone.Index(index_name)

vectorstore = Pinecone(
    index, embeddings.embed_query, text_field
)
#if using the same sentence-transformer embedding model, the input pass by similarity_search to mpnet.embeds causes problem 
#as they are not the same format.
#model name (model_name="all-mpnet-base-v2") is same as sentence-transformer(used above) but this is through hugging face

In [117]:
query = "What action United States congress took for gun industry?"
query1="What happened in Connecticut?"
vectorstore.similarity_search(
    query,
    namespace=namespace,# our search query
    k=50  # return 3 most relevant docs
)
#with k less than 10, no agent was not able to generate output

[Document(page_content="The United States Congress needs to remove all legislation that's introduced to absolve the gun industry of any accountability.", metadata={'meeting_id': 129.0, 'row_number': 129.0, 'speaker': 'Speaker 7', 'start_time': 40.985}),
 Document(page_content='view gun ownership as a right in this country?', metadata={'meeting_id': 19.0, 'row_number': 19.0, 'speaker': 'Narrator', 'start_time': 4.0588}),
 Document(page_content='was pouring billions of dollars into the communities that are suffering from gun violence,', metadata={'meeting_id': 122.0, 'row_number': 122.0, 'speaker': 'Speaker 2', 'start_time': 39.1023}),
 Document(page_content='What do you guys think are actual like good ideas in terms of gun regulation. If any,', metadata={'meeting_id': 29.0, 'row_number': 29.0, 'speaker': 'Narrator', 'start_time': 6.5899}),
 Document(page_content="There's no changes to gun policy in this country that are not incredibly controversial and divisive.", metadata={'meeting_id'

CONVERSATIONAL AGENT

In [123]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

# chat completion llm
llm = ChatOpenAI(
    openai_api_key="API_key",
    model_name='gpt-3.5-turbo',
    temperature=0.0
)
# conversational memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)
# retrieval qa chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
    
)


In [119]:
qa.run(query)

'The United States Congress has taken various actions related to the gun industry. Some notable actions include passing the Gun Control Act of 1968, which established licensing requirements for gun dealers and prohibited certain individuals from owning firearms. Additionally, Congress has periodically debated and considered legislation on issues such as background checks for gun purchases, assault weapons bans, and funding for gun violence research. It is important to note that specific actions and legislation can vary over time, so it is recommended to refer to the most recent developments for the most accurate information.'

In [120]:
from langchain.agents import Tool

tools = [
    Tool(
        name='Knowledge Base1',
        func=qa.run,
        description=(
            'This tool will be used to answer the query'
        )
    )
]

In [121]:
from langchain.agents import initialize_agent

agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=conversational_memory
)

In [122]:
agent("What action United States congress took for gun industry?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Knowledge Base1",
    "action_input": "United States congress action on gun industry"
}[0m
Observation: [36;1m[1;3mAs an AI language model, I don't have real-time information. However, I can provide you with some general information about United States Congress actions on the gun industry. 

The United States Congress has taken various actions related to the gun industry over the years. These actions have included passing legislation on gun control, firearms regulations, and background checks. For example, the Gun Control Act of 1968 established licensing requirements for gun dealers and prohibited certain individuals from owning firearms. 

Additionally, the Brady Handgun Violence Prevention Act of 1993 mandated background checks for firearm purchases from licensed dealers. More recently, there have been discussions and debates in Congress about implementing stricter gun control measures, such as expandin

{'input': 'What action United States congress took for gun industry?',
 'chat_history': [],
 'output': 'The United States Congress has taken various actions related to the gun industry over the years. These actions have included passing legislation on gun control, firearms regulations, and background checks. For example, the Gun Control Act of 1968 established licensing requirements for gun dealers and prohibited certain individuals from owning firearms. Additionally, the Brady Handgun Violence Prevention Act of 1993 mandated background checks for firearm purchases from licensed dealers. More recently, there have been discussions and debates in Congress about implementing stricter gun control measures, such as expanding background checks and banning certain types of firearms. However, the specific actions taken by Congress can vary depending on the political climate and the priorities of lawmakers at any given time. For the most up-to-date information on recent actions taken by the Uni