Pinecone database storing sentence-transformer embedding. It extract +10 and -10 number of conversations and return only sorted unique conversation (with no repeation). For future, sorting should be done with meeting id and timestamp.
Author: Prachitee


In [None]:
!pip install sentence-transformers

In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

from sentence_transformers.util import cos_sim


In [3]:
mpnet = SentenceTransformer('all-mpnet-base-v2')

mpnet

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [4]:

#res = embed.embed_documents(texts)
#len(res), len(res[0]) # (x,1536) 1536 is the embedding size
sentences = [
    "the fifty mannequin heads floating in the pool kind of freaked them out",
    "she swore she just saw her sushi move",
    "he embraced his new life as an eggplant",
    "my dentist tells me that chewing bricks is very bad for your teeth",
    "the dental specialist recommended an immediate stop to flossing with construction materials"
]
embeddings = mpnet.encode(sentences)

sim = np.zeros((len(sentences), len(sentences)))

for i in range(len(sentences)):
    sim[i:,i] = cos_sim(embeddings[i], embeddings[i:])

len(sim[0])

5

In [5]:
transcript = pd.read_csv('transcript.csv')
transcript.head()

Unnamed: 0.1,Unnamed: 0,start_time,end_time,speaker_label,text
0,0,0.00015,0.0835,Speaker 1,How do I defend myself and my Children against...
1,1,0.08365,0.128983,Speaker 1,And you said to yourself that the police may n...
2,2,0.129167,0.135,Speaker 1,"Tell me,"
3,3,0.13515,0.189983,Speaker 2,you know what the best chance you got in a sit...
4,4,0.19015,0.203983,Speaker 2,She just stated that


In [6]:
transcript.isna().sum()

Unnamed: 0       0
start_time       0
end_time         0
speaker_label    0
text             0
dtype: int64

In [7]:
transcript.dtypes

Unnamed: 0         int64
start_time       float64
end_time         float64
speaker_label     object
text              object
dtype: object

In [8]:

transcript.drop(['Unnamed: 0', 'end_time' ], axis=1, inplace=True)
transcript.head()

Unnamed: 0,start_time,speaker_label,text
0,0.00015,Speaker 1,How do I defend myself and my Children against...
1,0.08365,Speaker 1,And you said to yourself that the police may n...
2,0.129167,Speaker 1,"Tell me,"
3,0.13515,Speaker 2,you know what the best chance you got in a sit...
4,0.19015,Speaker 2,She just stated that


In [9]:
len(transcript)

954

In [10]:
import warnings
warnings.filterwarnings("ignore")
to_remove=[]
for i, record in transcript.iterrows():
    if i < len(transcript)-1:
        if transcript.speaker_label[i]== transcript.speaker_label[i+1]:
            transcript['text'][i]=transcript['text'][i]+" "+transcript['text'][i+1]
            to_remove.append(i+1)

transcript = transcript.drop(to_remove).reset_index(drop=True)
            
             

In [11]:
transcript.head()

Unnamed: 0,start_time,speaker_label,text
0,0.00015,Speaker 1,How do I defend myself and my Children against...
1,0.13515,Speaker 2,you know what the best chance you got in a sit...
2,0.48465,Narrator,Thank you all for coming today. My name is Kee...
3,1.232333,Speaker 1,"Fun is,"
4,1.241667,Narrator,is pretty much what I'm thinking about and wha...


In [12]:
transcript.dtypes

start_time       float64
speaker_label     object
text              object
dtype: object

In [13]:
transcript.speaker_label.astype("string")
transcript.text.astype("string")

0      How do I defend myself and my Children against...
1      you know what the best chance you got in a sit...
2      Thank you all for coming today. My name is Kee...
3                                                Fun is,
4      is pretty much what I'm thinking about and wha...
                             ...                        
137    that those of us who are racially oppressed, w...
138    I wasn't surprised by the answers I heard, but...
139    I think at the end of the day, we all just wan...
140    Some of the stuff I heard in there is like fai...
141    I would like to do some training with some of ...
Name: text, Length: 142, dtype: string

In [14]:
def create_embedding(text):
    return mpnet.encode([text])[0]

# get first row of transcript
test_embedding_function = create_embedding(transcript.iloc[0]['text'])
len(test_embedding_function)  #maximum dimension size is 768

768

In [15]:
for i,record in transcript.iterrows():
    l=0
    
    if len(transcript.iloc[i]['text']) > l:
        l=len(transcript.iloc[i]['text'])

In [16]:
l

80

In [25]:
import pinecone
import os

# Load Pinecone API key
api_key = os.environ.get('PINECONE_API_KEY') or 'your_api_key'
# Set Pinecone environment. Find next to API key in console
env = os.environ.get('PINECONE_ENVIRONMENT') or "gcp-starter"

pinecone.init(api_key=api_key, environment=env)

index_name = "hello-pinecone"
# Delete the index, if an index of the same name already exists
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

In [26]:
import time

if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# we create a new index
pinecone.create_index(
    name=index_name,
     metric='cosine',# this vector embedding best work with cosine
    dimension=768  #max dimension is 768 
)

# wait for index to be initialized
while not pinecone.describe_index(index_name).status['ready']:
    time.sleep(1)

In [27]:
index = pinecone.GRPCIndex(index_name)
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

In [28]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 90
texts = []
metadatas = []
meeting_id = 1
namespace = 'meeting_gun_law'
row_number=1 #to extract the delta
for i, record in tqdm(transcript.iterrows()):
    # First get metadata fields for this record
    metadata = {
        'speaker': record['speaker_label'],
        'start_time': round(record['start_time'], 4),
        'meeting_id': meeting_id,
        'text': record['text'],
        'row_number':row_number
    }

    record_texts = record['text']
    texts.append(record_texts)
    metadatas.append(metadata)

    if len(texts) >= batch_limit:
        # Update Pinecone vector database with the batch
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = mpnet.encode(texts)
        
        index.upsert(vectors=zip(ids, embeds, metadatas), namespace=namespace)
        #print(metadatas)
        texts = []
        metadatas = []

    meeting_id += 1
    row_number +=1

# Update Pinecone vector database with the remaining records (if any)
if texts:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = mpnet.encode(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas), namespace=namespace)
    #print(metadatas)

time.sleep(5)


0it [00:00, ?it/s]

In [33]:
index.describe_index_stats() 

{'dimension': 768,
 'index_fullness': 0.00142,
 'namespaces': {'': {'vector_count': 0},
                'meeting_gun_law': {'vector_count': 142}},
 'total_vector_count': 142}

In [63]:
query  = "What was talked regarding United States Congress?"
response = index.query(
    vector= mpnet.encode([query])[0],
    # filter={
    #     "meeting_id": {"$in":[1, 2]}
    # },
    namespace=namespace, 
    top_k=10,
    include_metadata=True,
)
response
for match in response['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']} :{match['metadata']['start_time']} ")

0.31: The United States Congress needs to remove all legislation that's introduced to absolve the gun industry of any accountability. :40.985 
0.31: day when we talk about the constitution as like this sacred document. I mean, it was written by a bunch of rich white guys who own :4.5253 
0.30: There's no changes to gun policy in this country that are not incredibly controversial and divisive. :40.7703 
0.28: was pouring billions of dollars into the communities that are suffering from gun violence, :39.1023 
0.27: view gun ownership as a right in this country? :4.0588 
0.26: We forget to talk about white nationalism and the fact that this country was a settler colonial country based on :9.5133 
0.25: Us :42.6313 
0.25: This is an area that resists reform, resists regulation. We don't get to see the inner workings, :12.5158 
0.23: From what I've heard so far, the two main sort of points I'm hearing is that :36.9723 
0.22: Thank you all for coming today. My name is Keegan Hamilton. I'm a 

In [64]:
# Extract the start_time information from the metadata and store +-10 conversation's start time.


existing_timestamp=[]

for hit in response["matches"]:
    metadata = hit["metadata"]
    start_time = metadata["start_time"]
    row_number = metadata["row_number"]

    # Get the 10 text spoken before and after the identified timestamp
    start_row_number=  max(0, row_number-10)
    end_row_number= row_number+10
    
    # Query the Pinecone database to get the relevant text within the specified time window
    relevant_texts_response = index.query(
        vector = mpnet.encode([metadata["text"]])[0],
        filter={"row_number": {"$gte": start_row_number, "$lte": end_row_number}},
        top_k=10,  
        namespace=namespace,
        include_metadata=True
    )
    for match in relevant_texts_response["matches"]:
        existing_timestamp.append(match['metadata']['start_time'])


In [65]:
existing_timestamp.sort()
unique_timestamp= set(existing_timestamp)


In [72]:
relevant_match= []
relevant_text=[]
for single_timestamp in unique_timestamp:
    response = index.query(
        vector = mpnet.encode([metadata["text"]])[0],
        filter={"start_time": {"$eq": single_timestamp}},
        top_k=10,  
        namespace=namespace,
        include_metadata=True
    )
    relevant_match.append(response)
    for match in response['matches']:
        relevant_text.append(match['metadata']['text'])
   
   

In [73]:
 
relevant_text

["Thank you all for coming today. My name is Keegan Hamilton. I'm a correspondent with Vice News.",
 'Facts determined. I guess when I, when I pick up a gun loaded, not loaded, am I shooting,',
 "is pretty much what I'm thinking about and what I'm feeling when I'm shooting a gun, it's a good time.",
 ' Fun is,',
 "things like that where it's just about doing everything correctly.",
 'I usually feel tired after shooting a lot, a lot of the training is extensive. And as an instructor,',
 'your hand? And how many are new gun owners who just got into to gun ownership and shooting?',
 'I started, you know, closer to 2018 2019 when I just turned 18',
 'I grew up in San Antonio and as a first generation American growing up, I wanted to be a good American.',
 "When I was growing up. It was always the bad guys that had guns. And I felt like I didn't have the emotional maturity at that time to own guns.",
 'actually used to be a gun control activist. And there were certain events that were',
 'v