In [4]:
!pip install sentence-transformers





In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

from sentence_transformers.util import cos_sim


In [6]:
mpnet = SentenceTransformer('all-mpnet-base-v2')

mpnet

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [7]:

#res = embed.embed_documents(texts)
#len(res), len(res[0]) # (x,1536) 1536 is the embedding size
sentences = [
    "the fifty mannequin heads floating in the pool kind of freaked them out",
    "she swore she just saw her sushi move",
    "he embraced his new life as an eggplant",
    "my dentist tells me that chewing bricks is very bad for your teeth",
    "the dental specialist recommended an immediate stop to flossing with construction materials"
]
embeddings = mpnet.encode(sentences)

sim = np.zeros((len(sentences), len(sentences)))

for i in range(len(sentences)):
    sim[i:,i] = cos_sim(embeddings[i], embeddings[i:])

len(sim[0])

5

In [112]:
transcript = pd.read_csv('transcript.csv')
transcript.head()

Unnamed: 0.1,Unnamed: 0,start_time,end_time,speaker_label,text
0,0,0.00015,0.0835,Speaker 1,How do I defend myself and my Children against...
1,1,0.08365,0.128983,Speaker 1,And you said to yourself that the police may n...
2,2,0.129167,0.135,Speaker 1,"Tell me,"
3,3,0.13515,0.189983,Speaker 2,you know what the best chance you got in a sit...
4,4,0.19015,0.203983,Speaker 2,She just stated that


In [113]:
transcript.isna().sum()

Unnamed: 0       0
start_time       0
end_time         0
speaker_label    0
text             0
dtype: int64

In [114]:
transcript.dtypes

Unnamed: 0         int64
start_time       float64
end_time         float64
speaker_label     object
text              object
dtype: object

In [115]:

transcript.drop(['Unnamed: 0', 'end_time' ], axis=1, inplace=True)
transcript.head()

Unnamed: 0,start_time,speaker_label,text
0,0.00015,Speaker 1,How do I defend myself and my Children against...
1,0.08365,Speaker 1,And you said to yourself that the police may n...
2,0.129167,Speaker 1,"Tell me,"
3,0.13515,Speaker 2,you know what the best chance you got in a sit...
4,0.19015,Speaker 2,She just stated that


In [116]:
len(transcript)

954

In [117]:
import warnings
warnings.filterwarnings("ignore")
to_remove=[]
for i, record in transcript.iterrows():
    if i < len(transcript)-1:
        if transcript.speaker_label[i]== transcript.speaker_label[i+1]:
            transcript['text'][i]=transcript['text'][i]+" "+transcript['text'][i+1]
            to_remove.append(i+1)

transcript = transcript.drop(to_remove).reset_index(drop=True)
            
             

In [119]:
transcript.head()

Unnamed: 0,start_time,speaker_label,text
0,0.00015,Speaker 1,How do I defend myself and my Children against...
1,0.13515,Speaker 2,you know what the best chance you got in a sit...
2,0.48465,Narrator,Thank you all for coming today. My name is Kee...
3,1.232333,Speaker 1,"Fun is,"
4,1.241667,Narrator,is pretty much what I'm thinking about and wha...


In [120]:
transcript.dtypes

start_time       float64
speaker_label     object
text              object
dtype: object

In [121]:
transcript.speaker_label.astype("string")
transcript.text.astype("string")

0      How do I defend myself and my Children against...
1      you know what the best chance you got in a sit...
2      Thank you all for coming today. My name is Kee...
3                                                Fun is,
4      is pretty much what I'm thinking about and wha...
                             ...                        
137    that those of us who are racially oppressed, w...
138    I wasn't surprised by the answers I heard, but...
139    I think at the end of the day, we all just wan...
140    Some of the stuff I heard in there is like fai...
141    I would like to do some training with some of ...
Name: text, Length: 142, dtype: string

In [122]:
def create_embedding(text):
    return mpnet.encode([text])[0]

# get first row of transcript
test_embedding_function = create_embedding(transcript.iloc[0]['text'])
len(test_embedding_function)  #maximum dimension size is 768

768

In [125]:
for i,record in transcript.iterrows():
    l=0
    
    if len(transcript.iloc[i]['text']) > l:
        l=len(transcript.iloc[i]['text'])

In [126]:
l

80

In [127]:
import pinecone
import os

# Load Pinecone API key
api_key = os.environ.get('PINECONE_API_KEY') or 'API key'
# Set Pinecone environment. Find next to API key in console
env = os.environ.get('PINECONE_ENVIRONMENT') or "gcp-starter"

pinecone.init(api_key=api_key, environment=env)

index_name = "hello-pinecone"
# Delete the index, if an index of the same name already exists
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

In [128]:
import time

if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# we create a new index
pinecone.create_index(
    name=index_name,
     metric='cosine',# this vector embedding best work with cosine
    dimension=768  #max dimension is 768 
)

# wait for index to be initialized
while not pinecone.describe_index(index_name).status['ready']:
    time.sleep(1)

In [129]:
index = pinecone.GRPCIndex(index_name)
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

In [131]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 90
texts = []
metadatas = []
meeting_id = 1
namespace = 'meeting_gun_law'

for i, record in tqdm(transcript.iterrows()):
    # First get metadata fields for this record
    metadata = {
        'speaker': record['speaker_label'],
        'start_time': round(record['start_time'], 4),
        'meeting_id': meeting_id,
        'text': record['text'],
    }

    record_texts = record['text']
    texts.append(record_texts)
    metadatas.append(metadata)

    if len(texts) >= batch_limit:
        # Update Pinecone vector database with the batch
        embeds = mpnet.encode(texts)
        
        index.upsert(vectors=zip(ids, embeds, metadatas), namespace=namespace)
        #print(metadatas)
        texts = []
        metadatas = []

    meeting_id += 1

# Update Pinecone vector database with the remaining records (if any)
if texts:
    embeds = mpnet.encode(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas), namespace=namespace)
    #print(metadatas)

time.sleep(5)


0it [00:00, ?it/s]

In [133]:
index.describe_index_stats() 

{'dimension': 768,
 'index_fullness': 0.0005,
 'namespaces': {'': {'vector_count': 0},
                'meeting_gun_law': {'vector_count': 50}},
 'total_vector_count': 50}

In [134]:
query  = "What was talked regarding United States Congress?"
response = index.query(
    vector= mpnet.encode([query])[0],
    # filter={
    #     "meeting_id": {"$in":[1, 2]}
    # },
    namespace=namespace, 
    top_k=10,
    include_metadata=True,
)
response
for match in response['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

0.31: The United States Congress needs to remove all legislation that's introduced to absolve the gun industry of any accountability.
0.30: There's no changes to gun policy in this country that are not incredibly controversial and divisive.
0.28: was pouring billions of dollars into the communities that are suffering from gun violence,
0.25: Us
0.23: From what I've heard so far, the two main sort of points I'm hearing is that
0.22: security who had a handgun who took a shot at the chest and there are armor piercing rounds.
0.21: just kind of want to draw people's attention to the fact that there have been a lot of different views on this panel.
0.20: a handgun against the person who had an AR-15. He had a rifle.
0.18: Like, aren't we concerned about neighbors?
0.18: They asked the police chief of Buffalo after the shooting, if you were armed with those types of weapons,
