#### `Semantic Search with Pinecone and Embeddings using HuggingFace Model`

In [1]:
import os
import pinecone
from pinecone import Pinecone, ServerlessSpec
import pandas as pd
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

* `Reading Data`

In [2]:
FOLDER_PATH = os.path.join(os.getcwd(), 'dataset-semantic')
df = pd.read_csv(os.path.join(FOLDER_PATH, 'articles_new.csv'))
df['class'] = ['class-a', 'class-b'] * 250
df.head()

Unnamed: 0,title,id,class
0,Mental Note Vol. 24,3054,class-a
1,Your Brain On Coronavirus,3055,class-b
2,Mind Your Nose,3056,class-a
3,The 4 Purposes of Dreams,3057,class-b
4,Surviving a Rod Through the Head,3058,class-a


* `Embeddings using HuggingFace Model`

In [3]:
model_hugging = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2', device='cpu')
model_hugging

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [4]:
vect_length = len(model_hugging.encode(df['title'][0]))
print('Length of Embedding model: {}'.format(vect_length))

Length of Embedding model: 384


* `Pinecode In Code`

In [5]:
_ = load_dotenv(override=True)
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

In [11]:
pinecone = Pinecone(api_key=PINECONE_API_KEY)

try:
    print('Deleting all indexes')
    _ = [pinecone.delete_index(name=index_name['name']) for index_name in pinecone.list_indexes()]
except Exception as e:
    print('Error In Deleting Indexes: {}'.format(e))
    
    
index_name = 'semantic-search-course'
if index_name not in pinecone.list_indexes():
    print('Creating Index: {}'.format(index_name))
    pinecone.create_index(
        name=index_name,
        dimension=vect_length,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )
    print('Done Creating Index: {}'.format(index_name))
    
    
index = pinecone.Index(index_name)
index

Deleting all indexes
Creating Index: semantic-search-course
Done Creating Index: semantic-search-course


<pinecone.data.index.Index at 0x236eef431f0>

* `Upserting In Pinecone`

In [16]:
faild_ids = []
batch_size = 16
for batch_start in range(0, len(df), batch_size):
    
    try:
        batch_end = min(len(df), batch_start + batch_size)
        
        title_batch = df['title'][batch_start:batch_end].tolist()
        ids_batch = df['id'][batch_start:batch_end].astype(str).tolist()
        metadata_classes = df['class'][batch_start:batch_end].tolist()
        
        # Get Embedding using model
        embeds_batch = model_hugging.encode(title_batch).tolist() 
        
        
        
        # Prepare Dataset to Upserting
        to_upsert = [(ids, embeds, {'class': cls})
                                    for ids, embeds, cls in zip(ids_batch, embeds_batch, metadata_classes)]
        
        # Upserting to pinecone
        index.upsert(to_upsert)
    except Exception as e:
        print('Error In Upserting: {}'.format(e))
        faild_ids.append(ids_batch)
    

* `Query In Pinecone`

In [64]:
query_text = 'Neutral Technology'

# Embedding using model
query_embeds = model_hugging.encode(query_text).tolist()

# Search In Pinecone
result = index.query(vector=[query_embeds], top_k=5, include_metadata=True,)
result['matches']

[{'id': '3552',
  'metadata': {'class': 'class-a'},
  'score': 0.773610115,
  'values': []},
 {'id': '3368',
  'metadata': {'class': 'class-a'},
  'score': 0.36518091,
  'values': []},
 {'id': '3393',
  'metadata': {'class': 'class-b'},
  'score': 0.345991075,
  'values': []},
 {'id': '3107',
  'metadata': {'class': 'class-b'},
  'score': 0.330545843,
  'values': []},
 {'id': '3524',
  'metadata': {'class': 'class-a'},
  'score': 0.269790083,
  'values': []}]

* `Deleting In Pinecone`

In [None]:
# Deleting Records
_ = index.delete(ids=['3087', '3144'])

* `Fetching In Pinecone`

In [43]:
index.fetch(ids=['3191'])['vectors']['3191']['values'][:10]

[-0.0863685459,
 -0.0565062389,
 -0.0596554354,
 0.0517030247,
 0.0311833397,
 -0.106662802,
 -0.0539679676,
 0.0847911164,
 -0.0917322934,
 0.0162194]

* `Update In Pinecone`

In [65]:
text_updating = 'Osama Abo Bakr'
embeds_updating = model_hugging.encode(text_updating).tolist()

_ = index.update(
    id='3191',
    values=embeds_updating
)

In [66]:
index.fetch(ids=['3191'])['vectors']['3191']['values'][:10]

[-0.0474119,
 0.0783691928,
 -0.0581575111,
 0.047289934,
 -0.0269384403,
 -0.0185756385,
 0.10783001,
 0.00541882776,
 -0.0265732408,
 0.0184976868]

* `Updating Using Upsert`

In [61]:
text_updating = 'Osama Abo Bakr'
embeds_updating = model_hugging.encode(text_updating).tolist()

print(embeds_updating)

_ = index.upsert(
    vectors=[('3191', embeds_updating)]
)

index.fetch(ids=['3191'])['vectors']['3191']['values'][:10]

[-0.047411900013685226, 0.07836919277906418, -0.05815751105546951, 0.04728993400931358, -0.026938440278172493, -0.01857563853263855, 0.10783001035451889, 0.005418827757239342, -0.026573240756988525, 0.018497686833143234, -0.013193309307098389, -0.03892706707119942, 0.067252017557621, -0.005205471999943256, 0.00435540871694684, 0.13630607724189758, 0.035901233553886414, 0.03279680386185646, -0.018097978085279465, 0.004009360913187265, 0.020394407212734222, 0.01781482622027397, 0.05665455758571625, 0.029246747493743896, -0.03364397957921028, 0.0023684061598032713, 0.0679248720407486, 0.06725198030471802, -0.017632409930229187, 0.0058637396432459354, 0.002474074950441718, 0.01614825241267681, 0.07860083132982254, -0.03003884293138981, 0.007647181395441294, 0.025310400873422623, 0.03912106528878212, 0.051341909915208817, 0.015569137409329414, -0.01429154071956873, 0.016430802643299103, -0.05383088067173958, 0.0016967522678896785, -0.009152666665613651, 0.08717261254787445, -0.0558974109590

[-0.0474119,
 0.0783691928,
 -0.0581575111,
 0.047289934,
 -0.0269384403,
 -0.0185756385,
 0.10783001,
 0.00541882776,
 -0.0265732408,
 0.0184976868]