## **Install DB and Embedding Model**

In [2]:
!pip install pinecone-client -q
!pip install sentence_transformers -q # Embedding model ( sentence transformer )

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25h

## **Making Embeddings**

### **Load Data**

In [7]:
cricket_news = """
The T20 World Cup 2024 is in full swing, bringing excitement and drama to cricket fans worldwide.
India's team, captained by Rohit Sharma, is preparing for a crucial match against Ireland, with standout player Jasprit Bumrah expected to play a pivotal role in their campaign.
The tournament has already seen controversy, particularly concerning the pitch conditions at Nassau County International Cricket Stadium in New York, which came under fire after a low-scoring game between Sri Lanka and South Africa.
"""

football_news = """
The world of football is buzzing with excitement as major tournaments and league matches continue to captivate fans globally.
In the UEFA Champions League, the semi-final matchups have been set, with defending champions Real Madrid set to face Manchester City, while Bayern Munich will take on Paris Saint-Germain.
Both ties promise thrilling encounters, featuring some of the best talents in world football.
"""

election_news = """
As election season heats up, the latest developments reveal a highly competitive atmosphere across several key races.
The presidential election has seen intense campaigning from all major candidates, with recent polls indicating a tight race.
Incumbent President Jane Doe is seeking re-election on a platform of economic stability and healthcare reform, while her main rival, Senator John Smith, focuses on education and climate change initiatives."""


ai_revolution_news = """
The AI revolution continues to transform industries and reshape the global economy.
Significant advancements in artificial intelligence have led to breakthroughs in healthcare, with AI-driven diagnostics improving patient outcomes and reducing costs.
Autonomous systems are becoming increasingly prevalent in logistics and transportation, enhancing efficiency and safety."""


## **Sentence Transformer**

In [4]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('all-mpnet-base-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### **Perform Embeddings**

In [9]:
embeddings = embedding_model.encode([cricket_news, football_news, election_news, ai_revolution_news])

embeddings

array([[-0.02901842,  0.0192444 , -0.0181424 , ...,  0.00644327,
        -0.01740812, -0.01381658],
       [-0.00384662, -0.07271519, -0.00284145, ..., -0.02027755,
         0.02123847, -0.03015987],
       [-0.02962372,  0.05711373,  0.01119961, ...,  0.0131924 ,
         0.02634867,  0.01807423],
       [-0.01667612,  0.05068192, -0.05662728, ..., -0.00878626,
        -0.02318501, -0.04949613]], dtype=float32)

In [10]:
len(embeddings[0])  # dimension of embeddings in 'all-mpnet-base-v2' model is 768

768

### **Create Environmental Variable**

In [61]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [64]:
import os
from dotenv import load_dotenv

load_dotenv()

api_key = os.getenv("PINECORNE_API_KEY")

## **Store Embeddings In Pincone Database**

In [65]:
from pinecone import Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=api_key)
spec = ServerlessSpec(cloud="aws", region="us-east-1")

### **Create Index**

In [66]:
pc.create_index("example-index", dimension=768, metric="cosine", spec=spec)

### **Use Index**

In [67]:
# See the index
pc.list_indexes()

{'indexes': [{'dimension': 768,
              'host': 'example-index-u37s0o9.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'example-index',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [68]:
# use index
index= pc.Index("example-index")

In [69]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### **Add Data To Pinecone Index**

In [70]:
index.upsert([
    {"id": "id1", "values": embeddings[0], "metadata": {"source": "cricket"}},
    {"id": "id2", "values": embeddings[1], "metadata": {"source": "football"}},
    {"id": "id3", "values": embeddings[2], "metadata": {"source": "election"}},
    {"id": "id4", "values": embeddings[3], "metadata": {"source": "ai_revolution"}}
])

{'upserted_count': 4}

In [71]:
index.describe_index_stats() # This can also see in pinecone indexes site

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4}},
 'total_vector_count': 4}

## **Similarity Search**

In [72]:
query = "technology"
query_embedding = embedding_model.encode([query])

In [73]:
len(query_embedding[0])

768

In [74]:
# convert to list
query_embedding = query_embedding[0].tolist()

In [75]:
# check similar document from vector database
index.query(vector=query_embedding, top_k=1, include_values=False, include_metadata=True) 

{'matches': [{'id': 'id4',
              'metadata': {'source': 'ai_revolution'},
              'score': 0.218479618,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

In [76]:
index.query(vector=query_embedding, top_k=2, include_values=False, include_metadata=True) 

{'matches': [{'id': 'id4',
              'metadata': {'source': 'ai_revolution'},
              'score': 0.218479618,
              'values': []},
             {'id': 'id1',
              'metadata': {'source': 'cricket'},
              'score': 0.0995326,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

## **CRUD Operations**

### **Add Data**

In [77]:
blockchain_news = """
The blockchain industry continues to evolve rapidly, marked by significant technological advancements and regulatory developments.
This month, the spotlight is on the launch of Ethereum 3.0, which promises enhanced scalability and security features.
This upgrade is expected to drastically reduce transaction fees and increase processing speeds, making decentralized applications (dApps) more efficient and user-friendly.
"""

embedding_query = embedding_model.encode([blockchain_news])
embedding_query = embedding_query[0].tolist()

In [78]:
embedding_query

[0.02840227261185646,
 -0.0021523695904761553,
 -0.010122592560946941,
 -0.011638355441391468,
 -0.038507044315338135,
 -0.019574182108044624,
 0.04022802412509918,
 0.017896199598908424,
 -0.03143160790205002,
 -0.013866004534065723,
 0.03915023058652878,
 0.0746336355805397,
 -0.028726568445563316,
 0.07695430517196655,
 0.025922685861587524,
 -0.05927301198244095,
 0.060219887644052505,
 0.00616870354861021,
 -0.0014248922234401107,
 -0.0057087186723947525,
 0.05366029962897301,
 -0.04634955897927284,
 -0.014793291687965393,
 -0.05512924864888191,
 0.039273906499147415,
 -0.010504324920475483,
 0.013568432070314884,
 0.04381578788161278,
 -0.03908746689558029,
 -0.04724558815360069,
 0.011776000261306763,
 0.04276015982031822,
 -0.06342216581106186,
 0.038481224328279495,
 2.4313781068485696e-06,
 -0.022821687161922455,
 -0.04746832698583603,
 0.04970318451523781,
 -0.07722826302051544,
 0.027682799845933914,
 -0.04655935987830162,
 -0.014909307472407818,
 -0.009227187372744083,
 0.

In [79]:
len(embedding_query)

768

In [80]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4}},
 'total_vector_count': 4}

In [81]:
# add to database
index.upsert([
    {"id": "id5", "values": embedding_query, "metadata": {"source": "blockchain"}}
])

{'upserted_count': 1}

In [82]:
# check the status
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

### **Read Data**

In [83]:
results = index.fetch(ids=["id1", "id2"])

results

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'id1': {'id': 'id1',
                     'metadata': {'source': 'cricket'},
                     'values': [-0.0290184207,
                                0.0192444045,
                                -0.0181424,
                                0.0633586347,
                                -0.0395194106,
                                -0.0544055477,
                                -0.0402265,
                                -0.0219997242,
                                -0.0467268974,
                                0.041237697,
                                0.013172416,
                                -0.0143096773,
                                0.0422757305,
                                0.0756244436,
                                0.0763016567,
                                -0.0530844629,
                                -0.0126223583,
                                -0.0420723483,
                                

### **Update Data**

In [84]:
embedding_query = embedding_model.encode("This is sample document about generative AI").tolist()

index.upsert([
    ("id3", embedding_query, {"source": "gen ai"})
])

{'upserted_count': 1}

In [85]:
result = index.fetch(ids=["id3"])
result

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'id3': {'id': 'id3',
                     'metadata': {'source': 'gen ai'},
                     'values': [0.0174305327,
                                -0.0290010683,
                                -0.0335818306,
                                -0.000871601922,
                                -0.042215541,
                                0.0295835342,
                                0.0745305195,
                                -0.00894235168,
                                -0.0314410478,
                                0.0250650588,
                                0.0490644723,
                                -0.0251699109,
                                0.022712402,
                                0.0440000631,
                                0.0629395,
                                -0.0993508771,
                                0.0334789827,
                                0.0161244776,
                              

### **Detete Data**

In [86]:
index.delete(ids=["id3"])

{}

In [87]:
index.fetch(ids=["id3"])

{'namespace': '', 'usage': {'read_units': 1}, 'vectors': {}}

## **Delete Whole Index**

In [88]:
pc.delete_index("example-index")