In [1]:
import pinecone


In [2]:
from RAG.utils.common import setup_env


# Project helper imported from RAG.utils.common. Presumably it prepares the
# process environment (e.g. the PINECONE_API_KEY read further down) — TODO
# confirm against the helper's source.
setup_env()

In [3]:
import os
from pinecone import Pinecone, ServerlessSpec

# Build the Pinecone client. The API key is taken from the environment, so a
# missing PINECONE_API_KEY fails here with a KeyError.
pc = Pinecone(
    api_key=os.environ["PINECONE_API_KEY"],
)



In [16]:
index_name = "data-index"

# Start from a clean slate so the notebook can be re-run from the top.
# NOTE: this deletes the existing index and every vector stored in it.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

pc.create_index(
    name=index_name,
    # Must equal the length of every vector written to this index. The demo
    # vectors upserted into namespaces "ns1"/"ns2" in this notebook have two
    # components; a different value here makes those upserts fail.
    dimension=2,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)


In [9]:
import torch


# Handle to the index named by `index_name`; the upserts below go through it.
index = pc.Index(index_name)

# Namespace "ns1": three 2-component demo vectors.
index.upsert(
    vectors=[
        {"id": "vec1", "values": [1.0, 1.5]},
        {"id": "vec2", "values": [2.0, 1.0]},
        {"id": "vec3", "values": [0.1, 3.0]},
    ],
    namespace="ns1",
)

# Namespace "ns2": same ids as "ns1", different values.
index.upsert(
    vectors=[
        # Pinecone documents `values` as a list of floats, so convert the
        # tensor explicitly instead of relying on the client to coerce it.
        {"id": "vec1", "values": torch.Tensor([3.099999, 1.0]).tolist()},
        {"id": "vec2", "values": [3.0, -2.0]},
        {"id": "vec3", "values": [0.5, -1.5]},
    ],
    namespace="ns2",
)


{'upserted_count': 3}

In [12]:
# Length of the tensor along its first (only) axis.
torch.Tensor([3.099999, 1.0]).size(0)

2

In [7]:
# Fetch the index statistics, then print them.
index_stats = index.describe_index_stats()
print(index_stats)

# Printed output recorded for this cell (3 vectors in each of two namespaces):
# {'dimension': 2,
#  'index_fullness': 0.0,
#  'namespaces': {'ns1': {'vector_count': 3}, 'ns2': {'vector_count': 3}},
#  'total_vector_count': 6}


{'dimension': 2,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 3}, 'ns2': {'vector_count': 3}},
 'total_vector_count': 6}


In [17]:
from sentence_transformers import SentenceTransformer

# Load the pretrained "all-mpnet-base-v2" sentence-embedding model on the CPU.
embedding_model = SentenceTransformer(
    model_name_or_path="all-mpnet-base-v2",
    device="cpu",
)

[2024-06-03 12:49:42,358: INFO: SentenceTransformer: Load pretrained SentenceTransformer: all-mpnet-base-v2]




In [22]:
# Embed two throwaway strings with `embedding_model`; `convert_to_tensor=True`
# asks for the result as a tensor.
text_chunks_embeddings = embedding_model.encode(
    sentences=["hlo,this is java", "hla,ff,dfdsf sdfd"],
    convert_to_tensor=True,
    batch_size=32,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
from sentence_transformers import SentenceTransformer

# Sample strings to embed; `embedding_model` must already be defined by an
# earlier cell (it is not loaded here).
texts = ["hlo,this is java", "hla,ff,dfdsf sdfd"]

# Embed the samples and request the result as a tensor.
text_chunks_embeddings = embedding_model.encode(texts, batch_size=32, convert_to_tensor=True)

# Print the size of the last axis of the result (768 in the recorded run),
# not the embeddings themselves.
embedding_dim = text_chunks_embeddings.shape[-1]
print(embedding_dim)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

768


In [27]:
# Iterate over the first axis of the embeddings and collect each row in a list.
list(text_chunks_embeddings)

[tensor([ 5.8442e-03, -1.0187e-02, -1.6322e-02,  1.6710e-02,  5.1998e-02,
          7.7586e-03, -2.3198e-02,  1.2283e-02,  6.0619e-02, -1.3371e-02,
         -2.8602e-02,  6.0705e-02,  4.1096e-02,  2.6600e-02, -1.3797e-02,
         -6.8680e-02,  3.3351e-02,  9.3601e-03,  3.4823e-02,  4.0274e-02,
          8.2890e-03,  1.8189e-02,  2.8010e-03,  4.6494e-02,  3.3717e-02,
         -3.4791e-02, -6.0934e-02, -1.0105e-02, -3.6036e-02, -1.0975e-02,
         -6.9552e-03,  5.7988e-02, -2.3155e-02,  7.0431e-03,  1.8792e-06,
         -3.9408e-02,  2.5526e-02,  3.4057e-02, -3.2963e-02,  4.8075e-02,
         -3.2240e-02,  5.0653e-02, -3.5866e-02,  2.7519e-03,  7.1662e-03,
         -2.0072e-02,  3.9645e-02, -1.4715e-02,  4.2030e-02,  6.1112e-02,
          1.7971e-02,  7.3310e-03,  4.8076e-03, -1.1830e-02,  7.9378e-02,
          1.2216e-02, -8.0887e-04,  3.3244e-02,  4.9288e-02,  1.0247e-02,
         -1.4559e-03, -7.6025e-03, -1.1716e-02, -1.6309e-03,  3.0789e-02,
          2.9121e-03,  1.0984e-02,  2.

In [5]:
# Split the integers 0..19 into consecutive chunks of at most 15 items.
numbers = list(range(20))
chunk_size = 15
[numbers[start:start + chunk_size] for start in range(0, len(numbers), chunk_size)]

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], [15, 16, 17, 18, 19]]