In [6]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
Downloading multiprocess-0.70.16-py312-none-any.whl (146 kB)
Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl (30 kB)
Installing collected packages: xxhash, multiprocess, datasets
Successfully installed datasets-3.1.0 multiprocess-0.70.16 xxhash-3.5.0


In [7]:
import os
import time
import uuid
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from datasets import load_dataset

# Load environment variables
load_dotenv()

# Set up Pinecone API key and index name
api_key = os.getenv("PINECONE_API_KEY")
index_name = "l6v2"
custom_namespace = 'chatbot-legal'

# Initialize Pinecone with the API key
pc = Pinecone(api_key=api_key)
index = pc.Index(index_name)

# Check initial index stats
print("BEFORE:", index.describe_index_stats())

# Initialize the embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load the dataset from Hugging Face
dataset = load_dataset("nisaar/Lawyer_GPT_India", split="train")

# Iterate through each row in the dataset
for row in dataset:
    question = row["question"]  # Question
    answer = row["answer"]  # Answer

    # Generate embeddings for both the question and answer
    embeddings = model.encode([question, answer])

    # Upsert the embeddings into Pinecone with unique IDs and metadata
    index.upsert(
        vectors=[
            # Embedding for the question (question)
            {
                "id": str(uuid.uuid4()),
                "values": embeddings[0].tolist(),
                "metadata": {
                    "question": question,
                    "answer": answer,
                    "created_at": int(time.time())
                },
            },
            # Embedding for the answer (answer)
            {
                "id": str(uuid.uuid4()),
                "values": embeddings[1].tolist(),
                "metadata": {
                    "question": question,
                    "answer": answer,
                    "created_at": int(time.time())
                },
            },
        ],
        namespace=custom_namespace,
    )

# Check index stats after upserting
print("Record count AFTER adding knowledge ->", index.describe_index_stats())


BEFORE: {'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


150_lawergpt_dataset_qna_v1_train.jsonl:   0%|          | 0.00/72.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/150 [00:00<?, ? examples/s]

Record count AFTER adding knowledge -> {'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'chatbot-legal': {'vector_count': 424}},
 'total_vector_count': 424}
