In [2]:
import dotenv
import os

In [3]:
# Load variables from the .env file into the process environment.
dotenv.load_dotenv()

# Gemini API key (passed to genai.configure further down)
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Pinecone API key
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# os.getenv returns None for unset variables; fail here with a clear message
# instead of letting a later API call fail with an opaque authentication error.
_missing_keys = [
    name
    for name, value in (
        ("GEMINI_API_KEY", GEMINI_API_KEY),
        ("PINECONE_API_KEY", PINECONE_API_KEY),
    )
    if not value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

In [35]:
import pandas as pd
import json
# Document load: try a regular JSON document first and fall back to
# JSON Lines (one JSON object per line) when pandas cannot parse the file.
try:
    df = pd.read_json('data.json')
except ValueError:
    # Read explicitly as UTF-8 (the default encoding of open() is
    # platform-dependent) and skip blank lines, which json.loads rejects.
    with open('data.json', 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame(data)

# Preview the first rows to sanity-check the columns.
print(df.head())

   id                                      combined_text      source  \
0   1  Question: Who is at risk for Lymphocytic Chori...  medDataset   
1   2  Question: What are the symptoms of Lymphocytic...  medDataset   
2   3  Question: Who is at risk for Lymphocytic Chori...  medDataset   
3   4  Question: How to diagnose Lymphocytic Choriome...  medDataset   
4   5  Question: What are the treatments for Lymphocy...  medDataset   

                    category  \
0            viral infection   
1            viral infection   
2            viral infection   
3  viral infection diagnosis   
4            viral infection   

                                            entities  \
0  ['lymphocytic choriomeningitis', 'lymphocytic ...   
1  ['Lymphocytic Choriomeningitis Virus', 'Lympho...   
2  ['Lymphocytic Choriomeningitis', 'Lymphocytic ...   
3  ['lymphocytic choriomeningitis', 'lymphocytic ...   
4  ['Lymphocytic Choriomeningitis', 'LCMV', 'asep...   

                                     

In [23]:
import os
import json
import pandas as pd
import google.generativeai as genai
from tqdm.auto import tqdm
from pinecone import ServerlessSpec
from pinecone import Pinecone as pc


In [15]:
# Configure the google.generativeai client with the Gemini key read from the environment above.
genai.configure(api_key=GEMINI_API_KEY)

In [36]:
# --- Input data ---
# JSON file holding the records to embed.
DATA_FILE_PATH = "data.json"

# --- Embedding ---
# Gemini model used to embed each record.
EMBEDDING_MODEL = "models/gemini-embedding-001"
# Length of the vectors stored in the index (gemini embedding).
embedding_dimension = 3072

# --- Vector store ---
# Name of the Pinecone index that receives the vectors.
PINECONE_INDEX_NAME = "medical-rag-index"

In [37]:
# Load the records to index from DATA_FILE_PATH into `df`.
print(f"Loading data from {DATA_FILE_PATH}...")
try:
    try:
        df = pd.read_json(DATA_FILE_PATH)
    except ValueError:
        # Same fallback as the first load cell: the file may be JSON Lines
        # (one JSON object per line) rather than a single JSON document.
        df = pd.read_json(DATA_FILE_PATH, lines=True)
except Exception as e:
    print(f"Error loading JSON file: {e}")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down, whereas an exception stops the run and keeps the traceback.
    raise
print(f"Successfully loaded {len(df)} records.")

Loading data from data.json...
Successfully loaded 9790 records.


In [38]:
# Create the Pinecone client used by the index cells below.
print("Initializing Pinecone...")
# Reuse the key already read from the environment (PINECONE_API_KEY) rather than
# querying os.getenv a second time, mirroring how GEMINI_API_KEY is used.
pine_client = pc(api_key=PINECONE_API_KEY)

Initializing Pinecone...


In [39]:
# Create the serverless index only when it is not listed yet, so this cell
# can be re-run without attempting to create an index that already exists.
if PINECONE_INDEX_NAME not in pine_client.list_indexes().names():

    print("Creating index")
    pine_client.create_index(
        name=PINECONE_INDEX_NAME,
        metric="cosine",
        # Use the configured constant rather than repeating the literal 3072,
        # so the index dimension cannot drift from the embedding configuration.
        dimension=embedding_dimension,
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )
    # Show the configuration of the index that was just created.
    print(pine_client.describe_index(PINECONE_INDEX_NAME))

Creating index
{'deletion_protection': 'disabled',
 'dimension': 3072,
 'host': 'medical-rag-index-3iyzw21.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'medical-rag-index',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'},
 'tags': None,
 'vector_type': 'dense'}


In [40]:
# Get a handle to the index; it is used below for upserts and statistics.
index = pine_client.Index(PINECONE_INDEX_NAME)
# Print the current index statistics (e.g. total_vector_count) before ingestion.
print(index.describe_index_stats())

{'dimension': 3072,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [41]:
# Number of records embedded and upserted per iteration of the ingestion loop.
BATCH_SIZE = 100
print(f"Starting to generate embeddings and upsert to Pinecone in batches of {BATCH_SIZE}...")

Starting to generate embeddings and upsert to Pinecone in batches of 100...


In [42]:
def _metadata_value(record, key, default):
    """Return ``record[key]``, or ``default`` when the key is absent or the value is null.

    The original defaults only applied when a column was missing entirely; a
    per-row None/NaN would have been sent to Pinecone unchanged.
    NOTE(review): Pinecone is documented to reject null metadata values - confirm
    against the client version in use.
    """
    value = record.get(key, default)
    # Lists and other containers are passed through untouched; only scalar
    # nulls (None, NaN, pd.NA) are replaced by the default.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return default
    return value


def _build_vectors(batch_df, embeddings):
    """Pair every row of ``batch_df`` with its embedding and build upsert payloads.

    Args:
        batch_df: slice of the source DataFrame; must contain an "id" column.
        embeddings: one embedding per row of ``batch_df``, in row order.

    Returns:
        A list of dicts with "id", "values" and "metadata" keys.

    Raises:
        ValueError: if the number of embeddings differs from the number of rows.
    """
    records = batch_df.to_dict("records")
    if len(records) != len(embeddings):
        raise ValueError(
            f"Expected {len(records)} embeddings but received {len(embeddings)}"
        )

    vectors = []
    # Rows and embeddings are matched by position, which stays correct even
    # when the DataFrame index contains duplicate labels.
    for record, embedding_vector in zip(records, embeddings):
        vectors.append({
            # Pinecone requires a string ID
            "id": str(record["id"]),
            "values": embedding_vector,
            "metadata": {
                "text": _metadata_value(record, "combined_text", ""),
                "source": _metadata_value(record, "source", ""),
                "category": _metadata_value(record, "category", ""),
                "entities": _metadata_value(record, "entities", "[]"),
                "keywords": _metadata_value(record, "keywords", "[]"),
                "safety_level": _metadata_value(record, "safety_level", "medium"),
                "char_count": _metadata_value(record, "char_count", 0),
            },
        })
    return vectors


# Embed the records batch by batch and upsert each batch into the index.
for i in tqdm(range(0, len(df), BATCH_SIZE)):
    batch_df = df.iloc[i:i + BATCH_SIZE]

    # Get the text content to embed
    texts_to_embed = batch_df["combined_text"].tolist()

    # Embed the whole batch in a single call
    result = genai.embed_content(
        model=EMBEDDING_MODEL,
        content=texts_to_embed,
        task_type="RETRIEVAL_DOCUMENT"
    )
    embeddings = result['embedding']

    # Prepare data for Pinecone upsert
    vectors_to_upsert = _build_vectors(batch_df, embeddings)

    # Upsert the batch to Pinecone
    index.upsert(vectors=vectors_to_upsert)

print("\nUpsert process completed.")

  0%|          | 0/98 [00:00<?, ?it/s]


Upsert process completed.


In [43]:
# Print the index statistics again after the upsert loop to check the stored vector count.
print(index.describe_index_stats())

{'dimension': 3072,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 9790}},
 'total_vector_count': 9790,
 'vector_type': 'dense'}
