In [3]:
import dotenv
import os

dotenv.load_dotenv(".env")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")


In [4]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)


  from tqdm.autonotebook import tqdm


In [None]:
import pandas as pd
import pinecone

# Load your DataFrame
df = pd.read_csv("../DATA/Embedded_Files/testing_tokens.csv")  # Replace with your file path

# Ensure the "Embedding" column is in the correct format (list of floats)
df["tokens"] = df["tokens"].apply(eval)  # Convert string representation of list to actual list

# Check the dimension of the vectors
vector_dimension = len(df["tokens"].iloc[0])
print(f"Vector dimension: {vector_dimension}")

# Create or connect to a Pinecone index
index_name = "medical-tests-index"

pc.create_index(
    name=index_name,
    dimension=vector_dimension, # Use the correct dimension
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)
index = pc.Index(index_name)

# Prepare data for upsert
vectors_to_upsert = []
for idx, row in df.iterrows():
    # Extract embedding and metadata
    embedding = [float(x) for x in row["tokens"]]
    metadata = {
        "test_name": row["Test Name"],
        "source": row["Source"],
        "url": row["URL"]
    }
    
    # Create a unique ID for each vector (e.g., using the row index)
    vector_id = f"vec_{idx}"
    
    # Append to the list of vectors to upsert
    vectors_to_upsert.append((vector_id, embedding, metadata))

# Upsert data into Pinecone
index.upsert(vectors=vectors_to_upsert)

print(f"Upserted {len(vectors_to_upsert)} vectors into Pinecone index '{index_name}'.")

In [11]:
!pip install langchain


Collecting langchain
  Downloading langchain-0.3.16-py3-none-any.whl.metadata (7.1 kB)
Collecting PyYAML>=5.3 (from langchain)
  Using cached PyYAML-6.0.2-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Using cached SQLAlchemy-2.0.37-cp311-cp311-win_amd64.whl.metadata (9.9 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Using cached aiohttp-3.11.11-cp311-cp311-win_amd64.whl.metadata (8.0 kB)
Collecting langchain-core<0.4.0,>=0.3.32 (from langchain)
  Downloading langchain_core-0.3.32-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.3 (from langchain)
  Downloading langchain_text_splitters-0.3.5-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.4,>=0.1.17 (from langchain)
  Downloading langsmith-0.3.2-py3-none-any.whl.metadata (14 kB)
Collecting numpy<2,>=1.22.4 (from langchain)
  Using cached numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata (61 kB)
Collecting pydantic<3.0.0,>=2.7.4 (from langc

  You can safely remove it manually.
  You can safely remove it manually.


In [29]:
import pandas as pd
import pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load your DataFrame
df = pd.read_csv("../DATA/Embedded_Files/testing_tokens.csv")  # Replace with your file path

# Ensure the "tokens" column is in the correct format (list of floats)
df["tokens"] = df["tokens"].apply(eval)  # Convert string representation of list to actual list

# Chunk the data using LangChain
chunk_size = 256
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)

def chunk_embeddings(tokens, chunk_size):
    """Splits the embedding list into chunks of specified size."""
    return [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

# Create or connect to a Pinecone index
index_name = "medical-tests-index"

pc.create_index(
    name=index_name,
    dimension=chunk_size, # Use the correct dimension
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

index = pc.Index(index_name)


In [41]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pinecone

def upsert_data_to_pinecone(df, chunk_size=256, batch_size=100):

    # Ensure the "tokens" column is in the correct format (list of floats)
    df["tokens"] = df["tokens"].apply(eval)  # Convert string representation of list to actual list

    # Chunk the data using LangChain
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)

    def chunk_embeddings(tokens, chunk_size):
        """Splits the embedding list into chunks of specified size."""
        return [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

    # Create or connect to a Pinecone index
    index_name = "medical-tests-index"

    index = pc.Index(index_name)

    # Prepare data for upserting into Pinecone
    print("Uploading data to Pinecone...")
    data_to_upsert = []
    for idx, row in df.iterrows():
        token_chunks = chunk_embeddings(row["tokens"], chunk_size)
        
        for chunk_idx, chunk in enumerate(token_chunks):
            if len(chunk) == chunk_size:  # Ensure valid chunk size
                vector_id = f"vec_{idx}_chunk_{chunk_idx}"
                metadata = {
                    "test_name": row["Test Name"],
                    "source": row["Source"],
                    "url": row["URL"]
                }
                # Convert all values in the chunk to float
                chunk = [float(x) for x in chunk]
                data_to_upsert.append({"id": vector_id, "values": chunk, "metadata": metadata})

    # Batch upload
    for i in range(0, len(data_to_upsert), batch_size):
        batch = data_to_upsert[i:i + batch_size]
        index.upsert(vectors=batch)
        if i % 10 == 0:
            print(f"Uploaded batch {i // batch_size + 1} of {len(data_to_upsert) // batch_size + 1}")

    index.upsert(vectors=batch)
    print("Data upload complete!")



In [42]:
df2 = pd.read_csv("../DATA/Embedded_Files/medlinePlus_tokens.csv")  # Replace with your file path

In [43]:
upsert_data_to_pinecone(df2)

Uploading data to Pinecone...
Uploaded batch 1 of 15
Uploaded batch 2 of 15
Uploaded batch 3 of 15
Uploaded batch 4 of 15
Uploaded batch 5 of 15
Uploaded batch 6 of 15
Uploaded batch 7 of 15
Uploaded batch 8 of 15
Uploaded batch 9 of 15
Uploaded batch 10 of 15
Uploaded batch 11 of 15
Uploaded batch 12 of 15
Uploaded batch 13 of 15
Uploaded batch 14 of 15
Uploaded batch 15 of 15
Data upload complete!


In [None]:
# Query Pinecone to count matching chunks
query_filter = {"test_name": {"$eq": "HIV Viral Load"}}

# Fetch metadata-matching chunks
fetch_result = index.query(vector=[0] * chunk_size,  # Dummy vector
                           top_k=10000,  # Fetch a large number to ensure all results are included
                           filter=query_filter,
                           namespace="ns1",
                           include_metadata=True)

# Count matching chunks
matching_chunks_count = len(fetch_result["matches"])
print(f"Number {matching_chunks_count}")


Number of chunks with 'Urine Protein And Urine Protein Creatinine Ratio': 0
