In [1]:
import dotenv
import os

dotenv.load_dotenv(".env")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")


In [2]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)


  from tqdm.autonotebook import tqdm


In [3]:
import pandas as pd
import pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load your DataFrame
df = pd.read_csv("../DATA/Embedded_Files/testing_tokens.csv")  # Replace with your file path

# Ensure the "tokens" column is in the correct format (list of floats)
df["tokens"] = df["tokens"].apply(eval)  # Convert string representation of list to actual list

# Chunk the data using LangChain
chunk_size = 256
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)

def chunk_embeddings(tokens, chunk_size):
    """Splits the embedding list into chunks of specified size."""
    return [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

# Create or connect to a Pinecone index
index_name = "medical-tests-index"


index = pc.Index(index_name)


In [4]:
import os
import fitz  # PyMuPDF
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Function to extract text from PDF and save to txt file
def save_pdf_text(pdf_path, output_folder):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    file_number = len(os.listdir(output_folder)) + 1
    output_path = os.path.join(output_folder, f"{file_number}.txt")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)
    return output_path

# Function to extract headings from text
def extract_headings_and_text(text):
    lines = text.split("\n")
    extracted_data = []
    current_heading = "Unknown"
    buffer = []
    for line in lines:
        if line.isupper() and len(line) > 3:  # Heuristic for headings
            if buffer:
                extracted_data.append((current_heading, " ".join(buffer)))
                buffer = []
            current_heading = line.strip()
        else:
            buffer.append(line.strip())
    if buffer:
        extracted_data.append((current_heading, " ".join(buffer)))
    return extracted_data

# Function to create embeddings
def embeddingCreator(text, modelName="o200k_base"):
    encoding = tiktoken.get_encoding(modelName)
    return encoding.encode(text)

# Function to split embeddings into chunks
def chunk_embeddings(tokens, chunk_size):
    return [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]




In [15]:

# Process PDF files and save text
pdf_folder = "../DATA/Books/New folder"
txt_folder = "../DATA/TextFiles"
os.makedirs(txt_folder, exist_ok=True)


for file in os.listdir(pdf_folder):
    file_path = os.path.join(pdf_folder, file)
    if os.path.isfile(file_path) and file.endswith(".pdf"):
        print(f"Processing PDF: {file}")
        save_pdf_text(file_path, txt_folder)


Processing PDF: 159358_AMAGlossaryofMedicalTerms_Ver1.0.pdf
Processing PDF: FDMT503ABloodChemHndts.pdf
Processing PDF: interpretation-of-full-blood-count-parameters-in-health-and-disease.pdf
Processing PDF: Interpreting+Laboratory+Tests.pdf
Processing PDF: laboratory-reference-ranges.pdf
Processing PDF: Lab_Values_Table_PSAP.pdf
Processing PDF: ln_hematology_mlt_final.pdf
Processing PDF: main.pdf
Processing PDF: Oxford-Handbook-of-Clinical-and-Laboratory-Investigation.pdf


In [6]:

# Process saved text files
txt_folder = "../DATA/TextFiles"
data_to_upsert = []
txt_files = sorted(os.listdir(txt_folder), key=lambda x: int(x.split(".")[0]))
for txt_file in txt_files:
    txt_path = os.path.join(txt_folder, txt_file)
    with open(txt_path, "r", encoding="utf-8") as f:
        text_data = f.read()
    
    extracted_data = extract_headings_and_text(text_data)
    text_data = " ".join([text for _, text in extracted_data])
    tokens = embeddingCreator(text_data)
    token_chunks = chunk_embeddings(tokens, 256)

    heading_index = 0
    for chunk_idx, chunk in enumerate(token_chunks):
        if len(chunk) == 256:
            while heading_index < len(extracted_data) - 1 and len(extracted_data[heading_index][1]) < 256:
                heading_index += 1
            assigned_heading = extracted_data[heading_index][0]
            
            vector_id = f"{txt_file}_chunk_{chunk_idx}"
            metadata = {"test_name": assigned_heading, "source": txt_file, "url": "NaN"}
            data_to_upsert.append({"id": vector_id, "values": chunk, "metadata": metadata})


In [8]:
# Batch upload to Pinecone
batch_size = 100
for i in range(0, len(data_to_upsert), batch_size):
    batch = data_to_upsert[i:i + batch_size]
    # Convert all values in the chunk to float
    for item in batch:
        item['values'] = [float(x) for x in item['values']]
    index.upsert(vectors=batch, namespace="ns1")
    print(f"Uploaded batch {i // batch_size + 1} of {len(data_to_upsert) // batch_size + 1}")

print("All data uploaded successfully!")

Uploaded batch 1 of 24
Uploaded batch 2 of 24
Uploaded batch 3 of 24
Uploaded batch 4 of 24
Uploaded batch 5 of 24
Uploaded batch 6 of 24
Uploaded batch 7 of 24
Uploaded batch 8 of 24
Uploaded batch 9 of 24
Uploaded batch 10 of 24
Uploaded batch 11 of 24
Uploaded batch 12 of 24
Uploaded batch 13 of 24
Uploaded batch 14 of 24
Uploaded batch 15 of 24
Uploaded batch 16 of 24
Uploaded batch 17 of 24
Uploaded batch 18 of 24
Uploaded batch 19 of 24
Uploaded batch 20 of 24
Uploaded batch 21 of 24
Uploaded batch 22 of 24
Uploaded batch 23 of 24
Uploaded batch 24 of 24
All data uploaded successfully!


In [41]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pinecone

def upsert_data_to_pinecone(df, chunk_size=256, batch_size=100):

    # Ensure the "tokens" column is in the correct format (list of floats)
    df["tokens"] = df["tokens"].apply(eval)  # Convert string representation of list to actual list

    # Chunk the data using LangChain
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)

    def chunk_embeddings(tokens, chunk_size):
        """Splits the embedding list into chunks of specified size."""
        return [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

    # Create or connect to a Pinecone index
    index_name = "medical-tests-index"

    index = pc.Index(index_name)

    # Prepare data for upserting into Pinecone
    print("Uploading data to Pinecone...")
    data_to_upsert = []
    for idx, row in df.iterrows():
        token_chunks = chunk_embeddings(row["tokens"], chunk_size)
        
        for chunk_idx, chunk in enumerate(token_chunks):
            if len(chunk) == chunk_size:  # Ensure valid chunk size
                vector_id = f"vec_{idx}_chunk_{chunk_idx}"
                metadata = {
                    "test_name": row["Test Name"],
                    "source": row["Source"],
                    "url": row["URL"]
                }
                # Convert all values in the chunk to float
                chunk = [float(x) for x in chunk]
                data_to_upsert.append({"id": vector_id, "values": chunk, "metadata": metadata})

    # Batch upload
    for i in range(0, len(data_to_upsert), batch_size):
        batch = data_to_upsert[i:i + batch_size]
        index.upsert(vectors=batch)
        if i % 10 == 0:
            print(f"Uploaded batch {i // batch_size + 1} of {len(data_to_upsert) // batch_size + 1}")

    index.upsert(vectors=batch)
    print("Data upload complete!")



In [42]:
df2 = pd.read_csv("../DATA/Embedded_Files/medlinePlus_tokens.csv")  # Replace with your file path

In [43]:
upsert_data_to_pinecone(df2)

Uploading data to Pinecone...
Uploaded batch 1 of 15
Uploaded batch 2 of 15
Uploaded batch 3 of 15
Uploaded batch 4 of 15
Uploaded batch 5 of 15
Uploaded batch 6 of 15
Uploaded batch 7 of 15
Uploaded batch 8 of 15
Uploaded batch 9 of 15
Uploaded batch 10 of 15
Uploaded batch 11 of 15
Uploaded batch 12 of 15
Uploaded batch 13 of 15
Uploaded batch 14 of 15
Uploaded batch 15 of 15
Data upload complete!


In [None]:
# Query Pinecone to count matching chunks
query_filter = {"test_name": {"$eq": "HIV Viral Load"}}

# Fetch metadata-matching chunks
fetch_result = index.query(vector=[0] * chunk_size,  # Dummy vector
                           top_k=10000,  # Fetch a large number to ensure all results are included
                           filter=query_filter,
                           namespace="ns1",
                           include_metadata=True)

# Count matching chunks
matching_chunks_count = len(fetch_result["matches"])
print(f"Number {matching_chunks_count}")


Number of chunks with 'Urine Protein And Urine Protein Creatinine Ratio': 0
