In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Load both raw scraped CSVs in one pass: `df` receives the
# medical_tests_interpretation scrape and `df2` the testing_scraped_content one.
df, df2 = (
    pd.read_csv(scraped_csv)
    for scraped_csv in (
        "../DATA/Scraped_Data/medical_tests_interpretation.csv",
        "../DATA/Scraped_Data/testing_scraped_content.csv",
    )
)

In [18]:
import pandas as pd
import re

def chunk_text(text, chunk_size=128):
    """
    Break `text` into space-joined groups of at most `chunk_size` words.

    Words are the runs of word characters matched by the regex below, so
    punctuation is dropped. The final group may be shorter than `chunk_size`,
    and text without any words yields an empty list.
    """
    tokens = re.findall(r'\b\w+\b', text)
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

def process_csv(df, output_csv, source):
    """
    Split every row's description into 128-word chunks and save them as a CSV.

    Each chunk becomes its own output row carrying the originating row's
    'Test Name' and 'URL' plus the given `source` label. Rows whose
    description is missing or contains no words produce no output rows.

    Args:
        df: DataFrame with 'Test Name', 'Description' and 'URL' columns.
            It is read only; the caller's frame is not modified.
        output_csv: Destination path of the chunked CSV.
        source: Label written to the 'Source' column of every output row.
    """
    output_columns = ['Test Name', 'Description', 'Source', 'URL']

    # Fill missing descriptions on a separate Series so the caller's frame is
    # left untouched, and cast to str because chunk_text runs a regex over it.
    descriptions = df['Description'].fillna('').astype(str)

    new_rows = []
    for test_name, description, url in zip(df['Test Name'], descriptions, df['URL']):
        # One output row per 128-word chunk of this description.
        for chunk in chunk_text(description, chunk_size=128):
            new_rows.append({
                'Test Name': test_name,
                'Description': chunk,
                'Source': source,
                'URL': url
            })

    # Passing the columns explicitly keeps the header row even when no chunks
    # were produced, so the file can still be loaded with pd.read_csv.
    new_df = pd.DataFrame(new_rows, columns=output_columns)
    new_df.to_csv(output_csv, index=False)


In [19]:
# Write the chunks to the directory the embedding step loads them from
# ("../DATA/Chunked_data/"), so a fresh Restart & Run All reads this output.
process_csv(df, "../DATA/Chunked_data/medlineplus_chunks.csv", source="medlineplus")

In [None]:
# Write the chunks to the directory the embedding step loads them from
# ("../DATA/Chunked_data/"), so a fresh Restart & Run All reads this output.
process_csv(df2, "../DATA/Chunked_data/testing_chunks.csv", source="testing.com")

# Creating Embeddings

In [12]:
import pandas as pd
import concurrent.futures
from sentence_transformers import SentenceTransformer

# Read the three chunked datasets in a single unpacking.
# NOTE: `df2` is rebound here; earlier in the notebook the same name holds the
# raw testing_scraped_content scrape.
df1, df2, df3 = (
    pd.read_csv(chunk_csv)
    for chunk_csv in (
        "../DATA/Chunked_data/medlineplus_chunks.csv",
        "../DATA/Chunked_data/testing_chunks.csv",
        "../DATA/Chunked_data/files_chunks.csv",
    )
)

# Build the embedding model a single time; every embeddingCreator call reuses it.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def embeddingCreator(text):
    """
    Encode `text` with the module-level `embedding_model` and return the
    result converted to a plain Python list via `.tolist()`.
    """
    vector = embedding_model.encode(text)
    return vector.tolist()

def Scrapper_With_Source(df, source):
    """
    Embed every row's description and attach metadata, using one fixed source.

    Args:
        df: DataFrame with "Description", "Test Name" and "URL" columns.
        source: Value stored in the "Source" field of every output row.

    Returns:
        DataFrame with columns "Test Name", "tokens", "text", "Source" and
        "URL", one row per input row. "tokens" holds whatever
        embeddingCreator returns for the row's description. An empty input
        yields an empty frame that still has these columns.
    """
    output_columns = ["Test Name", "tokens", "text", "Source", "URL"]
    total = len(df)
    data = []

    # Count rows by position rather than by index label, so progress reporting
    # stays correct (and `% 100` stays valid) for any DataFrame index.
    fields = zip(df["Description"], df["Test Name"], df["URL"])
    for position, (text, test_name, url) in enumerate(fields):
        tokens = embeddingCreator(text)
        data.append({"Test Name": test_name, "tokens": tokens, "text": text, "Source": source, "URL": url})

        if position % 100 == 0:  # Print progress every 100 rows
            print(f"Processed {position}/{total} rows (Source: {source})")

    return pd.DataFrame(data, columns=output_columns)

def Scrapper_With_Out_Source(df):
    """
    Embed every row's description and attach metadata, taking the source of
    each row from the frame's own "Source" column.

    Args:
        df: DataFrame with "Description", "Test Name", "URL" and "Source"
            columns.

    Returns:
        DataFrame with columns "Test Name", "tokens", "text", "Source" and
        "URL", one row per input row. "tokens" holds whatever
        embeddingCreator returns for the row's description. An empty input
        yields an empty frame that still has these columns.
    """
    output_columns = ["Test Name", "tokens", "text", "Source", "URL"]
    total = len(df)
    data = []

    # Count rows by position rather than by index label, so progress reporting
    # stays correct (and `% 100` stays valid) for any DataFrame index.
    fields = zip(df["Description"], df["Test Name"], df["URL"], df["Source"])
    for position, (text, test_name, url, source) in enumerate(fields):
        tokens = embeddingCreator(text)
        data.append({"Test Name": test_name, "tokens": tokens, "text": text, "Source": source, "URL": url})

        if position % 100 == 0:  # Print progress every 100 rows
            print(f"Processed {position}/{total} rows (Dynamic Source)")

    return pd.DataFrame(data, columns=output_columns)

def writeToFile(df_tokens, filename):
    """
    Save `df_tokens` as a UTF-8 encoded CSV at `filename`, omitting the
    DataFrame index column.
    """
    csv_options = {"index": False, "encoding": "utf-8"}
    df_tokens.to_csv(filename, **csv_options)

def Create_Embedding_With_Source(df, source, path):
    """
    Embed a DataFrame whose source is known up front and save the result.

    Missing descriptions are replaced with empty strings, the frame is passed
    to Scrapper_With_Source together with `source`, and the returned frame is
    written to `path` with writeToFile.

    Args:
        df: Input DataFrame with a "Description" column. It is not modified.
        source: Source label forwarded to Scrapper_With_Source.
        path: Destination path forwarded to writeToFile.
    """
    # assign() returns a new frame, so the caller's DataFrame is not
    # overwritten by the fillna step.
    cleaned = df.assign(Description=df["Description"].fillna(''))
    df_tokens = Scrapper_With_Source(cleaned, source)
    writeToFile(df_tokens, path)

def Create_Embedding_With_Out_Source(df, path):
    """
    Embed a DataFrame that carries its own source column and save the result.

    Missing descriptions are replaced with empty strings, the frame is passed
    to Scrapper_With_Out_Source, and the returned frame is written to `path`
    with writeToFile.

    Args:
        df: Input DataFrame with a "Description" column. It is not modified.
        path: Destination path forwarded to writeToFile.
    """
    # assign() returns a new frame, so the caller's DataFrame is not
    # overwritten by the fillna step.
    cleaned = df.assign(Description=df["Description"].fillna(''))
    df_tokens = Scrapper_With_Out_Source(cleaned)
    writeToFile(df_tokens, path)

# Run the three embedding jobs concurrently, one worker thread per dataset.
if __name__ == "__main__":
    # (label, worker function, positional arguments), in submission order.
    jobs = [
        ("files", Create_Embedding_With_Out_Source,
         (df3, "../DATA/Embedded_Files/files_tokens.csv")),
        ("medlineplus", Create_Embedding_With_Source,
         (df1, "medlinePlus.com", "../DATA/Embedded_Files/medlinePlus_tokens.csv")),
        ("testing", Create_Embedding_With_Source,
         (df2, "testing.com", "../DATA/Embedded_Files/testing_com_tokens.csv")),
    ]

    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        # Map each submitted future back to its label for reporting.
        futures = {
            executor.submit(worker, *args): label
            for label, worker, args in jobs
        }

        # Report each job as soon as it finishes, whatever the order.
        for future in concurrent.futures.as_completed(futures):
            task_name = futures[future]
            try:
                # result() re-raises any exception raised inside the worker.
                future.result()
                print(f"✅ Successfully processed {task_name}")
            except Exception as e:
                print(f"❌ Error processing {task_name}: {e}")


Processed 0/3449 rows (Source: testing.com)
Processed 0/2803 rows (Source: medlinePlus.com)
Processed 0/2906 rows (Dynamic Source)
Processed 100/3449 rows (Source: testing.com)
Processed 100/2803 rows (Source: medlinePlus.com)
Processed 100/2906 rows (Dynamic Source)
Processed 200/3449 rows (Source: testing.com)
Processed 200/2803 rows (Source: medlinePlus.com)
Processed 200/2906 rows (Dynamic Source)
Processed 300/3449 rows (Source: testing.com)
Processed 300/2803 rows (Source: medlinePlus.com)
Processed 300/2906 rows (Dynamic Source)
Processed 400/3449 rows (Source: testing.com)
Processed 400/2803 rows (Source: medlinePlus.com)
Processed 400/2906 rows (Dynamic Source)
Processed 500/3449 rows (Source: testing.com)
Processed 500/2803 rows (Source: medlinePlus.com)
Processed 500/2906 rows (Dynamic Source)
Processed 600/3449 rows (Source: testing.com)
Processed 600/2803 rows (Source: medlinePlus.com)
Processed 600/2906 rows (Dynamic Source)
Processed 700/3449 rows (Source: testing.com)
P