In [24]:
import os
import fitz
import pandas as pd
import re

In [25]:
df = pd.read_csv("../DATA/Scraped_Data/medical_tests_interpretation.csv")
df2 = pd.read_csv("../DATA/Scraped_Data/testing_scraped_content.csv")

In [26]:
CHUNK_SIZE = 325

In [30]:

def chunk_text(text, chunk_size=CHUNK_SIZE):
    words = re.findall(r'\b\w+\b', text)
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

def process_csv(df, source):
    
    # Initialize an empty list to store the new rows
    df['Description'] = df['Description'].fillna('')
    new_rows = []
    
    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        test_name = row['Test Name']
        description = row['Description']
        url = row['URL']
        
        # Chunk the description into 128-word chunks
        chunks = chunk_text(description, chunk_size=128)
        
        # Create a new row for each chunk
        for chunk in chunks:
            new_row = {
                'Test Name': test_name,
                'Description': chunk,
                'Source': source,
                'URL': url
            }
            new_rows.append(new_row)
    
    # Create a new DataFrame from the list of new rows
    new_df = pd.DataFrame(new_rows)
    return new_df


In [31]:
med_df = process_csv(df,source=  "medlineplus")

In [32]:
# Write the new DataFrame to a CSV file
med_df.to_csv(f"medlineplus_chunks_{CHUNK_SIZE}.csv", index=False)

In [33]:
test_df = process_csv(df2, source="testing.com")

In [34]:
test_df.to_csv(f"testing_chunks_{CHUNK_SIZE}.csv", index=False)

# EXTRACT DATA FROM PDF

In [35]:

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join([page.get_text("text") for page in doc])

# Function to split text into chunks
def create_text_chunks(text, chunk_size=CHUNK_SIZE):
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

# Function to extract headings and map content to them
def extract_headings_and_text(text):
    lines = text.split("\n")
    extracted_data = []
    current_heading = None
    buffer = []
    
    for line in lines:
        if line.isupper() and len(line) > 3:  # Heuristic for headings
            if buffer:
                extracted_data.append((current_heading, " ".join(buffer)))
                buffer = []
            current_heading = line.strip()
        else:
            buffer.append(line.strip())
    
    if buffer:
        extracted_data.append((current_heading, " ".join(buffer)))
    
    return extracted_data

# Function to create a structured DataFrame
def create_dataframe_from_text_files(txt_folder):
    data_list = []
    txt_files = sorted(os.listdir(txt_folder), key=lambda x: int(x.split(".")[0]))
    
    for txt_file in txt_files:
        txt_path = os.path.join(txt_folder, txt_file)
        with open(txt_path, "r", encoding="utf-8") as f:
            text_data = f.read()
        
        extracted_data = extract_headings_and_text(text_data)
        text_chunks = create_text_chunks(text_data)
        
        for chunk in text_chunks:
            test_name, source = "No_testName", "No_Source"
            
            for heading, content in extracted_data:
                if chunk in content:
                    test_name = heading
                    source = heading
                    break
            
            data_list.append({
                "Test Name": test_name,
                "Description": chunk,
                "Source": source,
                "URL": "No_URl"
            })
    
    return pd.DataFrame(data_list)

# Function to save DataFrame to a CSV file
def save_dataframe(df, output_path):
    df.to_csv(output_path, index=False, encoding="utf-8")




In [36]:
# Main execution
txt_folder = "../DATA/TextFiles"
df = create_dataframe_from_text_files(txt_folder)
save_dataframe(df, f"books_{CHUNK_SIZE}.csv")

print("Data processing complete. CSV file saved.")

Data processing complete. CSV file saved.


# Creating Embeddings

In [38]:
import pandas as pd
import concurrent.futures
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [39]:
embedding_model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [40]:


# Load CSV files
df1 = pd.read_csv(f"../DATA/Chunked_data/medlineplus_chunks_{CHUNK_SIZE}.csv")
df2 = pd.read_csv(f"../DATA/Chunked_data/testing_chunks_{CHUNK_SIZE}.csv")
df3 = pd.read_csv(f"../DATA/Chunked_data/books_{CHUNK_SIZE}.csv")

# Load the embedding model once (reduces overhead)


def embeddingCreator(text):
    """
    Generates embeddings for a given text using SentenceTransformer.
    """
    return embedding_model.encode(text).tolist()

def Scrapper_With_Source(df, source):
    """
    Extracts embeddings and metadata from a DataFrame when a fixed source is given.
    """
    data = []
    for i, row in df.iterrows():
        text = row["Description"]
        test_name = row["Test Name"]
        url = row["URL"]

        tokens = embeddingCreator(text)
        data.append({"Test Name": test_name, "tokens": tokens, "text": text, "Source": source, "URL": url})
        
        if i % 100 == 0:  # Print progress every 100 rows
            print(f"Processed {i}/{len(df)} rows (Source: {source})")

    return pd.DataFrame(data)

def Scrapper_With_Out_Source(df):
    """
    Extracts embeddings and metadata from a DataFrame when the source column exists.
    """
    data = []
    for i, row in df.iterrows():
        text = row["Description"]
        test_name = row["Test Name"]
        url = row["URL"]
        source = row["Source"]

        tokens = embeddingCreator(text)
        data.append({"Test Name": test_name, "tokens": tokens, "text": text, "Source": source, "URL": url})

        if i % 100 == 0:  # Print progress every 100 rows
            print(f"Processed {i}/{len(df)} rows (Dynamic Source)")

    return pd.DataFrame(data)

def writeToFile(df_tokens, filename):
    """
    Writes the DataFrame to a CSV file.
    """
    df_tokens.to_csv(filename, index=False, encoding="utf-8")

def Create_Embedding_With_Source(df, source, path):
    """
    Wrapper function to process a DataFrame and store embeddings when source is known.
    """
    df["Description"] = df["Description"].fillna('')
    df_tokens = Scrapper_With_Source(df, source)
    writeToFile(df_tokens, path)

def Create_Embedding_With_Out_Source(df, path):
    """
    Wrapper function to process a DataFrame and store embeddings when source is in CSV.
    """
    df["Description"] = df["Description"].fillna('')
    df_tokens = Scrapper_With_Out_Source(df)
    writeToFile(df_tokens, path)




In [41]:
# Parallel execution of embedding creation
if __name__ == "__main__":
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        futures = {
            executor.submit(Create_Embedding_With_Out_Source, df3, f"../DATA/Embedded_Files/files_tokens_{CHUNK_SIZE}.csv"): "files",
            executor.submit(Create_Embedding_With_Source, df1, f"medlinePlus.com", "../DATA/Embedded_Files/medlinePlus_tokens_{CHUNK_SIZE}.csv"): "medlineplus",
            executor.submit(Create_Embedding_With_Source, df2, f"testing.com", "../DATA/Embedded_Files/testing_com_tokens_{CHUNK_SIZE}.csv"): "testing",
        }
        
        for future in concurrent.futures.as_completed(futures):
            task_name = futures[future]
            try:
                future.result()  # This will raise exceptions if any occur
                print(f"✅ Successfully processed {task_name}")
            except Exception as e:
                print(f"❌ Error processing {task_name}: {e}")

Processed 0/3449 rows (Source: testing.com)
Processed 0/2803 rows (Source: medlinePlus.com)
Processed 0/1148 rows (Dynamic Source)
Processed 100/2803 rows (Source: medlinePlus.com)
Processed 100/3449 rows (Source: testing.com)
Processed 200/2803 rows (Source: medlinePlus.com)
Processed 100/1148 rows (Dynamic Source)
Processed 200/3449 rows (Source: testing.com)
Processed 300/2803 rows (Source: medlinePlus.com)
Processed 300/3449 rows (Source: testing.com)
Processed 400/2803 rows (Source: medlinePlus.com)
Processed 200/1148 rows (Dynamic Source)
Processed 400/3449 rows (Source: testing.com)
Processed 500/2803 rows (Source: medlinePlus.com)
Processed 500/3449 rows (Source: testing.com)
Processed 600/2803 rows (Source: medlinePlus.com)
Processed 300/1148 rows (Dynamic Source)
Processed 600/3449 rows (Source: testing.com)
Processed 700/2803 rows (Source: medlinePlus.com)
Processed 700/3449 rows (Source: testing.com)
Processed 800/2803 rows (Source: medlinePlus.com)
Processed 400/1148 rows 