In [1]:
%load_ext dotenv
%dotenv

In [2]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
import pinecone
from sentence_transformers import SentenceTransformer

In [3]:
symptom_df = pd.read_csv("../data/preprocessed_symptom_dataset.csv", encoding = "cp1252")

In [4]:
# Create a unique identifier for each section combining course_id and section_id
symptom_df['unique_id'] = symptom_df['disease'].astype(str) + '-' + symptom_df['symptom'].astype(str)

In [5]:
# Create metadata for each section
symptom_df['metadata'] = symptom_df.apply(lambda row: {
    "disease": row['disease'],
    "symptom": row['symptom'],
    "description": row['description'],
    "weight": row['weight'],
    "precautions": row['precautions'] if isinstance(row['precautions'], list) else []
}, axis=1)


In [6]:
weights = {
    'disease': 2.0,
    'symptom': 1.5,
    'description': 1.0,
    'precautions': 1.0
}

In [7]:
# Initialize the model
model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [8]:
def create_embeddings(row, model, weights):
    """
    Create a combined embedding for a metadata row.
    
    Parameters:
    - row: A row of the dataframe.
    - model: The embedding model used for encoding.
    - weights: A dictionary of weights for individual fields.
    
    Returns:
    - A combined embedding vector.
    """
    # Extract weights
    weight_disease = weights.get('disease', 2.0)
    weight_symptom = weights.get('symptom', 1.5)
    weight_description = weights.get('description', 1.0)
    weight_precautions = weights.get('precautions', 1.0)

    # Encode individual components
    emb_disease = model.encode(row['disease'], show_progress_bar=False) * weight_disease
    emb_symptom = model.encode(row['symptom'], show_progress_bar=False) * weight_symptom * row['weight']  # Incorporate symptom weight
    emb_description = model.encode(row['description'], show_progress_bar=False) * weight_description
    print(f"Row precautions: {row['precautions']}")

    # Handle precautions as a list
    if isinstance(row['precautions'], list) and len(row['precautions']) > 0:
        emb_precautions = sum(model.encode(precaution, show_progress_bar=False) for precaution in row['precautions']) * weight_precautions
    else:
        # If precautions is missing or an empty list
        emb_precautions = model.encode('No Precautions Available', show_progress_bar=False) * weight_precautions

    # Combine embeddings by averaging them
    combined_embedding = (emb_disease + emb_symptom + emb_description + emb_precautions) / \
                         (weight_disease + weight_symptom* row['weight'] + weight_description + weight_precautions)
    
    return combined_embedding


In [9]:
# Apply the method
symptom_df['embedding'] = symptom_df.apply(lambda row: create_embeddings(row, model, weights), axis=1)

Row precautions: ['lie down', 'avoid sudden change in body', 'avoid abrupt head movment', 'relax']
Row precautions: ['lie down', 'avoid sudden change in body', 'avoid abrupt head movment', 'relax']
Row precautions: ['lie down', 'avoid sudden change in body', 'avoid abrupt head movment', 'relax']
Row precautions: ['lie down', 'avoid sudden change in body', 'avoid abrupt head movment', 'relax']
Row precautions: ['lie down', 'avoid sudden change in body', 'avoid abrupt head movment', 'relax']
Row precautions: ['lie down', 'avoid sudden change in body', 'avoid abrupt head movment', 'relax']
Row precautions: ['bath twice', 'avoid fatty spicy food', 'drink plenty of water', 'avoid too many products']
Row precautions: ['bath twice', 'avoid fatty spicy food', 'drink plenty of water', 'avoid too many products']
Row precautions: ['bath twice', 'avoid fatty spicy food', 'drink plenty of water', 'avoid too many products']
Row precautions: ['bath twice', 'avoid fatty spicy food', 'drink plenty of w

In [10]:
# Save to file
symptom_df.to_pickle('../embeddings/symptom_embeddings.pkl')  # Save as pickle for easy reloading