In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('/content/wearable_features (1) (1).csv')

Adding a columns for heath_category and health_recommendations

In [3]:
# Step 1: Define health score categories
def categorize_health(score):
    if score < 40:
        return "High Risk"
    elif 40 <= score < 70:
        return "Moderate Risk"
    else:
        return "Healthy"

# Apply the health score category
df["Health_Category"] = df["health_score"].apply(categorize_health)

# Step 2: Define recommendations based on health conditions
def generate_recommendation(row):
    recommendations = []

    # Blood Oxygen & Heart Rate
    if row["blood_oxygen_level"] < 90:
        recommendations.append("Your blood oxygen levels are low. Consider breathing exercises and consulting a doctor.")
    if row["heart_rate"] > 100:
        recommendations.append("Your heart rate is elevated. Reduce stress and monitor your cardiovascular health.")

    # Sleep
    if row["sleep_duration"] < 6:
        recommendations.append("Improve sleep hygiene. Aim for at least 7 hours of sleep to support recovery and metabolism.")

    # Stress Levels
    if row["stress_level_High"]:
        recommendations.append("Your stress levels are high. Engage in relaxation techniques such as meditation or deep breathing exercises.")

    # Exercise
    if row["exercise_duration"] < 30:
        recommendations.append("Increase physical activity. Aim for at least 30 minutes of moderate exercise daily for cardiovascular and metabolic health.")
    elif row["exercise_duration"] > 90:
        recommendations.append("Avoid excessive exercise as it may lead to overtraining and increased cortisol levels.")

    # Lifestyle (Smoking & Alcohol)
    if row["smoker"] == "Yes":
        recommendations.append("Consider quitting smoking to improve lung and cardiovascular health.")
    if row["alcohol_consumption"] == "High":
        recommendations.append("Reduce alcohol consumption for better metabolic and liver health.")

    return " ".join(recommendations) if recommendations else "Maintain a balanced lifestyle and continue healthy habits."

# Apply recommendations
df["Health_Recommendation"] = df.apply(generate_recommendation, axis=1)

# Display a sample of the transformed data
df[["health_score", "Health_Category", "blood_oxygen_level", "sleep_duration", "exercise_duration", "stress_level_High", "Health_Recommendation"]].head()

Unnamed: 0,health_score,Health_Category,blood_oxygen_level,sleep_duration,exercise_duration,stress_level_High,Health_Recommendation
0,25.992071,High Risk,90.658505,6.518206,1.219675,False,Your heart rate is elevated. Reduce stress and...
1,64.30845,Moderate Risk,97.13336,7.522446,1.378399,True,Your heart rate is elevated. Reduce stress and...
2,78.003546,Healthy,99.555594,8.583386,0.803857,False,Your heart rate is elevated. Reduce stress and...
3,16.454106,High Risk,91.4449,4.425767,0.0,False,Your heart rate is elevated. Reduce stress and...
4,42.528202,Moderate Risk,96.092103,6.008918,0.807389,True,Your stress levels are high. Engage in relaxat...


saving as json with relevant columns

In [4]:
import json

# Remove user_id from structured dataset for RAG
optimized_data = df[[
    "Health_Category", "health_score", "blood_oxygen_level",
    "sleep_duration", "exercise_duration", "stress_level_High", "Health_Recommendation"
]]

# Save optimized dataset for retrieval (without user_id)
optimized_csv_file = "/content/structured_health_data.csv"
optimized_data.to_csv(optimized_csv_file, index=False)

optimized_json_file = "/content/structured_health_data.json"
optimized_data_json = optimized_data.to_dict(orient="records")

with open(optimized_json_file, "w") as file:
    json.dump(optimized_data_json, file, indent=4)

print(f"Optimized dataset saved : {optimized_csv_file}, {optimized_json_file}")


Optimized dataset saved (without user_id): /content/structured_health_data.csv, /content/structured_health_data.json


Build Vector for embedding

In [5]:
!pip install pinecone


Collecting pinecone
  Downloading pinecone-6.0.2-py3-none-any.whl.metadata (9.0 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone-6.0.2-py3-none-any.whl (421 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/421.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/421.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m419.8/421.9 kB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.9/421.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone
Successfully installed pinecone-6.0.2 pinecone-plugin-interface-0.0.7


In [6]:
import os
from pinecone import Pinecone, ServerlessSpec

# Set Pinecone API key and environment
PINECONE_API_KEY = "Your Key"
PINECONE_ENV = "Your env"

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Define an index name
index_name = "health-recommendations"

# Create the index if it doesn’t already exist
if index_name not in [index.name for index in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region=PINECONE_ENV)
    )

print(f"Pinecone index '{index_name}' is ready.")


Pinecone index 'health-recommendations' is ready.


In [7]:
!pip install openai




In [8]:
import openai

# Set your OpenAI API key
OPENAI_API_KEY = "Your Key"  # Replace with your actual API key
openai.api_key = OPENAI_API_KEY


In [9]:
from pinecone import Pinecone

# Reconnect to Pinecone
PINECONE_API_KEY = "Your Key"
pc = Pinecone(api_key=PINECONE_API_KEY)

# Define index name
index_name = "health-recommendations"

# Connect to the existing index
index = pc.Index(index_name)

# Check the total number of records in the Pinecone index
index_stats = index.describe_index_stats()
print("Pinecone Index Stats:", index_stats)


Pinecone Index Stats: {'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 10365}},
 'total_vector_count': 10365,
 'vector_type': 'dense'}


In [10]:
# Reconnect to Pinecone index
index_name = "health-recommendations"
index = pc.Index(index_name)

# Check the total number of records in Pinecone
index_stats = index.describe_index_stats()
print("Pinecone Index Stats:", index_stats)


Pinecone Index Stats: {'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 10365}},
 'total_vector_count': 10365,
 'vector_type': 'dense'}


In [11]:
import os

wrong_file = "/content/structured_health_data_with_embeddings.json"

if os.path.exists(wrong_file):
    os.remove(wrong_file)
    print(" Deleted incorrect embeddings file.")
else:
    print(" No incorrect embeddings file found. Ready to generate new embeddings.")


 No incorrect embeddings file found. Ready to generate new embeddings.


In [12]:
import json
import openai
import time

# Load structured health data (without embeddings)
original_json_file = "/content/structured_health_data.json"
updated_json_file = "/content/structured_health_data_with_embeddings.json"

with open(original_json_file, "r") as file:
    health_data = json.load(file)

# Function to generate embeddings using the correct 384-dimension model
def get_embedding(text):
    response = openai.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )
    return response.data[0].embedding

# Generate embeddings in batches
BATCH_SIZE = 50
for idx in range(0, len(health_data), BATCH_SIZE):
    batch = health_data[idx:idx + BATCH_SIZE]
    texts = [entry["Health_Recommendation"] for entry in batch]

    try:
        embeddings = openai.embeddings.create(
            input=texts,
            model="text-embedding-3-small"
        ).data

        for i, entry in enumerate(batch):
            entry["embedding"] = embeddings[i].embedding

        # Save after every batch
        with open(updated_json_file, "w") as file:
            json.dump(health_data, file, indent=4)

        print(f"✅ Processed batch {idx // BATCH_SIZE + 1}/{len(health_data) // BATCH_SIZE + 1}")

    except Exception as e:
        print(f"⚠️ Error on batch {idx // BATCH_SIZE + 1}: {e}")

# Final save
with open(updated_json_file, "w") as file:
    json.dump(health_data, file, indent=4)

print(f"✅ Embedding regeneration complete and saved to {updated_json_file}!")


✅ Processed batch 1/201
✅ Processed batch 2/201
✅ Processed batch 3/201
✅ Processed batch 4/201
✅ Processed batch 5/201
✅ Processed batch 6/201
✅ Processed batch 7/201
✅ Processed batch 8/201
✅ Processed batch 9/201
✅ Processed batch 10/201
✅ Processed batch 11/201
✅ Processed batch 12/201
✅ Processed batch 13/201
✅ Processed batch 14/201
✅ Processed batch 15/201
✅ Processed batch 16/201
✅ Processed batch 17/201
✅ Processed batch 18/201
✅ Processed batch 19/201
✅ Processed batch 20/201
✅ Processed batch 21/201
✅ Processed batch 22/201
✅ Processed batch 23/201
✅ Processed batch 24/201
✅ Processed batch 25/201
✅ Processed batch 26/201
✅ Processed batch 27/201
✅ Processed batch 28/201
✅ Processed batch 29/201
✅ Processed batch 30/201
✅ Processed batch 31/201
✅ Processed batch 32/201
✅ Processed batch 33/201
✅ Processed batch 34/201
✅ Processed batch 35/201
✅ Processed batch 36/201
✅ Processed batch 37/201
✅ Processed batch 38/201
✅ Processed batch 39/201
✅ Processed batch 40/201
✅ Process

In [13]:
import numpy as np

# Reload the updated file
with open(updated_json_file, "r") as file:
    health_data = json.load(file)

# Check the dimension of one embedding
print("New Embedding dimension:", np.array(health_data[0]["embedding"]).shape)


New Embedding dimension: (1536,)


In [14]:
import openai

# Check the model version
response = openai.embeddings.create(
    input="Test sentence for embedding generation.",
    model="text-embedding-3-small"
)

print("Embedding dimension:", len(response.data[0].embedding))


Embedding dimension: 1536


In [15]:
# Delete the incorrect index
pc.delete_index("health-recommendations")
print(" Deleted old 384-dimensional index.")


❌ Deleted old 384-dimensional index.


In [16]:
from pinecone import ServerlessSpec

# Create a new Pinecone index with 1536 dimensions
pc.create_index(
    name="health-recommendations",
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

print("✅ New Pinecone index created with 1,536 dimensions.")


✅ New Pinecone index created with 1,536 dimensions.


In [17]:
import time

# Reconnect to Pinecone index
index = pc.Index("health-recommendations")

# Load embeddings
updated_json_file = "/content/structured_health_data_with_embeddings.json"
with open(updated_json_file, "r") as file:
    health_data = json.load(file)

# Prepare data for insertion
pinecone_vectors = []
for i, entry in enumerate(health_data):
    vector = entry["embedding"]
    metadata = {
        "Health_Category": entry["Health_Category"],
        "Health_Recommendation": entry["Health_Recommendation"],
        "health_score": entry["health_score"],
        "blood_oxygen_level": entry["blood_oxygen_level"],
        "sleep_duration": entry["sleep_duration"],
        "exercise_duration": entry["exercise_duration"],
        "stress_level_High": entry["stress_level_High"]
    }
    pinecone_vectors.append((str(i), vector, metadata))

# Insert embeddings into Pinecone in batches
BATCH_SIZE = 100

for i in range(0, len(pinecone_vectors), BATCH_SIZE):
    batch = pinecone_vectors[i : i + BATCH_SIZE]
    index.upsert(vectors=batch)
    print(f"✅ Inserted batch {i // BATCH_SIZE + 1}/{len(pinecone_vectors) // BATCH_SIZE + 1}")
    time.sleep(1)

# Verify that data is stored in Pinecone
index_stats = index.describe_index_stats()
print("✅ Pinecone Index Stats After Upsert:", index_stats)


✅ Inserted batch 1/101
✅ Inserted batch 2/101
✅ Inserted batch 3/101
✅ Inserted batch 4/101
✅ Inserted batch 5/101
✅ Inserted batch 6/101
✅ Inserted batch 7/101
✅ Inserted batch 8/101
✅ Inserted batch 9/101
✅ Inserted batch 10/101
✅ Inserted batch 11/101
✅ Inserted batch 12/101
✅ Inserted batch 13/101
✅ Inserted batch 14/101
✅ Inserted batch 15/101
✅ Inserted batch 16/101
✅ Inserted batch 17/101
✅ Inserted batch 18/101
✅ Inserted batch 19/101
✅ Inserted batch 20/101
✅ Inserted batch 21/101
✅ Inserted batch 22/101
✅ Inserted batch 23/101
✅ Inserted batch 24/101
✅ Inserted batch 25/101
✅ Inserted batch 26/101
✅ Inserted batch 27/101
✅ Inserted batch 28/101
✅ Inserted batch 29/101
✅ Inserted batch 30/101
✅ Inserted batch 31/101
✅ Inserted batch 32/101
✅ Inserted batch 33/101
✅ Inserted batch 34/101
✅ Inserted batch 35/101
✅ Inserted batch 36/101
✅ Inserted batch 37/101
✅ Inserted batch 38/101
✅ Inserted batch 39/101
✅ Inserted batch 40/101
✅ Inserted batch 41/101
✅ Inserted batch 42/101
✅

In [18]:
index_stats = index.describe_index_stats()
print("✅ Pinecone Index Stats:", index_stats)


✅ Pinecone Index Stats: {'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000,
 'vector_type': 'dense'}


RAG Example

In [12]:
import openai

# Function to generate embedding for a user query
def get_query_embedding(query):
    response = openai.embeddings.create(
        input=query,
        model="text-embedding-3-large"
    )
    return response.data[0].embedding

# Function to search Pinecone for the best health recommendation
def query_pinecone(user_query, top_k=3):
    index = pc.Index("health-recommendations")

    # Generate embedding for the query
    query_embedding = get_query_embedding(user_query)

    # Search Pinecone for similar vectors
    search_results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )

    # Display the best matching health recommendations
    results = []
    for match in search_results["matches"]:
        results.append({
            "score": match["score"],
            "Health_Recommendation": match["metadata"]["Health_Recommendation"]
        })

    return results



In [13]:
import openai

# Function to generate embedding for a user query
def get_query_embedding(query):
    response = openai.embeddings.create(
        input=query,
        model="text-embedding-3-small"
    )
    return response.data[0].embedding


In [14]:
import openai

# Function to search Pinecone for the best health recommendation
def query_pinecone(user_query, top_k=3):
    index = pc.Index("health-recommendations")

    # Generate embedding for the query
    query_embedding = get_query_embedding(user_query)

    # Search Pinecone for similar vectors
    search_results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )

    # Display the best matching health recommendations
    results = []
    for match in search_results["matches"]:
        results.append({
            "score": match["score"],
            "Health_Recommendation": match["metadata"]["Health_Recommendation"]
        })

    return results


In [15]:
# Example query
user_query = "I have high blood pressure and trouble sleeping."

# Get recommendations
recommendations = query_pinecone(user_query)

# Display results
for i, rec in enumerate(recommendations):
    print(f"🔹 Recommendation {i+1} (Score: {rec['score']}):")
    print(rec["Health_Recommendation"])
    print("—" * 50)


🔹 Recommendation 1 (Score: 0.417925626):
Your heart rate is elevated. Reduce stress and monitor your cardiovascular health. Improve sleep hygiene. Aim for at least 7 hours of sleep to support recovery and metabolism. Your stress levels are high. Engage in relaxation techniques such as meditation or deep breathing exercises. Increase physical activity. Aim for at least 30 minutes of moderate exercise daily for cardiovascular and metabolic health.
——————————————————————————————————————————————————
🔹 Recommendation 2 (Score: 0.417925626):
Your heart rate is elevated. Reduce stress and monitor your cardiovascular health. Improve sleep hygiene. Aim for at least 7 hours of sleep to support recovery and metabolism. Your stress levels are high. Engage in relaxation techniques such as meditation or deep breathing exercises. Increase physical activity. Aim for at least 30 minutes of moderate exercise daily for cardiovascular and metabolic health.
—————————————————————————————————————————————————

Now that our vector is done, we are going to test  LLMs below.

In [16]:
import openai
import os

# Set OpenAI API key
OPENAI_API_KEY = "Your Key"  # 🔹 Replace with your OpenAI API Key

# Function to get GPT-4 response
def query_gpt4(user_query):
    client = openai.OpenAI(api_key=OPENAI_API_KEY)

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a health expert providing concise, evidence-based recommendations. "
                "Respond in a single structured paragraph, summarizing key advice without a numbered list."
                "Ensure your recommendations are personalized and have actionable insights. Make sure you give examples for any life style changes and mention the type of medical help you need"},
            {"role": "user", "content": user_query}
        ]
    )
    return response.choices[0].message.content

# Example query
user_query = "I have high blood pressure and trouble sleeping. What should I do?"
gpt4_response = query_gpt4(user_query)

# Display GPT-4 response
print("🔹 GPT-4 Response:")
print(gpt4_response)


🔹 GPT-4 Response:
Addressing high blood pressure and trouble sleeping requires a multi-faceted approach encompassing both lifestyle modifications and possible medical interventions. On the lifestyle front, regular physical activity, a balanced diet low in sodium and high in fresh fruits and vegetables, cessation of smoking plus limiting alcohol and caffeine intake can contribute to lowering your blood pressure and improving your sleep. Take, for example, walking for at least 30 minutes a day and incorporating foods rich in potassium like bananas and oranges. It's also crucial to manage stress through mindfulness techniques such as yoga or meditation. On the medical side, you need to have regular check-ups with your primary care physician who can closely monitor your blood pressure and potentially prescribe appropriate medication. Consulting a sleep specialist may also be beneficial if your sleep doesn't improve with these changes. These integrative strategies can help in effectively ma

with health data

In [32]:
import openai



# Function to get a personalized GPT-4 response
def query_gpt4(user_query):
    client = openai.OpenAI(api_key=OPENAI_API_KEY)  # ✅ Explicitly pass API key

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content":
                "You are a highly knowledgeable health expert providing detailed, personalized recommendations based on user-specific health insights. "
                "When giving advice, always provide specific examples where possible. If recommending dietary changes, mention specific foods to eat or avoid. "
                "If suggesting medical consultation, specify the type of doctor they should visit (e.g., cardiologist for heart concerns, endocrinologist for diabetes). "
                "Respond in a structured paragraph format, keeping it concise but informative."
            },
            {"role": "user", "content": user_query}
        ]
    )
    return response.choices[0].message.content

# Example query with specific health insights
user_query = (
    "I am a 45-year-old male with high blood pressure (140/90), borderline high cholesterol, and trouble sleeping. "
    "I eat a lot of processed foods and have a sedentary lifestyle. What should I do to improve my health?"
)

gpt4_response = query_gpt4(user_query)

# Display GPT-4 response
print("🔹 GPT-4 Response:")
print(gpt4_response)


🔹 GPT-4 Response:
The first steps to improve your health would be to address your diet, physical activity, and sleep hygiene.

1. Diet: Aim to exchange processed foods for fresh meals made from whole ingredients. For instance, for breakfast, try oatmeal with berries instead of a pre-packaged cereal bar. For lunch and dinner, you might select lean proteins such as chicken or fish, with plenty of vegetables and whole grains like brown rice or quinoa. Reduce sodium intake as it elevates blood pressure - avoid processed foods like canned soup, frozen dinners, and fast food which are typically high in sodium. Include foods that lower cholesterol such as fruits, vegetables, whole grains, and lean proteins. Fruits like berries, citrus, and apples are high in soluble fiber that can decrease cholesterol levels.

2. Physical Activity: Incorporate more physical activity into your daily routine to manage your high blood pressure and cholesterol. This can be as simple as taking a brisk walk for 30 

lets try using gpt 4 with specalied health data from our dataset

In [17]:
import openai



# Function to format user health data into a natural prompt
def format_health_data(user_data):
    prompt = (
        f"The user has the following health stats:\n"
        f"- Steps taken today: {user_data['steps']}\n"
        f"- Calories burned: {user_data['calories_burned']} kcal\n"
        f"- Distance covered: {user_data['distance_covered']} km\n"
        f"- Exercise duration: {user_data['exercise_duration']} hours\n"
        f"- Exercise intensity: {user_data['exercise_intensity']}\n"
        f"- Ambient temperature: {user_data['ambient_temperature']}°C\n"
        f"- UV exposure: {user_data['uv_exposure']} level\n"
        f"- Screen time: {user_data['screen_time']} hours\n"
        f"- Age: {user_data['age']} years\n"
        f"- Gender: {user_data['gender']}\n"
        f"- Weight: {user_data['weight']} kg\n"
        f"- Height: {user_data['height']} cm\n"
        f"- Medical condition: {user_data['medical_conditions']}\n"
        f"- Smoker: {user_data['smoker']}\n"
        f"- Alcohol consumption: {user_data['alcohol_consumption']}\n"
        f"- Sleep duration: {user_data['sleep_duration']} hours\n"
        f"- Deep sleep: {user_data['deep_sleep_duration']} hours\n"
        f"- REM sleep: {user_data['rem_sleep_duration']} hours\n"
        f"- Wake-ups: {user_data['wakeups']} times\n"
        f"- Snoring: {user_data['snoring']}\n"
        f"- Heart rate: {user_data['heart_rate']} bpm\n"
        f"- Blood oxygen level: {user_data['blood_oxygen_level']}%\n"
        f"- Stress level: {user_data['stress_level_High']}\n"
        f"- Mood: {'Happy' if user_data['mood_Happy'] else 'Neutral/Sad'}\n"
        f"- Health Score: {user_data['health_score']} (Higher is better)\n\n"
        f"Based on this health data, provide a personalized recommendation including specific diet, exercise, sleep, and lifestyle adjustments. "
        f"Give examples of foods, activities, and if needed, the type of specialist to consult."
    )
    return prompt

# Example user data
user_data = {
    "steps": 9146,
    "calories_burned": 457.3,
    "distance_covered": 7.3168,
    "exercise_duration": 1.22,
    "exercise_intensity": "Low",
    "ambient_temperature": 27.27,
    "uv_exposure": 4.06,
    "screen_time": 1.85,
    "age": 61,
    "gender": "Other",
    "weight": 88.67,
    "height": 178.52,
    "medical_conditions": "Diabetes",
    "smoker": "No",
    "alcohol_consumption": "Moderate",
    "sleep_duration": 6.51,
    "deep_sleep_duration": 2.78,
    "rem_sleep_duration": 3.73,
    "wakeups": 4,
    "snoring": "Yes",
    "heart_rate": 147,
    "blood_oxygen_level": 90.65,
    "stress_level_High": False,
    "mood_Happy": True,
    "health_score": 77.59
}

# Generate prompt
personalized_prompt = format_health_data(user_data)

# Display the formatted prompt
print("🔹 Generated Prompt for GPT-4:")
print(personalized_prompt)


🔹 Generated Prompt for GPT-4:
The user has the following health stats:
- Steps taken today: 9146
- Calories burned: 457.3 kcal
- Distance covered: 7.3168 km
- Exercise duration: 1.22 hours
- Exercise intensity: Low
- Ambient temperature: 27.27°C
- UV exposure: 4.06 level
- Screen time: 1.85 hours
- Age: 61 years
- Gender: Other
- Weight: 88.67 kg
- Height: 178.52 cm
- Medical condition: Diabetes
- Smoker: No
- Alcohol consumption: Moderate
- Sleep duration: 6.51 hours
- Deep sleep: 2.78 hours
- REM sleep: 3.73 hours
- Wake-ups: 4 times
- Snoring: Yes
- Heart rate: 147 bpm
- Blood oxygen level: 90.65%
- Stress level: False
- Mood: Happy
- Health Score: 77.59 (Higher is better)

Based on this health data, provide a personalized recommendation including specific diet, exercise, sleep, and lifestyle adjustments. Give examples of foods, activities, and if needed, the type of specialist to consult.


In [18]:

def query_gpt4(personalized_prompt):
    client = openai.OpenAI(api_key=OPENAI_API_KEY)

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content":
                "You are a highly knowledgeable health expert providing detailed, personalized recommendations. "
                "Use the provided health data to offer specific, actionable advice. Include food recommendations and qunataties where relevant, "
                "exercise suggestions, what sort of tests, lab results and biomarkers should they be keeping a close eye on and why, when to seek emergency medical help, and if necessary, which type of doctor to consult."
            },
            {"role": "user", "content": personalized_prompt}
        ]
    )
    return response.choices[0].message.content


gpt4_response = query_gpt4(personalized_prompt)

# Display GPT-4 response
print("🔹 GPT-4 Personalized Health Recommendation:")
print(gpt4_response)


🔹 GPT-4 Personalized Health Recommendation:
Based on your health data, here are some personalized recommendations:

Diet:
As you are diabetic, it is integral that you follow a balanced diet while managing your blood sugar. It's beneficial to include lots of fiber-rich foods with a low glycemic index. 

- Include plenty of non-starchy vegetables like broccoli, spinach, and peppers. Aim to have at least five portions of fruit and vegetables a day.
- Wholegrains like brown rice, whole oats, and quinoa are excellent choices. Try to have around 180g per day.
- Incorporate lean proteins like chicken, fish, eggs, or plant-based alternatives like lentils and chickpeas. Aim for around 45-55g per day.
- Healthy monounsaturated and polyunsaturated fats are essential, found in avocados, nuts, seeds, and oily fish. Keep fat intake to about 70g per day.

Avoid processed foods and drinks high in sugar, and be conscious of your alcohol consumption as it can affect your blood sugar levels.

Exercise:
T

moving to medical LLM

In [19]:
!pip install transformers torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load BioMedLM Model & Tokenizer
model_name = "stanford-crfm/BioMedLM"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print(" BioMedLM Model Loaded Successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ BioMedLM Model Loaded Successfully!


In [3]:
def query_biomedlm(user_query, max_length=250, temperature=0.7, top_k=50, top_p=0.9):
    """
    Generates a response from BioMedLM based on the given medical query.
    """
    formatted_prompt = f"Patient Case: {user_query} \nMedical Recommendation: "

    # Tokenize the input
    input_ids = tokenizer.encode(formatted_prompt, return_tensors="pt").to(device)

    # Generate response with controlled randomness
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=True,  # Enables sampling for more natural responses
        repetition_penalty=1.2  # Reduces input repetition
    )

    # Decode response
    response_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return response_text

# Example Query
user_query = "I have high blood pressure and trouble sleeping. What should I do?"
biomedlm_response = query_biomedlm(user_query)

# Display Response
print("🔹 BioMedLM Response:")
print(biomedlm_response)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


🔹 BioMedLM Response:
Patient Case: I have high blood pressure and trouble sleeping. What should I do? 
Medical Recommendation:   Number of patients with hypertension \[Yes/No

Notwithstanding, the role of the N-terminal region in the context of the clinical and diagnostic management {#s1}


In [4]:
def query_biomedlm(user_query, max_length=250, temperature=0.7, top_k=50, top_p=0.9):
    """
    Generates a response from BioMedLM based on the given medical query.
    """
    # ✅ Improved structured prompt
    formatted_prompt = (
        f"Patient Information: {user_query}\n"
        "Provide a clear and medically sound recommendation with specific actions."
    )

    # Tokenize the input with proper padding
    inputs = tokenizer(
        formatted_prompt, return_tensors="pt", padding=True, truncation=True
    ).to(device)

    # Generate response with improved decoding strategy
    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # ✅ Ensures proper token selection
        max_length=max_length,
        num_return_sequences=1,
        temperature=temperature,  # ✅ Controls randomness
        top_k=top_k,  # ✅ Limits token selection to top-k choices
        top_p=top_p,  # ✅ Nucleus sampling for natural responses
        do_sample=True,  # ✅ Enables sampling
        repetition_penalty=1.2,  # ✅ Reduces repetition
        pad_token_id=tokenizer.eos_token_id,  # ✅ Prevents truncation issues
    )

    # Decode response
    response_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return response_text


In [6]:
def query_biomedlm(user_query, max_length=250, temperature=0.7, top_k=50, top_p=0.9):
    """
    Generates a response from BioMedLM based on the given medical query.
    """

    # Set pad_token if missing
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Assign EOS token as padding

    #  Improved structured prompt
    formatted_prompt = (
        f"Patient Information: {user_query}\n"
        "Provide a clear and medically sound recommendation with specific actions."
    )

    # Tokenize the input with proper padding
    inputs = tokenizer(
        formatted_prompt, return_tensors="pt", padding=True, truncation=True
    ).to(device)

    # Generate response with improved decoding strategy
    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  #  Ensures proper token selection
        max_length=max_length,
        num_return_sequences=1,
        temperature=temperature,  #  Controls randomness
        top_k=top_k,  #  Limits token selection to top-k choices
        top_p=top_p,  #  Nucleus sampling for natural responses
        do_sample=True,  #  Enables sampling
        repetition_penalty=1.2,  #  Reduces repetition
        pad_token_id=tokenizer.pad_token_id,  #  Prevents truncation issues
    )

    # Decode response
    response_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return response_text


In [7]:
# Example Query
user_query = "I have high blood pressure and trouble sleeping. What should I do?"
biomedlm_response = query_biomedlm(user_query)

# Display Response
print("🔹 BioMedLM Response:")
print(biomedlm_response)


🔹 BioMedLM Response:
Patient Information: I have high blood pressure and trouble sleeping. What should I do?
Provide a clear and medically sound recommendation with specific actions.

**Author contributions {#Sec1} \| 1)**

None.
Table 2The following are the authors' contribution:

Jean-Michel Waldmann, K. M. B. Towers, J.F., et al.**Author's response to:**

We apologize for not citing the reference in the second sentence of the text was mistakenly omitted from the first paragraph on page 14, "Another reason" was mistakenly deleted by the reviewer, who suggested that the last sentence (p. 8). We apologized for this error; the reference is omitted. We have added it now reads as follows:

Reviewers:

In the original article can be found here:

Peter Garner, et al."

Commentary to Table [2](#MOESM5){ref-type="media"} (


In [1]:
!pip install transformers torch accelerate sentencepiece




In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load MedAlpaca-7B Model & Tokenizer
model_name = "medalpaca/medalpaca-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

print(" MedAlpaca-7B Model Loaded Successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggin

config.json:   0%|          | 0.00/542 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.88G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.89G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/7.18G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

✅ MedAlpaca-7B Model Loaded Successfully!


In [43]:
def query_medalpaca(user_query, max_length=300, temperature=0.7, top_k=50, top_p=0.9):
    """
    Generates a response from MedAlpaca-7B based on the given medical query.
    """

    # ✅ Set pad_token if missing
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Assign EOS token as padding

    # ✅ Improved structured prompt for MedAlpaca-7B
    formatted_prompt = (
    f"### Instruction: You are a trusted medical assistant. Provide a clear, evidence-based health recommendation for the patient. "
    f"Make your response actionable, easy to understand, and specific. "
    f"Use examples wherever relevant to make your advice practical. "
    f"For example, if recommending a high-potassium diet, mention foods like bananas, spinach, or avocados. "
    f"If advising lifestyle changes, include realistic steps the user can take. "
    f"Ensure all recommendations are medically accurate and structured for easy reading.\n\n"
    f"### Patient Case: {user_query}\n"
    f"### Recommendation: "
)

    # Tokenize the input
    inputs = tokenizer(
        formatted_prompt, return_tensors="pt", padding=True, truncation=True
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    # Generate response with optimized decoding strategy
    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # ✅ Ensures proper token selection
        max_length=max_length,
        num_return_sequences=1,
        temperature=temperature,  # ✅ Controls randomness
        top_k=top_k,  # ✅ Limits token selection to top-k choices
        top_p=top_p,  # ✅ Nucleus sampling for natural responses
        do_sample=True,  # ✅ Enables sampling
        repetition_penalty=1.2,  # ✅ Reduces repetition
        pad_token_id=tokenizer.pad_token_id,  # ✅ Prevents truncation issues
    )

    # Decode response
    response_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return response_text


In [44]:
# Example Query
user_query = "I have high blood pressure and trouble sleeping. What should I do?"
medalpaca_response = query_medalpaca(user_query)

# Display Response
print("🔹 MedAlpaca-7B Response:")
print(medalpaca_response)


🔹 MedAlpaca-7B Response:
### Instruction: You are a trusted medical assistant. Provide a clear, evidence-based health recommendation for the patient. Make your response actionable, easy to understand, and specific. Use examples wherever relevant to make your advice practical. For example, if recommending a high-potassium diet, mention foods like bananas, spinach, or avocados. If advising lifestyle changes, include realistic steps the user can take. Ensure all recommendations are medically accurate and structured for easy reading.

### Patient Case: I have high blood pressure and trouble sleeping. What should I do?
### Recommendation: 1) Incorporate potassium rich foods into your daily meals such as banana, spinach, or avocado. 2) Limit caffeine intake, especially later in the day. 3) Exercise regularly but avoid overexertion that may increase heart rate. 4) Practice relaxation techniques such as deep breathing or yoga. Consider adding these strategies together with medication to improv

BioMedLM is research LLMs designed to provide assistance with the writing and understanding of medical research. For our use case, it is very difficult to get it to output in a structure intelligible to the end user.

MedAlpaca is better, but it lacks the flexibility in response that is present in a larger more gneralized model. We will stick with GPT 4-o for this test and will use techniques like multi shot prompting and RAG to ensure that the model is providing correct and easy to use insights