In [1]:
!pip install pandas scikit-learn sentence-transformers openpyxl

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, hamming_loss
from sentence_transformers import SentenceTransformer

### Load and Clean the Dataset

In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
import pandas as pd

file_path = '/content/drive/My Drive/ML_Project/Responses4.xlsx'
df_seed = pd.read_excel(file_path, engine='openpyxl')

df_seed.head()

Unnamed: 0,Age,Gender,Height (in cm),Weight (in kg),Daily Activity Level,Average Sleep Duration (hours/day),Water Intake (liter/day),Do you consume alcohol,Current Mood (Last 1 week),Stress Level last 1 week( Scale 1 to 5),Do you have any of followingcconditions,Family history of any chronic disease,"Describe any health symptoms you're currently experiencing \nEg. Fatigue, headache, chest pain, muscle pain, insomnia, join pain, dizziness, etc",Your current wellness goal
0,21,Male,160,56,Medium (Walks/Exercise occasionally),6-7,More than 3,No,Neutral,3,,,headache,Improve mental health
1,20,Male,159,54,Low (Sedentary lifestyle),5-6,2-3,No,Stressed,4,,,No,Gain muscle
2,20,Male,162,55,Medium (Walks/Exercise occasionally),7-8,More than 3,No,Neutral,3,,,Headache,Maintain good
3,21,Male,175,69,Medium (Walks/Exercise occasionally),6-7,More than 3,No,Stressed,4,,,Fatigue,Improve mental health
4,19,Male,170,50,Medium (Walks/Exercise occasionally),7-8,2-3,No,Happy,1,,,Headache,Gain muscle


In [23]:
df_seed.columns = df_seed.columns.str.strip().str.replace('\n', ' ', regex=True)
print(df_seed.columns.tolist())

['Age', 'Gender', 'Height (in cm)', 'Weight (in kg)', 'Daily Activity Level', 'Average Sleep Duration (hours/day)', 'Water Intake (liter/day)', 'Do you consume alcohol', 'Current Mood (Last 1 week)', 'Stress Level last 1 week( Scale 1 to 5)', 'Do you have any of followingcconditions', 'Family history of any chronic disease', "Describe any health symptoms you're currently experiencing  Eg. Fatigue, headache, chest pain, muscle pain, insomnia, join pain, dizziness, etc", 'Your current wellness goal']


In [24]:
df_seed.rename(columns={
    'Age': 'age',
    'Gender': 'gender',
    'Height (in cm)': 'height_cm',
    'Weight (in kg)': 'weight_kg',
    'Daily Activity Level': 'activity_level',
    'Average Sleep Duration (hours/day)': 'sleep_duration',
    'Water Intake (liter/day)': 'water_intake',
    'Do you consume alcohol': 'alcohol',
    'Current Mood (Last 1 week)': 'mood',
    'Stress Level last 1 week( Scale 1 to 5)': 'stress_level',
    'Do you have any of followingcconditions': 'medical_history',
    'Family history of any chronic disease': 'family_history',
    "Describe any health symptoms you're currently experiencing  Eg. Fatigue, headache, chest pain, muscle pain, insomnia, join pain, dizziness, etc": 'symptoms',
    'Your current wellness goal': 'wellness_goal'
}, inplace=True)

# Confirm changes
print(df_seed.columns.tolist())

['age', 'gender', 'height_cm', 'weight_kg', 'activity_level', 'sleep_duration', 'water_intake', 'alcohol', 'mood', 'stress_level', 'medical_history', 'family_history', 'symptoms', 'wellness_goal']


In [25]:
df_seed.info()

for col in df_seed.columns:
    print(f"\n🔹 {col}:\n", df_seed[col].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              21 non-null     int64 
 1   gender           21 non-null     object
 2   height_cm        21 non-null     int64 
 3   weight_kg        21 non-null     int64 
 4   activity_level   21 non-null     object
 5   sleep_duration   21 non-null     object
 6   water_intake     21 non-null     object
 7   alcohol          21 non-null     object
 8   mood             21 non-null     object
 9   stress_level     21 non-null     int64 
 10  medical_history  2 non-null      object
 11  family_history   2 non-null      object
 12  symptoms         20 non-null     object
 13  wellness_goal    21 non-null     object
dtypes: int64(4), object(10)
memory usage: 2.4+ KB

🔹 age:
 [21 20 19 66 28 43 18 81 22]

🔹 gender:
 ['Male' 'Female']

🔹 height_cm:
 [160 159 162 175 170 158 184 180 147 178 15

In [26]:
import numpy as np


sleep_map = {
    '5-6': 5.5,
    '6-7': 6.5,
    '7-8': 7.5,
    'More than 8': 9.0
}
df_seed['sleep_duration'] = df_seed['sleep_duration'].map(sleep_map)

water_map = {
    '1-2': 1.5,
    '2-3': 2.5,
    'More than 3': 3.5,
}
df_seed['water_intake'] = df_seed['water_intake'].map(water_map)

df_seed['activity_level'] = df_seed['activity_level'].map({
    'Low (Sedentary lifestyle)': 'Low',
    'Medium (Walks/Exercise occasionally)': 'Medium',
    'High (Exercises regularly)': 'High'
})

df_seed['alcohol'] = df_seed['alcohol'].map({'No': 0, 'Occasionally': 1})

df_seed['gender'] = df_seed['gender'].map({'Male': 0, 'Female': 1})

mood_map = {m: i for i, m in enumerate(df_seed['mood'].unique())}
df_seed['mood'] = df_seed['mood'].map(mood_map)

df_seed['symptoms'] = df_seed['symptoms'].astype(str).str.lower().str.strip()
df_seed['symptoms'] = df_seed['symptoms'].replace({'nan': '', 'na': '', 'no': ''})

df_seed.head()

Unnamed: 0,age,gender,height_cm,weight_kg,activity_level,sleep_duration,water_intake,alcohol,mood,stress_level,medical_history,family_history,symptoms,wellness_goal
0,21,0,160,56,Medium,6.5,3.5,0,0,3,,,headache,Improve mental health
1,20,0,159,54,Low,5.5,2.5,0,1,4,,,,Gain muscle
2,20,0,162,55,Medium,7.5,3.5,0,0,3,,,headache,Maintain good
3,21,0,175,69,Medium,6.5,3.5,0,1,4,,,fatigue,Improve mental health
4,19,0,170,50,Medium,7.5,2.5,0,2,1,,,headache,Gain muscle


In [27]:
df_seed['bmi'] = df_seed['weight_kg'] / ((df_seed['height_cm'] / 100) ** 2)

In [28]:
from sklearn.preprocessing import StandardScaler

df_encoded = pd.get_dummies(df_seed, columns=['activity_level'], drop_first=True)

num_cols = ['age', 'height_cm', 'weight_kg', 'bmi', 'sleep_duration', 'water_intake', 'stress_level']
scaler = StandardScaler()
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

df_encoded.head()

Unnamed: 0,age,gender,height_cm,weight_kg,sleep_duration,water_intake,alcohol,mood,stress_level,medical_history,family_history,symptoms,wellness_goal,bmi,activity_level_Low,activity_level_Medium
0,-0.388517,0,-0.631989,-0.717131,-0.70881,1.24939,0,0,0.722315,,,headache,Improve mental health,-0.308363,False,True
1,-0.450798,0,-0.736492,-0.916598,-1.899611,-0.390434,0,1,1.986367,,,,Gain muscle,-0.451269,True,False
2,-0.450798,0,-0.422985,-0.816865,0.481991,1.24939,0,0,0.722315,,,headache,Maintain good,-0.563006,False,True
3,-0.388517,0,0.935543,0.579404,-0.70881,1.24939,0,1,1.986367,,,fatigue,Improve mental health,-0.126469,False,True
4,-0.51308,0,0.413032,-1.315532,0.481991,-0.390434,0,2,-1.805788,,,headache,Gain muscle,-1.577368,False,True


In [29]:
print("Null values in each column:\n")
print(df_encoded.isnull().sum())

print("\nTotal missing values:", df_encoded.isnull().sum().sum())

Null values in each column:

age                       0
gender                    0
height_cm                 0
weight_kg                 0
sleep_duration            0
water_intake              0
alcohol                   0
mood                      0
stress_level              0
medical_history          19
family_history           19
symptoms                  0
wellness_goal             0
bmi                       0
activity_level_Low        0
activity_level_Medium     0
dtype: int64

Total missing values: 38


In [30]:
df_encoded['medical_history'] = df_encoded['medical_history'].fillna('None')
df_encoded['family_history'] = df_encoded['family_history'].fillna('None')

In [31]:
print(df_encoded.isnull().sum().sum())

0


In [32]:
df_seed.columns

Index(['age', 'gender', 'height_cm', 'weight_kg', 'activity_level',
       'sleep_duration', 'water_intake', 'alcohol', 'mood', 'stress_level',
       'medical_history', 'family_history', 'symptoms', 'wellness_goal',
       'bmi'],
      dtype='object')

In [33]:
import numpy as np
import random

new_values = {
    'alcohol': ['Regularly'],
    'medical_history': ['Diabetes', 'Cancer', 'Asthma', 'None'],
    'family_history': ['Diabetes', 'Cancer', 'Heart Disease', 'None'],
    'wellness_goal': ['Stay Focused', 'Improve Immunity', 'Better Sleep']
}

synthetic_data = []
num_samples = 500

for _ in range(num_samples):
    row = df_seed.sample(1).iloc[0].copy()

    row['alcohol'] = random.choice(['No', 'Occasionally', 'Regularly'])
    row['medical_history'] = random.choice(
        df_seed['medical_history'].dropna().tolist() + new_values['medical_history']
    )
    row['family_history'] = random.choice(
        df_seed['family_history'].dropna().tolist() + new_values['family_history']
    )
    row['wellness_goal'] = random.choice(
        df_seed['wellness_goal'].dropna().tolist() + new_values['wellness_goal']
    )

    row['age'] = np.clip(int(np.random.normal(row['age'], 4)), 18, 85)
    row['height_cm'] = np.clip(row['height_cm'] + np.random.normal(0, 2), 140, 200)
    row['weight_kg'] = np.clip(row['weight_kg'] + np.random.normal(0, 3), 30, 120)
    row['sleep_duration'] = np.clip(row['sleep_duration'] + np.random.normal(0, 0.5), 4, 10)
    row['water_intake'] = np.clip(row['water_intake'] + np.random.normal(0, 0.3), 1.0, 12.0)
    row['stress_level'] = random.randint(1, 5)

    row['bmi'] = row['weight_kg'] / ((row['height_cm'] / 100) ** 2)

    synthetic_data.append(row)

df_synthetic = pd.DataFrame(synthetic_data)
print("Synthetic Shape:", df_synthetic.shape)

df_final = pd.concat([df_seed, df_synthetic], ignore_index=True)
print("Final Dataset Shape:", df_final.shape)

Synthetic Shape: (500, 15)
Final Dataset Shape: (521, 15)


In [39]:
from google.colab import files
df_final.to_csv("SyntheticDataset.csv", index=False)
files.download("SyntheticDataset.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [40]:
uploaded = files.upload()

Saving SyntheticDataset.csv to SyntheticDataset (1).csv


In [41]:
import pandas as pd
df = pd.read_csv('SyntheticDataset.csv')

print(df.head())
print(df.columns.tolist())

   age  gender  height_cm  weight_kg activity_level  sleep_duration  \
0   21       0      160.0       56.0         Medium             6.5   
1   20       0      159.0       54.0            Low             5.5   
2   20       0      162.0       55.0         Medium             7.5   
3   21       0      175.0       69.0         Medium             6.5   
4   19       0      170.0       50.0         Medium             7.5   

   water_intake alcohol  mood  stress_level medical_history family_history  \
0           3.5       0     0             3             NaN            NaN   
1           2.5       0     1             4             NaN            NaN   
2           3.5       0     0             3             NaN            NaN   
3           3.5       0     1             4             NaN            NaN   
4           2.5       0     2             1             NaN            NaN   

   symptoms          wellness_goal        bmi  
0  headache  Improve mental health  21.875000  
1       

In [42]:
df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,activity_level,sleep_duration,water_intake,alcohol,mood,stress_level,medical_history,family_history,symptoms,wellness_goal,bmi
0,21,0,160.0,56.0,Medium,6.5,3.5,0,0,3,,,headache,Improve mental health,21.875
1,20,0,159.0,54.0,Low,5.5,2.5,0,1,4,,,,Gain muscle,21.359915
2,20,0,162.0,55.0,Medium,7.5,3.5,0,0,3,,,headache,Maintain good,20.957171
3,21,0,175.0,69.0,Medium,6.5,3.5,0,1,4,,,fatigue,Improve mental health,22.530612
4,19,0,170.0,50.0,Medium,7.5,2.5,0,2,1,,,headache,Gain muscle,17.301038


### Normalize Structured Data

In [43]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

df = df.dropna(subset=['bmi'])

num_cols = ['age', 'height_cm', 'weight_kg', 'bmi', 'sleep_duration', 'water_intake']
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

le_activity = LabelEncoder()
df['activity_level'] = le_activity.fit_transform(df['activity_level'].astype(str))

###  BERT Embeddings for Symptoms
python
Copy
Edit


In [44]:
!pip install -q sentence-transformers
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
df['symptoms'] = df['symptoms'].fillna('')
symptom_embeddings = model.encode(df['symptoms'].tolist())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Generate Multi-label Output (Rule-Based)

In [47]:
def generate_labels(row):
    labels = []

    if row['bmi'] > 0.85 or row['bmi'] < 0.25:
        labels.append("Balanced Diet Recommendation")
    if row['activity_level'] <= 1:
        labels.append("Workout Plan")
    if "pain" in row['symptoms'].lower() or "dizzy" in row['symptoms'].lower() or "fatigue" in row['symptoms'].lower():
        labels.append("Doctor Consultation Advised")
    if "stress" in row['symptoms'].lower() or row['mood'] == 1:
        labels.append("Mental Wellness Tip")
    if row['water_intake'] < 0.4:
        labels.append("Increase Water Intake")
    if row['sleep_duration'] < 0.4:
        labels.append("Sleep Improvement Suggestion")

    labels.append("Lifestyle Monitoring")
    return labels

df['labels'] = df.apply(generate_labels, axis=1)

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df['labels'])
output_labels = mlb.classes_

### Merge Features (Structured + BERT)

In [48]:
structured = df[['age', 'gender', 'height_cm', 'weight_kg', 'bmi', 'activity_level', 'sleep_duration', 'water_intake']].values

import numpy as np
X = np.hstack((structured, symptom_embeddings))

### Train-Test Split and Model

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

### Evaluation

In [50]:
from sklearn.metrics import classification_report, hamming_loss

y_pred = model.predict(X_test)

print("Hamming Loss:", hamming_loss(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=output_labels))

Hamming Loss: 0.0163265306122449

Classification Report:

                              precision    recall  f1-score   support

Balanced Diet Recommendation       1.00      0.85      0.92        26
 Doctor Consultation Advised       1.00      1.00      1.00        35
       Increase Water Intake       1.00      0.80      0.89        30
        Lifestyle Monitoring       1.00      1.00      1.00       105
         Mental Wellness Tip       1.00      1.00      1.00        14
Sleep Improvement Suggestion       1.00      0.96      0.98        24
                Workout Plan       0.97      1.00      0.99        37

                   micro avg       1.00      0.96      0.98       271
                   macro avg       1.00      0.94      0.97       271
                weighted avg       1.00      0.96      0.98       271
                 samples avg       1.00      0.97      0.98       271



### Label-to-Recommendation Mapping

In [51]:
recommendation_texts = {
    "Balanced Diet Recommendation": "Eat iron-rich foods like spinach.",
    "Workout Plan": "Try 30 mins of moderate cardio.",
    "Increase Water Intake": "Increase to at least 3 liters/day.",
    "Sleep Improvement Suggestion": "Aim for 8 hours of sleep, reduce screen time before bed.",
    "Doctor Consultation Advised": "Consult a doctor due to your symptoms.",
    "Mental Wellness Tip": "Practice meditation and deep breathing.",
    "Lifestyle Monitoring": "Track your health weekly using a health app or diary."
}

### View Sample Output

In [52]:
from sentence_transformers import SentenceTransformer
text_model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample input
# input_data = {
#     'age': 26,
#     'gender': 1,
#     'height_cm': 170,
#     'weight_kg': 62,
#     'bmi': 21.5,
#     'activity_level': 'High',
#     'sleep_duration': 7.5,
#     'water_intake': 2.5,
#     'symptoms': 'none',
#     'mood': 1
# }

# Stressed & Poor Lifestyle
# input_data = {
#     'age': 34,
#     'gender': 0,
#     'height_cm': 155,
#     'weight_kg': 70,
#     'bmi': 29.1,
#     'activity_level': 'Low',
#     'sleep_duration': 5,
#     'water_intake': 0.8,
#     'symptoms': 'stress, low energy, anxiety, poor appetite',
#     'mood': 0
# }

# Senior with Multiple Issues
# input_data = {
#     'age': 68,
#     'gender': 1,
#     'height_cm': 165,
#     'weight_kg': 85,
#     'bmi': 31.2,
#     'activity_level': 'Low',
#     'sleep_duration': 5.5,
#     'water_intake': 1.0,
#     'symptoms': 'joint pain, fatigue, dizziness, shortness of breath',
#     'mood': 0
# }

# Young Adult with Mental Health Signs
# input_data = {
#     'age': 22,
#     'gender': 0,
#     'height_cm': 160,
#     'weight_kg': 54,
#     'bmi': 21.1,
#     'activity_level': 'High',
#     'sleep_duration': 6.5,
#     'water_intake': 1.8,
#     'symptoms': 'anxiety, sadness, lack of concentration, insomnia',
#     'mood': 0
# }

# Middle-aged with Good Habits
input_data = {
    'age': 45,
    'gender': 1,
    'height_cm': 172,
    'weight_kg': 68,
    'bmi': 23.0,
    'activity_level': 'High',
    'sleep_duration': 7.0,
    'water_intake': 2.8,
    'symptoms': 'none',
    'mood': 1
}


import pandas as pd

input_df = pd.DataFrame([input_data])

num_cols = ['age', 'height_cm', 'weight_kg', 'bmi', 'sleep_duration', 'water_intake']
input_df[num_cols] = scaler.transform(input_df[num_cols])

input_df['activity_level'] = le_activity.transform(input_df['activity_level'].astype(str))

structured_input = input_df[['age', 'gender', 'height_cm', 'weight_kg', 'bmi', 'activity_level', 'sleep_duration', 'water_intake']].values

symptom_embed = text_model.encode(input_df['symptoms'].tolist())

X_input = np.hstack((structured_input, symptom_embed))

y_pred = model.predict(X_input)

predicted_labels = mlb.inverse_transform(y_pred)

print("Predicted Recommendations:")
if predicted_labels and predicted_labels[0]:
    for label in predicted_labels[0]:
        print(f" {label}")
        print(f" Recommendation: {recommendation_texts.get(label, 'No details available.')}")
else:
    print("⚠️ No specific recommendations found. Please consult a healthcare professional.")


Predicted Recommendations:
 Lifestyle Monitoring
 Recommendation: Track your health weekly using a health app or diary.
 Workout Plan
 Recommendation: Try 30 mins of moderate cardio.
