In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global generator so reruns reproduce the same table
np.random.seed(0)

# How many synthetic records to draw
num_samples = 1000

# Draw the feature columns one at a time. Every draw consumes the same seeded
# global random stream, so the order of these statements fixes the values.
feature_columns = {}
feature_columns['age'] = np.random.randint(18, 80, num_samples)
feature_columns['gender'] = np.random.choice(['Male', 'Female'], num_samples)
# Normal draws (mean 70, std 15), rounded to one decimal place
feature_columns['weight_kg'] = np.round(np.random.normal(70, 15, num_samples), 1)
# Normal draws (mean 165, std 10), rounded to one decimal place
feature_columns['height_cm'] = np.round(np.random.normal(165, 10, num_samples), 1)
# Integer draws from 60 up to (not including) 100
feature_columns['heart_rate'] = np.random.randint(60, 100, num_samples)
feature_columns['calories'] = np.random.randint(1500, 3000, num_samples)
feature_columns['activity_level'] = np.random.choice(['Low', 'Moderate', 'High'], num_samples)
feature_columns['smoking_status'] = np.random.choice(['Smoker', 'Non-Smoker'], num_samples)
feature_columns['alcohol_intake'] = np.random.choice(['None', 'Moderate', 'High'], num_samples)
data = pd.DataFrame(feature_columns)

# Derive the synthetic risk label from feature combinations.
# Each rule is a (row mask, label) pair; np.select assigns the label of the
# FIRST matching rule, so rule order is the priority order. Rows matching no
# rule receive the default 'Low'.
low_activity = data['activity_level'].eq('Low')
older_smoker_fast_pulse = (
    data['age'].ge(50)
    & data['heart_rate'].ge(80)
    & data['smoking_status'].eq('Smoker')
)
risk_rules = [
    (older_smoker_fast_pulse, 'High'),
    (low_activity & data['weight_kg'].gt(80), 'Moderate'),
    (data['calories'].gt(2500) & low_activity, 'Moderate'),
    (data['alcohol_intake'].eq('High'), 'Moderate'),
    # Same label as the default; kept as an explicit rule
    (data['age'].lt(30) & data['activity_level'].eq('High'), 'Low'),
]
conditions = [mask for mask, _ in risk_rules]
choices = [label for _, label in risk_rules]
data['risk'] = np.select(conditions, choices, default='Low')

# Show the first rows as a quick sanity check
print(data.head())

# Persist the table (without the index column) for later cells
data.to_csv("synthetic_health_data.csv", index=False)
print("Synthetic dataset saved as 'synthetic_health_data.csv'")


   age  gender  weight_kg  height_cm  heart_rate  calories activity_level  \
0   62    Male       65.2      173.2          79      2614       Moderate   
1   65    Male       61.1      155.1          62      2981            Low   
2   71    Male       64.1      130.6          89      2870       Moderate   
3   18  Female       69.5      173.3          61      2949       Moderate   
4   21    Male       65.3      173.2          81      2507           High   

  smoking_status alcohol_intake      risk  
0         Smoker           High  Moderate  
1     Non-Smoker           High  Moderate  
2         Smoker           High      High  
3     Non-Smoker           High  Moderate  
4     Non-Smoker           None       Low  
Synthetic dataset saved as 'synthetic_health_data.csv'


In [None]:
import pandas as pd

# Load the dataset written by the generation step.
# 'None' is a real alcohol_intake category in that file, but newer pandas
# releases (2.0+) include the literal string "None" among read_csv's default
# NA markers, which would silently turn that category into NaN. Switch the
# defaults off and treat only empty fields as missing so the label survives
# the CSV round trip.
data = pd.read_csv(
    "synthetic_health_data.csv",
    keep_default_na=False,
    na_values=[""],
)

# Rename category labels in 'alcohol_intake' and 'smoking_status', if present.
# The generation cell in this notebook never emits 'Frequent' or 'Occasional',
# so on that data these replacements change nothing; they only matter if the
# CSV was produced by a different version of the generator.
data['alcohol_intake'] = data['alcohol_intake'].replace({'Frequent': 'Regular'})
data['smoking_status'] = data['smoking_status'].replace({'Occasional': 'Former Smoker'})

# Hand-written rows that introduce the new categories.
# The 'risk' labels use the same vocabulary as the generated rows
# ('Low' / 'Moderate' / 'High') so the target column does not gain a stray
# extra class.
# NOTE(review): 'Current Smoker' coexists with the generated 'Smoker' label --
# confirm that a separate category is intended.
new_rows = pd.DataFrame({
    'age': [45, 34],
    'gender': ['Male', 'Female'],
    'weight_kg': [75, 60],
    'height_cm': [170, 160],
    'heart_rate': [80, 72],
    'calories': [2200, 1900],
    'activity_level': ['Moderate', 'High'],
    'smoking_status': ['Non-Smoker', 'Current Smoker'],
    'alcohol_intake': ['Regular', 'None'],
    'risk': ['Moderate', 'Low']
})

# Guard against schema drift: concat would otherwise pad any unmatched
# column with NaN instead of failing.
mismatched_columns = set(data.columns) ^ set(new_rows.columns)
assert not mismatched_columns, f"column mismatch between dataset and new rows: {sorted(mismatched_columns)}"

# Append new rows to the dataset
data = pd.concat([data, new_rows], ignore_index=True)

# Save the updated dataset
data.to_csv("updated_synthetic_health_data.csv", index=False)

print("Dataset updated and saved as 'updated_synthetic_health_data.csv'.")
