In [2]:
import pandas as pd
import numpy as np

# Set the seed for reproducibility
np.random.seed(42)

# Number of users in the dataset
num_users = 10000

# Generate synthetic data for the survey
data = {
    'User_ID': range(1, num_users + 1),
    'Email': [f'user{i}@example.com' for i in range(1, num_users + 1)],
    'Phone_Number': [''.join(np.random.choice(list('0123456789')) for _ in range(10)) for _ in range(num_users)],
    'Name': [f'User {i}' for i in range(1, num_users + 1)],
    
    # Lifestyle details
    'Physical_Activity_Frequency': np.random.choice(['None', 'Occasional', 'Regular'], size=num_users),
    'Workout_Type': np.random.choice(['Cardio', 'Strength Training', 'Yoga', 'None'], size=num_users),
    'Age': np.random.randint(18, 80, size=num_users),
    'Health_Issues': np.random.choice(['Yes', 'No'], size=num_users),
    'Existing_Medications': np.random.choice(['Aspirin', 'Ibuprofen', 'None'], size=num_users),
    
    # Existing Dietary details
    'Food_Consumptions': np.random.choice(['Balanced', 'Vegetarian', 'Vegan'], size=num_users),
    'Nutrition_Intake': np.random.choice(['Adequate', 'Inadequate'], size=num_users),
    'Water_Intake': np.random.randint(1, 4, size=num_users),  # Assuming 1-4 liters per day
    
    # Existing concerns
    'Bone_Health': np.random.choice(['Excellent', 'Good', 'Poor'], size=num_users),
    'Skin_Health': np.random.choice(['Excellent', 'Good', 'Poor'], size=num_users),
    'Heart_Health': np.random.choice(['Excellent', 'Good', 'Poor'], size=num_users),
    'Gut_Health': np.random.choice(['Excellent', 'Good', 'Poor'], size=num_users),
    'Overall_Physical_Health': np.random.choice(['Excellent', 'Good', 'Poor'], size=num_users),
    'Mental_Brain_Health': np.random.choice(['Excellent', 'Good', 'Poor'], size=num_users),
    
    # Symptoms
    'Energy_Symptoms': np.random.choice(['High', 'Moderate', 'Low'], size=num_users),
    'Skin_Hair_Symptoms': np.random.choice(['Excellent', 'Good', 'Poor'], size=num_users),
    'Internal_Body_Function_Symptoms': np.random.choice(['Excellent', 'Good', 'Poor'], size=num_users),
    'Stress_Anxiety_Symptoms': np.random.choice(['Low', 'Moderate', 'High'], size=num_users),
    
    # History of Vitamin Intake
    'Vitamin_Intake_History': np.random.choice(['Yes', 'No'], size=num_users),
    
    # Goals
    'Active_Lifestyle_Goals': np.random.choice(['Maintain', 'Improve'], size=num_users),
    'Sports_Lifestyle_Goals': np.random.choice(['Participate', 'Excel'], size=num_users),
    'Health_Improvement_Goals': np.random.choice(['Weight_Loss', 'Muscle_Gain', 'Overall_Wellness'], size=num_users),
    'Special_Life_Stages_Goals': np.random.choice(['Elderly_Care', 'Pregnancy_Care', 'Young_Kids_Care'], size=num_users),
    
    # Recommended Vitamin (Target Column)
    'Recommended_Vitamin': np.nan,
}

# Create a DataFrame
df = pd.DataFrame(data)

# Define rules for recommending vitamins based on existing concerns, symptoms, and goals
# This is a simplified example and may not reflect real-world medical advice

for index, row in df.iterrows():
    # If overall physical health or gut health is poor, recommend Vitamin D
    if row['Overall_Physical_Health'] == 'Poor' or row['Gut_Health'] == 'Poor':
        df.at[index, 'Recommended_Vitamin'] = 'D'
    # If heart health, mental/brain health, or stress/anxiety symptoms are concerning, recommend Vitamin B12
    elif row['Heart_Health'] == 'Poor' or row['Mental_Brain_Health'] == 'Poor' or row['Stress_Anxiety_Symptoms'] == 'High':
        df.at[index, 'Recommended_Vitamin'] = 'B12'
    # If skin/hair symptoms or internal body function symptoms are poor, recommend Vitamin A
    elif row['Skin_Hair_Symptoms'] == 'Poor' or row['Internal_Body_Function_Symptoms'] == 'Poor':
        df.at[index, 'Recommended_Vitamin'] = 'A'
    # If energy symptoms are low, recommend Vitamin E
    elif row['Energy_Symptoms'] == 'Low':
        df.at[index, 'Recommended_Vitamin'] = 'E'
    # If any other condition is met, recommend a random vitamin from A, B12, C, D, E, K
    else:
        df.at[index, 'Recommended_Vitamin'] = np.random.choice(['A', 'B12', 'C', 'D', 'E', 'K'])

# Save the dataset to a CSV file
df.to_csv('health_survey_dataset.csv', index=False)

# Display the first few rows of the generated dataset
print(df.head())


   User_ID              Email Phone_Number    Name  \
0        1  user1@example.com   6374692674  User 1   
1        2  user2@example.com   3772541751  User 2   
2        3  user3@example.com   4095809263  User 3   
3        4  user4@example.com   8242648613  User 4   
4        5  user5@example.com   8198941367  User 5   

  Physical_Activity_Frequency       Workout_Type  Age Health_Issues  \
0                     Regular  Strength Training   79            No   
1                     Regular               Yoga   24            No   
2                     Regular               None   35           Yes   
3                        None  Strength Training   67           Yes   
4                     Regular  Strength Training   28           Yes   

  Existing_Medications Food_Consumptions  ... Energy_Symptoms  \
0            Ibuprofen          Balanced  ...        Moderate   
1            Ibuprofen        Vegetarian  ...        Moderate   
2                 None        Vegetarian  ...        