In [30]:
import pandas as pd
import numpy as np

# Creating Synthetic Data

In [31]:
# columns I want

# screen_time_hours
# sleep_duration_hours
# study_time_hours
# workout_time_hours
# stress_level -> external stress
# anxiety_level -> internal anxiety
# coffee_intake_mg
# socializing_time_hours
# water_intake_litres
# junk_food_intake

In [32]:
# Reproducibility: seed NumPy's legacy global RNG, which the np.random.* draws
# in the feature-generation cell rely on.
np.random.seed(42)

# Number of synthetic rows to generate.
sample_size = 500

In [33]:
# Draw every lifestyle feature independently from the global NumPy RNG.
# NOTE: keep the statements in this order -- each draw advances the shared RNG
# state, so reordering them would change every value generated afterwards.

# Continuous features: normal(mean, sd) samples clamped to [lower, upper].
screen_time_hours = np.clip(np.random.normal(loc=6, scale=2, size=sample_size), 0, 16)
sleep_duration_hours = np.clip(np.random.normal(loc=7, scale=1.5, size=sample_size), 3, 12)
study_time_hours = np.clip(np.random.normal(loc=4, scale=2, size=sample_size), 0, 12)
workout_time_hours = np.clip(np.random.normal(loc=1, scale=0.5, size=sample_size), 0, 3)

# Ordinal ratings: uniform integers 1..5 (randint's upper bound is exclusive).
stress_level = np.random.randint(low=1, high=6, size=sample_size)   # external stress
anxiety_level = np.random.randint(low=1, high=6, size=sample_size)  # internal anxiety

# Remaining continuous features, clamped the same way as above.
coffee_intake_mg = np.clip(np.random.normal(loc=150, scale=80, size=sample_size), 0, 500)
socializing_time_hours = np.clip(np.random.normal(loc=2, scale=1, size=sample_size), 0, 8)
water_intake_litres = np.clip(np.random.normal(loc=2, scale=0.5, size=sample_size), 0.5, 5)

# Ordinal rating, 1..5.
junk_food_intake = np.random.randint(low=1, high=6, size=sample_size)

In [34]:
# Raw (unscaled) burnout score: a fixed linear combination of the features.
# Positive weights raise the score (screen time, study time, stress, anxiety,
# coffee, junk food); negative weights lower it (sleep, workout, socializing,
# water). Stress and anxiety carry the largest weights.
weighted_features = [
    (0.25, screen_time_hours),
    (-0.3, sleep_duration_hours),
    (0.2, study_time_hours),
    (-0.3, workout_time_hours),
    (5, stress_level),
    (4, anxiety_level),
    (0.01, coffee_intake_mg),
    (-0.5, socializing_time_hours),
    (-1, water_intake_litres),
    (2, junk_food_intake),
]

# Accumulate the weighted terms left to right, in the order listed above.
burnout_score = sum(weight * values for weight, values in weighted_features)

In [35]:
# Rescale the raw score to a 0-10 range: min-max normalise to [0, 1],
# multiply by 10, then round to 2 decimal places.
raw_min = burnout_score.min()
raw_range = burnout_score.max() - raw_min
burnout_score = ((burnout_score - raw_min) / raw_range * 10).round(2)

In [36]:
# Assemble the generated features and the scaled target into a single table.
# Every column is named after the variable that holds it; burnout_score is last.
data = pd.DataFrame(
    dict(
        screen_time_hours=screen_time_hours,
        sleep_duration_hours=sleep_duration_hours,
        study_time_hours=study_time_hours,
        workout_time_hours=workout_time_hours,
        stress_level=stress_level,
        anxiety_level=anxiety_level,
        coffee_intake_mg=coffee_intake_mg,
        socializing_time_hours=socializing_time_hours,
        water_intake_litres=water_intake_litres,
        junk_food_intake=junk_food_intake,
        burnout_score=burnout_score,
    )
)

In [37]:
# Save the generated table as CSV, leaving out the row index (index=False).
# NOTE(review): this runs before the later total-hours filter, so the file
# written here still contains the rows that filter removes.
output_csv = "../data/synthetic_burnout_data.csv"
data.to_csv(output_csv, index=False)

In [38]:
# Preview the assembled frame: a bare last expression is shown as the cell output.
data

Unnamed: 0,screen_time_hours,sleep_duration_hours,study_time_hours,workout_time_hours,stress_level,anxiety_level,coffee_intake_mg,socializing_time_hours,water_intake_litres,junk_food_intake,burnout_score
0,6.993428,8.389266,6.798711,1.389181,3,2,203.491989,2.942846,1.799196,3,4.60
1,5.723471,9.864125,5.849267,0.724407,3,1,76.559299,3.110349,1.922731,2,2.83
2,7.295377,4.902149,4.119261,0.590901,3,1,276.243001,3.083273,2.963104,3,3.81
3,9.046060,7.844454,2.706126,0.998313,4,3,61.908777,1.433593,1.476089,1,5.59
4,5.531693,6.024036,5.396447,0.914908,5,2,58.067609,1.420800,1.592748,5,7.55
...,...,...,...,...,...,...,...,...,...,...,...
495,7.077820,6.578350,8.012186,1.535075,1,2,0.000000,1.260283,1.553573,4,2.84
496,3.925508,9.696530,8.123007,0.986739,3,2,164.944351,2.865953,1.375056,5,5.31
497,5.619323,7.961264,6.416732,0.559063,2,5,65.592759,1.717424,2.281932,2,5.39
498,4.248763,6.143232,6.048125,0.918467,2,5,261.241806,1.828282,2.334936,5,7.09


In [None]:
# Remove infeasible rows: screen, sleep, study and workout hours are sampled
# independently, so their combined total can exceed the 24 hours in a day.
total_hours = (
    data["screen_time_hours"]
    + data["sleep_duration_hours"]
    + data["study_time_hours"]
    + data["workout_time_hours"]
)

# Keep only rows whose total fits in a day; reset_index(drop=True) renumbers
# the surviving rows from 0 and discards the old index.
data = data[total_hours <= 24].reset_index(drop=True)

# Re-export so the CSV on disk matches the cleaned frame: the earlier export
# ran before this filter and therefore still contained the dropped rows.
data.to_csv("../data/synthetic_burnout_data.csv", index=False)

data

Unnamed: 0,screen_time_hours,sleep_duration_hours,study_time_hours,workout_time_hours,stress_level,anxiety_level,coffee_intake_mg,socializing_time_hours,water_intake_litres,junk_food_intake,burnout_score
0,6.993428,8.389266,6.798711,1.389181,3,2,203.491989,2.942846,1.799196,3,4.60
1,5.723471,9.864125,5.849267,0.724407,3,1,76.559299,3.110349,1.922731,2,2.83
2,7.295377,4.902149,4.119261,0.590901,3,1,276.243001,3.083273,2.963104,3,3.81
3,9.046060,7.844454,2.706126,0.998313,4,3,61.908777,1.433593,1.476089,1,5.59
4,5.531693,6.024036,5.396447,0.914908,5,2,58.067609,1.420800,1.592748,5,7.55
...,...,...,...,...,...,...,...,...,...,...,...
476,7.077820,6.578350,8.012186,1.535075,1,2,0.000000,1.260283,1.553573,4,2.84
477,3.925508,9.696530,8.123007,0.986739,3,2,164.944351,2.865953,1.375056,5,5.31
478,5.619323,7.961264,6.416732,0.559063,2,5,65.592759,1.717424,2.281932,2,5.39
479,4.248763,6.143232,6.048125,0.918467,2,5,261.241806,1.828282,2.334936,5,7.09


# EDA

In [None]:
from ydata_profiling import ProfileReport

# Build a profiling report for `data` with the explorative option switched on.
profile = ProfileReport(
    data,
    title="Burnout Dataset EDA",
    explorative=True,
)

profile.to_widgets()  # show the report inside the Jupyter notebook
profile.to_file("burnout_eda_report.html")  # also save it as an HTML file