In [5]:
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [29]:
# Read the raw survey export and list its original column headers.
survey_path = 'data_sets/survey.csv'
df = pd.read_csv(survey_path)
df.columns

Index(['Timestamp', 'Age (in years)', 'Gender', 'Daily Screen time ',
       'Tea or Coffee consumption per day',
       'How many times a week do you exercise?',
       'How stressful is your daily life? (5 being the highest)',
       'How many hours do you sleep per night in average? ',
       'How many times do you snore ?',
       'How many times do you wake up at night ?',
       'Do you have following health conditions ?',
       '  Do you feel tired during the day?  '],
      dtype='object')

In [30]:
# Remove the survey questions that are not used later in this notebook.
# The header strings keep their original padding so they match the CSV exactly.
unused_columns = [
    'Timestamp',
    'Tea or Coffee consumption per day',
    'Daily Screen time ',
    'How many times do you snore ?',
    'How many times do you wake up at night ?',
    '  Do you feel tired during the day?  ',
]
df.drop(columns=unused_columns, inplace=True)

In [31]:
# Preview the first five rows of the trimmed frame.
df.head(5)

Unnamed: 0,Age (in years),Gender,How many times a week do you exercise?,How stressful is your daily life? (5 being the highest),How many hours do you sleep per night in average?,Do you have following health conditions ?
0,22,Male,Rarely,3,6.0,
1,24,Male,Daily,3,6.0,
2,25,Male,Rarely,3,7.0,
3,22,Male,3-5 times a week,3,6.0,Obesity (BMI > 30)
4,22,Female,Daily,4,8.0,


In [32]:
# List the column headers that are still present in the frame.
df.columns

Index(['Age (in years)', 'Gender', 'How many times a week do you exercise?',
       'How stressful is your daily life? (5 being the highest)',
       'How many hours do you sleep per night in average? ',
       'Do you have following health conditions ?'],
      dtype='object')

In [33]:
# Map each verbose survey question to a short, analysis-friendly column name.
short_names = {
    'Age (in years)': 'Age',
    'Gender': 'Gender',
    'How many times a week do you exercise?': 'Physical Activity Level',
    'How stressful is your daily life? (5 being the highest)': 'Stress Level',
    'How many hours do you sleep per night in average? ': 'Sleep Duration',
    'Do you have following health conditions ?': 'Health Conditions',
}
df.rename(columns=short_names, inplace=True)


In [34]:
def clean_activity_level(value):
    """Normalise an exercise-frequency answer to a canonical label.

    Parameters
    ----------
    value : object
        One cell of the exercise-frequency column.

    Returns
    -------
    object
        "3-5 times a week" or "Rarely" when the text contains that phrase;
        otherwise ``value`` itself, unchanged. Non-string input (for example
        a NaN standing in for a missing answer) is also returned unchanged.
    """
    # A substring test on a non-string (float NaN, None, ...) raises
    # TypeError, so pass such values through untouched.
    if not isinstance(value, str):
        return value
    if "3-5 times a week" in value:
        return "3-5 times a week"
    if "Rarely" in value:
        return "Rarely"
    return value  # Already canonical (or unrecognised): keep as-is

# Normalise every answer in the exercise-frequency column.
raw_levels = df['Physical Activity Level']
df['Physical Activity Level'] = raw_levels.apply(clean_activity_level)

In [35]:
def assign_random_activity(value):
    """Turn an exercise-frequency label into a random numeric activity score.

    Parameters
    ----------
    value : object
        Expected to be "Daily", "3-5 times a week" or "Rarely".

    Returns
    -------
    int or None
        A random integer (bounds inclusive) in 80-100 for "Daily", 50-79 for
        "3-5 times a week", 10-49 for "Rarely"; ``None`` for any other value.

    Notes
    -----
    This function only computes a score; it does not touch ``df``. The
    previous version carried a ``df[...] = ...apply(...)`` line indented
    inside the body after the final ``return``, which could never execute.
    The column is converted by the explicit ``apply`` call in a later cell.
    """
    if value == "Daily":
        return random.randint(80, 100)  # Daily → 80 to 100
    elif value == "3-5 times a week":
        return random.randint(50, 79)  # 3-5 times a week → 50 to 79
    elif value == "Rarely":
        return random.randint(10, 49)  # Rarely → 10 to 49
    # Unrecognised labels get no score.
    return None

In [36]:
# Preview the first five rows; the activity column still holds text labels here.
df.head(5)

Unnamed: 0,Age,Gender,Physical Activity Level,Stress Level,Sleep Duration,Health Conditions
0,22,Male,Rarely,3,6.0,
1,24,Male,Daily,3,6.0,
2,25,Male,Rarely,3,7.0,
3,22,Male,3-5 times a week,3,6.0,Obesity (BMI > 30)
4,22,Female,Daily,4,8.0,


In [37]:
# Strip stray leading/trailing whitespace from every header. The stripped
# Index has to be assigned back: `.str.strip()` returns a new Index and
# leaves `df.columns` untouched, so without the assignment a padded header
# would pass the membership test below yet never match the rename key.
df.columns = df.columns.str.strip()

# Fallback for frames where the sleep question has not been renamed yet.
if 'How many hours do you sleep per night in average?' in df.columns:
    df.rename(columns={'How many hours do you sleep per night in average?':'Sleep Duration'}, inplace=True)

In [38]:
# Show the distinct health-condition labels. A preceding bare `df.head()` was
# dropped: only the last expression of a cell is displayed, so its result was
# computed and thrown away.
df['Health Conditions'].unique()

array([nan, 'Obesity (BMI > 30)', 'Diabetes',
       'Hypertension ( High blood pressure )'], dtype=object)

In [39]:
import random
def blood_pressure(df, seed=None):
    """Add synthetic 'Blood Pressure' and 'BMI Category' columns to ``df``.

    Values are drawn at random per row from the 'Health Conditions' entry:

    * missing (NaN): BP from 120/80, 115/75, 130/85; BMI 'Normal'
    * 'Obesity (BMI > 30)': BP from 130/85, 140/90, 150/100; BMI 'Overweight'
    * 'Hypertension ( High blood pressure )': BP from 140/95, 150/100,
      160/110; BMI 'Normal' or 'Overweight'
    * any other label (e.g. 'Diabetes'): both columns are left as NaN

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Health Conditions' column. Any index is accepted;
        rows are processed by position, not by label.
    seed : int, optional
        When given, draws come from a private ``random.Random(seed)`` so the
        result is reproducible. When omitted, the module-level ``random``
        generator is used, as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both columns written in full.
    """
    rng = random if seed is None else random.Random(seed)

    pressures = []
    categories = []
    for condition in df['Health Conditions']:
        if pd.isna(condition):  # No reported condition
            pressure = rng.choice(["120/80", "115/75", "130/85"])  # Normal BP
            category = 'Normal'
        elif condition == 'Obesity (BMI > 30)':
            pressure = rng.choice(["130/85", "140/90", "150/100"])  # Higher BP
            category = 'Overweight'
        elif condition == 'Hypertension ( High blood pressure )':
            pressure = rng.choice(["140/95", "150/100", "160/110"])  # High BP
            category = rng.choice(['Normal', 'Overweight'])
        else:
            # NOTE(review): other labels (e.g. 'Diabetes') have no rule, so
            # these rows stay missing in both columns - confirm this is intended.
            pressure = float("nan")
            category = float("nan")
        pressures.append(pressure)
        categories.append(category)

    # Assigning whole columns by position avoids `df.at[i, ...]` with
    # i in range(len(df)), which raises KeyError (or writes to the wrong row)
    # whenever the index is not the default 0..n-1.
    df['Blood Pressure'] = pressures
    df['BMI Category'] = categories
    return df

# Run the frame through blood_pressure to add the two synthetic columns.
df = df.pipe(blood_pressure)

# Confirm the new columns are present.
df.columns

        

Index(['Age', 'Gender', 'Physical Activity Level', 'Stress Level',
       'Sleep Duration', 'Health Conditions', 'Blood Pressure',
       'BMI Category'],
      dtype='object')

In [40]:
# Remove the raw condition labels from the frame.
obsolete_columns = ['Health Conditions']
df.drop(columns=obsolete_columns, inplace=True)

In [41]:
# Show the distinct sleep durations. The bare `df.head()` and `df.dtypes`
# that used to precede this line were dropped: only the last expression of a
# cell is displayed, so both results were computed and thrown away.
df['Sleep Duration'].unique()

array([ 6. ,  7. ,  8. ,  9. ,  5. ,  4. , 10. ,  6.5,  5.5,  4.5])

In [42]:
def quality_of_sleep(duration):
    """Classify a nightly sleep duration (in hours) as Good, Average or Poor.

    Returns "Good" for 7 to 9 hours inclusive, "Average" for 5 up to (but not
    including) 7 hours, and "Poor" for every other value.
    """
    if 7 <= duration <= 9:
        return "Good"
    # Everything outside the good band is either the 5-7 band or "Poor".
    return "Average" if 5 <= duration < 7 else "Poor"

# Derive a sleep-quality label for every row from its sleep duration.
sleep_hours = df["Sleep Duration"]
df["Quality of Sleep"] = sleep_hours.apply(quality_of_sleep)

# Preview the frame with the new column.
df.head(5)
        

Unnamed: 0,Age,Gender,Physical Activity Level,Stress Level,Sleep Duration,Blood Pressure,BMI Category,Quality of Sleep
0,22,Male,Rarely,3,6.0,120/80,Normal,Average
1,24,Male,Daily,3,6.0,115/75,Normal,Average
2,25,Male,Rarely,3,7.0,115/75,Normal,Good
3,22,Male,3-5 times a week,3,6.0,150/100,Overweight,Average
4,22,Female,Daily,4,8.0,130/85,Normal,Good


In [43]:
# Double every stress score. Vectorised multiplication replaces the
# element-wise `apply(lambda x: 2*x)` and gives the same values.
# NOTE: this cell is not idempotent - re-running it without reloading the
# data doubles the scores again.
df['Stress Level'] = df['Stress Level'] * 2
df.head()

Unnamed: 0,Age,Gender,Physical Activity Level,Stress Level,Sleep Duration,Blood Pressure,BMI Category,Quality of Sleep
0,22,Male,Rarely,6,6.0,120/80,Normal,Average
1,24,Male,Daily,6,6.0,115/75,Normal,Average
2,25,Male,Rarely,6,7.0,115/75,Normal,Good
3,22,Male,3-5 times a week,6,6.0,150/100,Overweight,Average
4,22,Female,Daily,8,8.0,130/85,Normal,Good


In [44]:
def assign_random_activity(value):
    """Turn an exercise-frequency label into a random numeric activity score.

    Returns a random integer (bounds inclusive) in 80-100 for "Daily",
    50-79 for "3-5 times a week" and 20-49 for "Rarely". Any other value
    yields ``None``.
    """
    score_ranges = (
        ("Daily", (80, 100)),
        ("3-5 times a week", (50, 79)),
        ("Rarely", (20, 49)),
    )
    for label, (lowest, highest) in score_ranges:
        if value == label:
            return random.randint(lowest, highest)
    return None  # No score for unrecognised labels

# Replace each frequency label with its randomly drawn numeric score.
activity_labels = df["Physical Activity Level"]
df["Physical Activity Level"] = activity_labels.apply(assign_random_activity)

# Print the whole frame to inspect the converted column.
# NOTE(review): `df.head()` would give a shorter, richer preview - consider switching.
print(df)

    Age  Gender  Physical Activity Level  Stress Level  Sleep Duration  \
0    22    Male                       48             6             6.0   
1    24    Male                       85             6             6.0   
2    25    Male                       23             6             7.0   
3    22    Male                       60             6             6.0   
4    22  Female                       85             8             8.0   
5    31  Female                       68             4             6.0   
6    22  Female                       38            10             6.0   
7    37  Female                       82             2             7.0   
8    40    Male                       94             4             7.0   
9    22    Male                       44             4             7.0   
10   13    Male                       85             8             9.0   
11   23    Male                       57             6             5.0   
12   37  Female                       

In [45]:
# Preview the first five rows of the finished frame.
df.head(5)

Unnamed: 0,Age,Gender,Physical Activity Level,Stress Level,Sleep Duration,Blood Pressure,BMI Category,Quality of Sleep
0,22,Male,48,6,6.0,120/80,Normal,Average
1,24,Male,85,6,6.0,115/75,Normal,Average
2,25,Male,23,6,7.0,115/75,Normal,Good
3,22,Male,60,6,6.0,150/100,Overweight,Average
4,22,Female,85,8,8.0,130/85,Normal,Good


In [46]:
# Persist the refined frame for later use.
# NOTE(review): with default arguments the row index is written as an
# unnamed first column; pass index=False if no reader of this file
# relies on that column - confirm before changing.
refined_path = 'data_sets/refined_survey.csv'
df.to_csv(refined_path)