In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

"""Shows data before it is changed"""
# Load the CSV from its hard-coded local path, then print the complete,
# untruncated frame so the data can be inspected before any transformation.
csv_path = "C:\\Users\\alext\\Downloads\\Sleep_health_and_lifestyle_dataset_no_duplicates.csv"
data = pd.read_csv(csv_path)
print(data.to_string())

In [None]:
"""BMI category ordinal encoding"""
# Ordinal codes for the BMI labels; 'Normal' and 'Normal Weight' share code 1.
weight_mapping = {
    'Underweight': 0,
    'Normal': 1,
    'Normal Weight': 1,
    'Overweight': 2,
    'Obese': 3,
}

# Translate every BMI label through the mapping; labels that are not keys of
# the mapping come out of Series.map as NaN.
bmi_labels = data['BMI Category']
data['Weight_Category_BMI'] = bmi_labels.map(weight_mapping)




"""Gruppering af steps"""
# Ordinal codes for the three step bands: 0 = 'bad', 1 = 'middle', 2 = 'good'.
step_labels = [0, 1, 2]

# pd.cut builds right-closed intervals by default, so these edges produce
# (-1, 4999], (4999, 7999] and (7999, inf].
step_edges = [-1, 4999, 7999, float('inf')]
data['Step Category Ordinal'] = pd.cut(data['Daily Steps'], bins=step_edges, labels=step_labels)


"""This line of code changes blood pressure. It changes blood pressure into systolic and diastolic then goes though thredsholds to categorize them"""
# Split each blood-pressure string on '/' into two parts, then store the parts
# as the float columns 'Systolic' and 'Diastolic'.
pressure_parts = data['Blood Pressure'].str.split('/', expand=True)
data[['Systolic', 'Diastolic']] = pressure_parts.astype(float)

# Define a function to categorize blood pressure
def categorize_blood_pressure(row):
    """Return the ordinal blood-pressure grade for one row.

    The grade is the lowest band that contains BOTH readings, i.e. the worse
    of the two readings decides the result:

        0  normal                 systolic <= 140 and diastolic <= 90
        1  mild hypertension      systolic <= 160 and diastolic <= 100
        2  moderate hypertension  systolic <= 175 and diastolic <= 110
        3  serious                systolic >  175 or  diastolic >  110

    The earlier version also required a minimum systolic value for grades 1
    and 2, which left gaps: readings such as 150/85 or 130/95 matched no
    branch and came back as the string 'Uncategorized'. Every reading that
    the earlier version graded numerically receives the same grade here.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) that provides numeric
            'Systolic' and 'Diastolic' entries.

    Returns:
        An int grade from 0 to 3, or the string 'Uncategorized' when either
        reading is missing (NaN/None).
    """
    systolic = row['Systolic']
    diastolic = row['Diastolic']

    # A missing reading cannot be graded; keep the original fallback marker
    # instead of letting the failed NaN comparisons fall through to grade 3.
    if pd.isna(systolic) or pd.isna(diastolic):
        return 'Uncategorized'

    # (systolic upper bound, diastolic upper bound) for grades 0, 1 and 2.
    upper_bounds = ((140, 90), (160, 100), (175, 110))
    for grade, (max_systolic, max_diastolic) in enumerate(upper_bounds):
        if systolic <= max_systolic and diastolic <= max_diastolic:
            return grade

    # At least one reading exceeds the moderate band.
    return 3

# Run the classifier once per row (axis=1 hands each row to the function) and
# keep the resulting grades as a new column.
blood_pressure_grades = data.apply(categorize_blood_pressure, axis=1)
data['Blood_Pressure_Category'] = blood_pressure_grades



"""Replaces Gender: Female = 0 male = 1"""
# Swap the gender labels for numeric codes: Female -> 0, Male -> 1.
gender_codes = {'Female': 0, 'Male': 1}
data['Gender'] = data['Gender'].replace(gender_codes)


"""Physical_Activity ordinal group: Changes numbers into groups from 0-49 = bad/0, 50-69 = middle/1, 70+ = good/2"""
# Band edges for physical activity: 0-49 -> 0 (bad), 50-69 -> 1 (middle),
# 70 and above -> 2 (good).
activity_bins = [0, 50, 70, float('inf')]
activity_labels = [0, 1, 2]  # Ordinal encoding

# right=False makes every interval closed on the left: [0, 50), [50, 70) and
# [70, inf). With pd.cut's default right-closed intervals a level of exactly
# 50 or 70 landed in the lower band and a level of 0 fell outside every bin
# (NaN), which contradicted the bands stated above.
data['Physical_Activity_Ordinal'] = pd.cut(
    data['Physical Activity Level'],
    bins=activity_bins,
    labels=activity_labels,
    right=False,
)

# One indicator column per activity band, named with the 'Physical_Activity' prefix.
# NOTE(review): this frame is not joined back onto `data` in the visible code;
# confirm that it is consumed elsewhere.
physical_activity_one_hot = pd.get_dummies(data['Physical_Activity_Ordinal'], prefix='Physical_Activity')


"""Binary stress 0 = not stressed 1 = stressed"""
# Binary stress flag: a level from 6 through 10 becomes 1, any other value becomes 0.
stress_levels = data['Stress Level']
data['Stress_Binary'] = stress_levels.apply(lambda level: int(6 <= level <= 10))





"""Remove columns which are not important"""
# Drop the ten columns listed below from the frame; all remaining columns are kept.
data = data.drop(
    [
        "Occupation",
        "Sleep Disorder",
        "Blood Pressure",
        "BMI Category",
        "Physical Activity Level",
        "Heart Rate",
        "Daily Steps",
        "Systolic",
        "Diastolic",
        "Stress Level",
    ],
    axis=1,
)

# Show the transformed frame through IPython's display helper.
from IPython.display import display
display(data)