<a href="https://www.kaggle.com/code/i200605salehahmad/sleep-disorder-data-analysis-and-prediction?scriptVersionId=131127956" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.12
[0m

In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

import lazypredict
from lazypredict.Supervised import LazyClassifier

In [3]:
df = pd.read_csv('/kaggle/input/sleep-health-and-lifestyle-dataset/Sleep_health_and_lifestyle_dataset.csv').drop('Person ID',axis=1) # drop Person ID column as it is not needed
df

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,Male,27,Software Engineer,6.10,6,42,6,Overweight,126/83,77,4200,
1,Male,28,Doctor,6.20,6,60,8,Normal,125/80,75,10000,
2,Male,28,Doctor,6.20,6,60,8,Normal,125/80,75,10000,
3,Male,28,Sales Representative,5.90,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,Male,28,Sales Representative,5.90,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,59,Nurse,8.10,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,Female,59,Nurse,8.00,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,Female,59,Nurse,8.10,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,Female,59,Nurse,8.10,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [4]:
df['Sleep Disorder'].value_counts(dropna=False) # check for missing values

None           219
Sleep Apnea     78
Insomnia        77
Name: Sleep Disorder, dtype: int64

In [5]:
df['Sleep Disorder'] = df['Sleep Disorder'].replace(np.nan,'None') # There are three labels in the target column as per the description, so we replace the missing values with 'None'
df['Sleep Disorder'].value_counts(dropna=False)

None           219
Sleep Apnea     78
Insomnia        77
Name: Sleep Disorder, dtype: int64

# Values > .60 in Sleep duration seem invalid because if a person slept for 3 hours and 90 minutes, it could have been written as 4 hours and 30 minutes. So correcting that

In [6]:
np.unique(df['Sleep Duration'])

array([5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.1,
       7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8. , 8.1, 8.2, 8.3, 8.4,
       8.5])

In [7]:
Updated_Sleep_Duration = []
for idx,val in enumerate(df['Sleep Duration']):
    Updated_Duration_Temp = val
    Hours,Minutes = int(str(val).split('.')[0]),int(str(val).split('.')[1]) # split the hours and minutes
    if Minutes > 6: #let 10 = 1 and so on
        Minutes = Minutes - 6 # subtract 6 minutes from the minutes column
        Hours = Hours + 1 # add 1 hour to the hours column
        Updated_Duration_Temp = Hours + Minutes
    Updated_Sleep_Duration.append(Updated_Duration_Temp)

df['Sleep Duration'] = Updated_Sleep_Duration
df

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,Male,27,Software Engineer,6.10,6,42,6,Overweight,126/83,77,4200,
1,Male,28,Doctor,6.20,6,60,8,Normal,125/80,75,10000,
2,Male,28,Doctor,6.20,6,60,8,Normal,125/80,75,10000,
3,Male,28,Sales Representative,9.00,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,Male,28,Sales Representative,9.00,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,59,Nurse,8.10,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,Female,59,Nurse,8.00,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,Female,59,Nurse,8.10,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,Female,59,Nurse,8.10,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [8]:
Sleep_Duration = df['Sleep Duration'].values.astype(str)
Sleep_Hours, Sleep_Minutes = [], []
for idx,val in enumerate(Sleep_Duration):
    Hours,Minutes = int(str(val).split('.')[0]),int(str(val).split('.')[1])
    Sleep_Hours.append(Hours)
    Sleep_Minutes.append(Minutes*10) #since 1 = 10 so, converting 1 to 10 and so on

df = df.drop('Sleep Duration',axis=1)
df.insert(3,'Sleep Hours',Sleep_Hours)
df.insert(4,'Sleep Minutes',Sleep_Minutes)
df

Unnamed: 0,Gender,Age,Occupation,Sleep Hours,Sleep Minutes,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,Male,27,Software Engineer,6,10,6,42,6,Overweight,126/83,77,4200,
1,Male,28,Doctor,6,20,6,60,8,Normal,125/80,75,10000,
2,Male,28,Doctor,6,20,6,60,8,Normal,125/80,75,10000,
3,Male,28,Sales Representative,9,0,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,Male,28,Sales Representative,9,0,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,59,Nurse,8,10,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,Female,59,Nurse,8,0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,Female,59,Nurse,8,10,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,Female,59,Nurse,8,10,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [9]:
np.unique(Sleep_Minutes)

array([ 0, 10, 20, 30, 40, 50, 60])

# Split Blood pressure into systolic and diastolic pressures

In [10]:
Blood_Pressure = df['Blood Pressure'].values.astype(str)
Systolic, Diastolic = [], []
for idx,val in enumerate(Blood_Pressure): # split the systolic and diastolic values
    Systolic.append(int(str(val).split('/')[0])) 
    Diastolic.append(int(str(val).split('/')[1])) 

df = df.drop('Blood Pressure',axis=1)
df.insert(7,'Systolic',Systolic)
df.insert(8,'Diastolic',Diastolic)
df

Unnamed: 0,Gender,Age,Occupation,Sleep Hours,Sleep Minutes,Quality of Sleep,Physical Activity Level,Systolic,Diastolic,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder
0,Male,27,Software Engineer,6,10,6,42,126,83,6,Overweight,77,4200,
1,Male,28,Doctor,6,20,6,60,125,80,8,Normal,75,10000,
2,Male,28,Doctor,6,20,6,60,125,80,8,Normal,75,10000,
3,Male,28,Sales Representative,9,0,4,30,140,90,8,Obese,85,3000,Sleep Apnea
4,Male,28,Sales Representative,9,0,4,30,140,90,8,Obese,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,59,Nurse,8,10,9,75,140,95,3,Overweight,68,7000,Sleep Apnea
370,Female,59,Nurse,8,0,9,75,140,95,3,Overweight,68,7000,Sleep Apnea
371,Female,59,Nurse,8,10,9,75,140,95,3,Overweight,68,7000,Sleep Apnea
372,Female,59,Nurse,8,10,9,75,140,95,3,Overweight,68,7000,Sleep Apnea


# Normalize all Numerical Columns

In [11]:
All_Numerical_Columns = df.select_dtypes(exclude=['object']).columns
All_Numerical_Columns

Index(['Age', 'Sleep Hours', 'Sleep Minutes', 'Quality of Sleep',
       'Physical Activity Level', 'Systolic', 'Diastolic', 'Stress Level',
       'Heart Rate', 'Daily Steps'],
      dtype='object')

In [12]:
# Normalize
scaler = MinMaxScaler((0.1,1.1)) # scale the values between 0.1 and 1.1
df[All_Numerical_Columns] = scaler.fit_transform(df[All_Numerical_Columns])
df

Unnamed: 0,Gender,Age,Occupation,Sleep Hours,Sleep Minutes,Quality of Sleep,Physical Activity Level,Systolic,Diastolic,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder
0,Male,0.10,Software Engineer,0.10,0.27,0.50,0.30,0.51,0.50,0.70,Overweight,0.67,0.27,
1,Male,0.13,Doctor,0.10,0.43,0.50,0.60,0.47,0.35,1.10,Normal,0.58,1.10,
2,Male,0.13,Doctor,0.10,0.43,0.50,0.60,0.47,0.35,1.10,Normal,0.58,1.10,
3,Male,0.13,Sales Representative,0.70,0.10,0.10,0.10,1.03,0.85,1.10,Obese,1.05,0.10,Sleep Apnea
4,Male,0.13,Sales Representative,0.70,0.10,0.10,0.10,1.03,0.85,1.10,Obese,1.05,0.10,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,1.10,Nurse,0.50,0.27,1.10,0.85,1.03,1.10,0.10,Overweight,0.24,0.67,Sleep Apnea
370,Female,1.10,Nurse,0.50,0.10,1.10,0.85,1.03,1.10,0.10,Overweight,0.24,0.67,Sleep Apnea
371,Female,1.10,Nurse,0.50,0.27,1.10,0.85,1.03,1.10,0.10,Overweight,0.24,0.67,Sleep Apnea
372,Female,1.10,Nurse,0.50,0.27,1.10,0.85,1.03,1.10,0.10,Overweight,0.24,0.67,Sleep Apnea


# Label Encoding all Categorical Columns

In [13]:
All_Categorical_Columns = list(set(df.select_dtypes(include=['object']).columns) - set(['Sleep Disorder']))
All_Categorical_Columns

['Gender', 'Occupation', 'BMI Category']

In [14]:
#Label Encode all
le = LabelEncoder()
for col in All_Categorical_Columns:
    df[col] = le.fit_transform(df[col])
df

Unnamed: 0,Gender,Age,Occupation,Sleep Hours,Sleep Minutes,Quality of Sleep,Physical Activity Level,Systolic,Diastolic,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder
0,1,0.10,9,0.10,0.27,0.50,0.30,0.51,0.50,0.70,3,0.67,0.27,
1,1,0.13,1,0.10,0.43,0.50,0.60,0.47,0.35,1.10,0,0.58,1.10,
2,1,0.13,1,0.10,0.43,0.50,0.60,0.47,0.35,1.10,0,0.58,1.10,
3,1,0.13,6,0.70,0.10,0.10,0.10,1.03,0.85,1.10,2,1.05,0.10,Sleep Apnea
4,1,0.13,6,0.70,0.10,0.10,0.10,1.03,0.85,1.10,2,1.05,0.10,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,0,1.10,5,0.50,0.27,1.10,0.85,1.03,1.10,0.10,3,0.24,0.67,Sleep Apnea
370,0,1.10,5,0.50,0.10,1.10,0.85,1.03,1.10,0.10,3,0.24,0.67,Sleep Apnea
371,0,1.10,5,0.50,0.27,1.10,0.85,1.03,1.10,0.10,3,0.24,0.67,Sleep Apnea
372,0,1.10,5,0.50,0.27,1.10,0.85,1.03,1.10,0.10,3,0.24,0.67,Sleep Apnea


# Lazy Predict

In [15]:
X = df.drop('Sleep Disorder',axis=1)
y = df['Sleep Disorder']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None) #Predictions by 29 classifiers
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models

100%|██████████| 29/29 [00:02<00:00, 12.19it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNeighborsClassifier,0.99,0.98,,0.99,0.02
QuadraticDiscriminantAnalysis,0.97,0.96,,0.97,0.03
SVC,0.97,0.96,,0.97,0.02
BernoulliNB,0.96,0.95,,0.96,0.03
LGBMClassifier,0.96,0.95,,0.96,0.76
RandomForestClassifier,0.96,0.95,,0.96,0.29
DecisionTreeClassifier,0.95,0.94,,0.95,0.02
GaussianNB,0.95,0.94,,0.95,0.02
BaggingClassifier,0.95,0.94,,0.95,0.05
ExtraTreeClassifier,0.95,0.93,,0.95,0.02


# Best Performing Classifier

In [16]:
Best_Classifier, Best_Accuracy, Best_F1 = models.index[0], models['Accuracy'][0], models['F1 Score'][0]
print(f'Best Classifier: {Best_Classifier}\nBest Accuracy: {Best_Accuracy*100}\nBest F1 Score: {Best_F1*100}')

Best Classifier: KNeighborsClassifier
Best Accuracy: 98.66666666666667
Best F1 Score: 98.66666666666667
