<a href="https://www.kaggle.com/code/i200605salehahmad/sleep-disorder-data-analysis-and-prediction?scriptVersionId=131127071" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

import lazypredict
from lazypredict.Supervised import LazyClassifier

In [None]:
# Load the Kaggle CSV, then discard 'Person ID', which is not needed for the analysis.
df = pd.read_csv(
    '/kaggle/input/sleep-health-and-lifestyle-dataset/Sleep_health_and_lifestyle_dataset.csv'
)
df = df.drop(columns='Person ID')
df

In [None]:
# Tally each label of the target column; dropna=False also counts missing entries.
target_column = df['Sleep Disorder']
target_column.value_counts(dropna=False)

In [None]:
# The dataset description lists three target labels, so missing entries are
# given the explicit label 'None'.
df['Sleep Disorder'] = df['Sleep Disorder'].fillna('None')
# Re-count the labels to confirm that no missing entries remain.
df['Sleep Disorder'].value_counts(dropna=False)

# Values above .60 in Sleep Duration look invalid: if a person slept for 3 hours and 90 minutes, it should have been recorded as 4 hours and 30 minutes. The next cells correct that.

In [None]:
# List the distinct sleep-duration values present in the column.
sleep_durations = df['Sleep Duration'].to_numpy()
np.unique(sleep_durations)

In [None]:
def _carry_sleep_minutes(duration):
    """Return *duration* with a minutes digit above 6 carried into the hours.

    The value is read as ``<hours>.<minutes digit>``, where one unit of the
    minutes digit stands for ten minutes (``.8`` -> 80 minutes). A digit
    above 6 is reduced by 6 and one hour is added, e.g. ``7.8`` -> ``8.2``.
    Values whose minutes digit is 6 or lower are returned unchanged.

    NOTE(review): this assumes the column encodes hours.minutes rather than
    decimal hours -- confirm against the dataset description.
    """
    hours_text, _, minutes_text = str(duration).partition('.')
    hours, minutes = int(hours_text), int(minutes_text or 0)
    if minutes <= 6:
        return duration
    # Rebuild the value from its digits instead of adding hours and minutes
    # arithmetically: the original `Hours + Minutes` turned 7.8 into 10, and
    # composing the text keeps a clean single decimal for the later
    # string-based split of this column.
    return float(f'{hours + 1}.{minutes - 6}')


Updated_Sleep_Duration = [_carry_sleep_minutes(val) for val in df['Sleep Duration']]

df['Sleep Duration'] = Updated_Sleep_Duration
df

In [None]:
# Break each duration into its whole-hours part and its minutes part.
Sleep_Duration = df['Sleep Duration'].values.astype(str)
duration_parts = [str(text).split('.') for text in Sleep_Duration]
Sleep_Hours = [int(parts[0]) for parts in duration_parts]
# One unit of the decimal digit stands for ten minutes, hence the factor 10.
Sleep_Minutes = [int(parts[1]) * 10 for parts in duration_parts]

# Replace the combined column with the two new ones at positions 3 and 4.
df = df.drop(columns='Sleep Duration')
df.insert(3, 'Sleep Hours', Sleep_Hours)
df.insert(4, 'Sleep Minutes', Sleep_Minutes)
df

In [None]:
# List the distinct minute values produced by the split above.
minute_values = np.asarray(Sleep_Minutes)
np.unique(minute_values)

# Split Blood pressure into systolic and diastolic pressures

In [None]:
# Each reading is text of the form '<systolic>/<diastolic>'; split it once
# and convert both halves to integers.
Blood_Pressure = df['Blood Pressure'].values.astype(str)
pressure_pairs = [str(reading).split('/') for reading in Blood_Pressure]
Systolic = [int(pair[0]) for pair in pressure_pairs]
Diastolic = [int(pair[1]) for pair in pressure_pairs]

# Replace the combined column with the two numeric ones at positions 7 and 8.
df = df.drop(columns='Blood Pressure')
df.insert(7, 'Systolic', Systolic)
df.insert(8, 'Diastolic', Diastolic)
df

# Normalize all Numerical Columns

In [None]:
# Every column whose dtype is not 'object' is treated as numerical.
non_object_frame = df.select_dtypes(exclude='object')
All_Numerical_Columns = non_object_frame.columns
All_Numerical_Columns

In [None]:
# Rescale every numerical column into the range [0.1, 1.1].
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split done later in the notebook, so test rows influence the scaling.
scaler = MinMaxScaler(feature_range=(0.1, 1.1))
scaled_values = scaler.fit_transform(df[All_Numerical_Columns])
df[All_Numerical_Columns] = scaled_values
df

# Label Encoding all Categorical Columns

In [None]:
# Collect the object-dtype feature columns, leaving out the target column.
# A filtered list keeps the frame's column order; the previous set difference
# could list the columns in a different order from one run to the next.
All_Categorical_Columns = [
    column
    for column in df.select_dtypes(include=['object']).columns
    if column != 'Sleep Disorder'
]
All_Categorical_Columns

In [None]:
# Replace each categorical feature with integer codes.
# The single encoder is refitted per column, so after the loop `le` only
# holds the classes of the last column processed.
le = LabelEncoder()
for column_name in All_Categorical_Columns:
    encoded_values = le.fit_transform(df[column_name])
    df[column_name] = encoded_values
df

# Lazy Predict

In [None]:
# Separate the target from the features.
y = df['Sleep Disorder']
X = df.drop(columns='Sleep Disorder')

# Hold out 20% of the rows for testing; stratifying on y keeps the label
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit and score LazyPredict's bundled classifiers (29 per the original author's note).
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
models

# Best Performing Classifier

In [None]:
# Report the first row of the LazyPredict results table.
# NOTE(review): this treats row 0 as the best model, i.e. it assumes the table
# is returned sorted best-first -- confirm against LazyPredict's documentation.
Best_Classifier = models.index[0]
# Use .iloc for positional access: the table is presumably indexed by
# classifier name, and plain `[0]` on a label-indexed Series is deprecated
# in pandas 2.1.
Best_Accuracy = models['Accuracy'].iloc[0]
Best_F1 = models['F1 Score'].iloc[0]
print(f'Best Classifier: {Best_Classifier}\nBest Accuracy: {Best_Accuracy*100}\nBest F1 Score: {Best_F1*100}')