In [30]:
import pandas as pd


data = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')
sleep_disorder_mapping = {label: idx for idx, label in enumerate(data['Sleep Disorder'].unique())}
data['Sleep Disorder'] = data['Sleep Disorder'].map(sleep_disorder_mapping)

# Save Sleep Disorder as a map
# Can be used after prediction
mapping_df = pd.DataFrame(list(sleep_disorder_mapping.items()), columns=['Class Name', 'Index'])
mapping_df.to_csv('sleep_disorder_mapping.csv', index=False)


#print(data['Sleep Disorder'].head(10))
#print("Mapping of Sleep Disorder classes to numbers:", sleep_disorder_mapping)
#print(data['Sleep Disorder'])

In [31]:
# Split x, y data
y_data = data[['Sleep Disorder']]
x_data = data.drop(columns=['Sleep Disorder'])

# Correct x_data
x_data['BMI Category'] = x_data['BMI Category'].replace('Normal Weight', 'Normal')

#print(x_data)

In [32]:
columns_to_drop = [
    'Person ID'
]
x_data = x_data.drop(columns=columns_to_drop)

In [33]:
# Split the 'Blood Pressure (systolic/diastolic)' column into two new columns
x_data[['Blood Pressure systolic', 'Blood Pressure diastolic']] = x_data['Blood Pressure'].str.split('/', expand=True)

x_data['Blood Pressure systolic'] = pd.to_numeric(x_data['Blood Pressure systolic'])
x_data['Blood Pressure diastolic'] = pd.to_numeric(x_data['Blood Pressure diastolic'])

x_data = x_data.drop(columns=['Blood Pressure'])

print("First few rows of x_data after splitting 'Blood Pressure':")
print(x_data.head())


First few rows of x_data after splitting 'Blood Pressure':
  Gender  Age            Occupation  Sleep Duration  Quality of Sleep  \
0   Male   27     Software Engineer             6.1                 6   
1   Male   28                Doctor             6.2                 6   
2   Male   28                Doctor             6.2                 6   
3   Male   28  Sales Representative             5.9                 4   
4   Male   28  Sales Representative             5.9                 4   

   Physical Activity Level  Stress Level BMI Category  Heart Rate  \
0                       42             6   Overweight          77   
1                       60             8       Normal          75   
2                       60             8       Normal          75   
3                       30             8        Obese          85   
4                       30             8        Obese          85   

   Daily Steps  Blood Pressure systolic  Blood Pressure diastolic  
0         4200     

In [34]:
# Split training and testing data
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2, random_state=42)


# One-hot encode categorical features in X_train and X_val

X_train = pd.get_dummies(X_train, drop_first=True)
X_val = pd.get_dummies(X_val, drop_first=True)


X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
# # print("First few rows of X_train after one-hot encoding:")
# print(X_train.head())
# # print("\nFirst few rows of X_val after one-hot encoding:")
# print(X_val.head())

In [35]:
# Do Scaling
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()

X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

# Display the first few rows to confirm scaling
# print("First few rows of X_train after scaling:")
# print(X_train.head())
# print("\nFirst few rows of X_val after scaling:")
# print(X_val.head())

In [36]:
from imblearn.over_sampling import SMOTE

print("Class distribution before resampling:")
print(y_train.value_counts())
print("\n")

# Oversampling using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Class distribution after resampling:")
print(y_train_resampled.value_counts())

Class distribution before resampling:
Sleep Disorder
0                 176
1                  62
2                  61
Name: count, dtype: int64


Class distribution after resampling:
Sleep Disorder
0                 176
1                 176
2                 176
Name: count, dtype: int64


In [37]:
# print(X_train)
# print(X_val)

# Train --EN

In [38]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score, accuracy_score, classification_report, roc_auc_score, roc_curve, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_validate
from xgboost import XGBClassifier

train_X = X_train_resampled
train_y = y_train_resampled['Sleep Disorder']
val_X = X_val
val_y = y_val['Sleep Disorder']

scoring_metrics = {
    'accuracy': 'accuracy',
    'f1': make_scorer(f1_score, average='weighted'),  
    'roc_auc': 'roc_auc_ovr'  
}

def print_val_score(model_name, cv_results):
    print(f"[ {model_name} ]")
    print("Average F1 Score:  ", round(cv_results['test_f1'].mean(), 4))
    print("Average Accuracy:  ", round(cv_results['test_accuracy'].mean(), 4))
    print("Average AUROC:     ", round(cv_results['test_roc_auc'].mean(), 4))

def print_performance(model_name, predictions, val_y):
    print(f"[ {model_name} ]")
    score_f1 = f1_score(val_y, predictions, average='weighted')
    score_acc = accuracy_score(val_y, predictions)
    score_precision = precision_score(y_val, predictions, average='weighted')
    score_recall = recall_score(y_val, predictions, average='weighted')
    print("Average F1 Score:  ", round(score_f1, 4))
    print("Average Accuracy:  ", round(score_acc, 4))
    print("Average Precision: ", round(score_precision, 4))
    print("Average Recall:    ", round(score_recall, 4))
    



### Cross Validation

In [39]:
model_name = "Random Forest"
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    class_weight='balanced',
    random_state=42)

rf_scores = cross_validate(estimator=rf_model, X=train_X, y=train_y, cv=5, scoring=scoring_metrics)
print_val_score(model_name, rf_scores)

[ Random Forest ]
Average F1 Score:   0.9069
Average Accuracy:   0.9071
Average AUROC:      0.9641


In [40]:
model_name = 'XGBoost'
xgboost_model = XGBClassifier(
    n_estimators=200, 
    learning_rate=0.1, 
    max_depth=6, 
    random_state=42, 
    verbosity=2)
xgboost_scores = cross_validate(estimator=xgboost_model, X=train_X, y=train_y, cv=5, scoring=scoring_metrics)
print_val_score(model_name, xgboost_scores)

[ XGBoost ]
Average F1 Score:   0.9221
Average Accuracy:   0.9223
Average AUROC:      0.9604


In [41]:
model_name = 'SVM'
svm_model = SVC(
    kernel='linear',
    random_state=42,
    probability=True)
svm_scores = cross_validate(estimator=svm_model, X=train_X, y=train_y, cv=5, scoring=scoring_metrics)
print_val_score(model_name, svm_scores)

[ SVM ]
Average F1 Score:   0.9058
Average Accuracy:   0.9053
Average AUROC:      0.9598


### Train & Test

In [42]:
best_model = rf_model 
best_model.fit(train_X, train_y)
test_predictions = best_model.predict(val_X)
print_performance(str(best_model), test_predictions, val_y)
print("\n\n")
best_model = xgboost_model 
best_model.fit(train_X, train_y)
test_predictions = best_model.predict(val_X)
print_performance(str(best_model), test_predictions, val_y)
print("\n\n")
best_model = svm_model 
best_model.fit(train_X, train_y)
test_predictions = best_model.predict(val_X)
print_performance(str(best_model), test_predictions, val_y)


[ RandomForestClassifier(class_weight='balanced', max_depth=15, random_state=42) ]
Average F1 Score:   0.9058
Average Accuracy:   0.9067
Average Precision:  0.9055
Average Recall:     0.9067



[ XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...) ]
Average F1 Score:   0.9058
Average Accur