In [1]:
try:
    import xgboost
except ImportError:
    !pip install xgboost
    import xgboost

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [3]:
!pip install --upgrade xgboost



In [4]:
df = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')
df = df.drop(['Stress Level','Person ID','Occupation','Daily Steps'], axis=1)

df.head()

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,BMI Category,Blood Pressure,Heart Rate,Sleep Disorder
0,Male,27,6.1,6,42,Overweight,126/83,77,
1,Male,28,6.2,6,60,Normal,125/80,75,
2,Male,28,6.2,6,60,Normal,125/80,75,
3,Male,28,5.9,4,30,Obese,140/90,85,Sleep Apnea
4,Male,28,5.9,4,30,Obese,140/90,85,Sleep Apnea


In [5]:
print(df.isnull().sum())

# Check data types
print(df.dtypes)

# List unique values for categoricals
for col in ['Gender', 'BMI Category', 'Sleep Disorder']:
    print(f'{col}:', df[col].unique())


Gender                     0
Age                        0
Sleep Duration             0
Quality of Sleep           0
Physical Activity Level    0
BMI Category               0
Blood Pressure             0
Heart Rate                 0
Sleep Disorder             0
dtype: int64
Gender                      object
Age                          int64
Sleep Duration             float64
Quality of Sleep             int64
Physical Activity Level      int64
BMI Category                object
Blood Pressure              object
Heart Rate                   int64
Sleep Disorder              object
dtype: object
Gender: ['Male' 'Female']
BMI Category: ['Overweight' 'Normal' 'Obese' 'Normal Weight']
Sleep Disorder: ['None' 'Sleep Apnea' 'Insomnia']


In [6]:
df[['systolic_bp', 'diastolic_bp']] = df['Blood Pressure'].str.split('/', expand=True).astype(float)
df = df.drop('Blood Pressure', axis=1)

In [7]:
X = df.drop('Sleep Disorder', axis=1)
y = df['Sleep Disorder']

# Identify categorical and numerical columns
categorical_cols = ['Gender', 'BMI Category']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing for numerical data
from sklearn.impute import SimpleImputer
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Encode target if not numeric
if y.dtype == 'O':
    le = LabelEncoder()
    y = le.fit_transform(y)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [9]:
# Random Forest Pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# XGBoost Pipeline
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42))
])

# Train both models
rf_pipeline.fit(X_train, y_train)
xgb_pipeline.fit(X_train, y_train)

# Evaluate
rf_pred = rf_pipeline.predict(X_test)
xgb_pred = xgb_pipeline.predict(X_test)

rf_acc = accuracy_score(y_test, rf_pred)
xgb_acc = accuracy_score(y_test, xgb_pred)

print(f'Random Forest Accuracy: {rf_acc:.4f}')
print(f'XGBoost Accuracy: {xgb_acc:.4f}')

# Select best
if rf_acc >= xgb_acc:
    best_model = rf_pipeline
    best_name = 'Random Forest'
    best_pred = rf_pred
else:
    best_model = xgb_pipeline
    best_name = 'XGBoost'
    best_pred = xgb_pred

print(f'\nBest Model: {best_name}')


Random Forest Accuracy: 0.9733
XGBoost Accuracy: 0.9733

Best Model: Random Forest


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [10]:
print('Accuracy:', accuracy_score(y_test, best_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, best_pred))
print('Classification Report:\n', classification_report(y_test, best_pred))


Accuracy: 0.9733333333333334
Confusion Matrix:
 [[14  0  1]
 [ 0 44  0]
 [ 1  0 15]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        15
           1       1.00      1.00      1.00        44
           2       0.94      0.94      0.94        16

    accuracy                           0.97        75
   macro avg       0.96      0.96      0.96        75
weighted avg       0.97      0.97      0.97        75



In [11]:
# Save the best model pipeline
joblib.dump(best_model, 'sleep_disorder_pred_v2.pkl')
print('Model saved as sleep_disorder_pred_v2.pkl')

# Save the label encoder if used
if 'le' in locals():
    joblib.dump(le, 'sleep_disorder_label_encoder.pkl')
    print('Label encoder saved as sleep_disorder_label_encoder.pkl')


Model saved as sleep_disorder_pred_v2.pkl
Label encoder saved as sleep_disorder_label_encoder.pkl


In [12]:
!pip show scikit-learn

Name: scikit-learn
Version: 1.6.0
Summary: A set of python modules for machine learning and data mining
Home-page: https://scikit-learn.org
Author: 
Author-email: 
License: BSD 3-Clause License
         
         Copyright (c) 2007-2024 The scikit-learn developers.
         All rights reserved.
         
         Redistribution and use in source and binary forms, with or without
         modification, are permitted provided that the following conditions are met:
         
         * Redistributions of source code must retain the above copyright notice, this
           list of conditions and the following disclaimer.
         
         * Redistributions in binary form must reproduce the above copyright notice,
           this list of conditions and the following disclaimer in the documentation
           and/or other materials provided with the distribution.
         
         * Neither the name of the copyright holder nor the names of its
           contributors may be used to endorse 