## Predicting Stress and Sleep Disorders Using Health and Lifestyle Data



## Team
- Bhogaraju Shanmukha Sri Krishna
- Bhupati Varun
- Vishnu Shreeram M P

In [1]:
import sys
import os

# Add project root (one level up from notebooks/) to the import path so that
# the `src` package can be imported. The membership check keeps the cell
# idempotent: re-running it does not append duplicate entries to sys.path.
_project_root = os.path.abspath("..")
if _project_root not in sys.path:
    sys.path.append(_project_root)

# Loading Dataset
- To load the dataset, we can use the `read_csv()` function from the pandas library :

```python []
df_part1 = pd.read_csv('./Sleep_health_and_lifestyle_dataset.csv')
df_part2 = pd.read_csv('./Sleep_health_and_lifestyle_dataset_part_2.csv')
```
- These two lines load the data from the CSV files `Sleep_health_and_lifestyle_dataset.csv` and `Sleep_health_and_lifestyle_dataset_part_2.csv` and store them in the variables `df_part1` and `df_part2` respectively.
- In this project the logic is wrapped in the `load_data` function, which the next cell imports from `src.pre_process`.
- `load_data` then merges the two DataFrames into a single DataFrame and returns it.


In [2]:
from src.pre_process import load_data

# The dataset ships as two CSV parts under ../data; load_data takes both paths
# and (per the notes above) returns them merged into one DataFrame.
path1, path2 = (
    '../data/Sleep_health_and_lifestyle_dataset.csv',
    '../data/Sleep_health_and_lifestyle_dataset_part_2.csv',
)

df = load_data(path1, path2)
df.head()  # preview the first rows only

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


## Exploring the data set

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 560 entries, 0 to 185
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                560 non-null    int64  
 1   Gender                   560 non-null    object 
 2   Age                      560 non-null    int64  
 3   Occupation               560 non-null    object 
 4   Sleep Duration           560 non-null    float64
 5   Quality of Sleep         560 non-null    int64  
 6   Physical Activity Level  560 non-null    int64  
 7   Stress Level             560 non-null    int64  
 8   BMI Category             560 non-null    object 
 9   Blood Pressure           560 non-null    object 
 10  Heart Rate               560 non-null    int64  
 11  Daily Steps              560 non-null    int64  
 12  Sleep Disorder           185 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 61.2+ KB


In [4]:
df.describe()

Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps
count,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0
mean,280.167857,40.021429,7.115893,7.275,58.7875,5.458929,70.178571,6821.071429
std,161.418024,8.132153,0.742655,1.134631,19.955394,1.676447,3.873247,1527.465518
min,1.0,27.0,5.8,4.0,30.0,3.0,65.0,3000.0
25%,140.75,33.0,6.5,6.0,45.0,4.0,68.0,5500.0
50%,280.5,38.0,7.2,7.0,60.0,5.0,70.0,7000.0
75%,419.25,44.0,7.7,8.0,75.0,7.0,72.0,8000.0
max,559.0,59.0,8.5,9.0,90.0,8.0,86.0,10000.0


# Dropping Irrelevant Features
- Here, the `Person ID` feature is irrelevant for model training, so it is dropped.
- Note that `drop` is not in-place: it returns a new DataFrame, which is assigned back to `df`.

In [5]:
# drop() returns a new frame without the identifier column; rebind df to it
df = df.drop(columns=['Person ID'])

# Handling NA values  

In [6]:
df.isnull().sum()

Gender                       0
Age                          0
Occupation                   0
Sleep Duration               0
Quality of Sleep             0
Physical Activity Level      0
Stress Level                 0
BMI Category                 0
Blood Pressure               0
Heart Rate                   0
Daily Steps                  0
Sleep Disorder             375
dtype: int64

- NA values in 'Sleep Disorder' indicate that the person does not have any sleep disorder.
- So instead of treating them as missing values, we assign the string 'None' to them.

In [7]:
# A missing 'Sleep Disorder' is treated as "no disorder", so store the explicit
# label 'None' (a string, not Python's None) in place of NaN.
df['Sleep Disorder'] = df['Sleep Disorder'].fillna('None')

In [8]:
df.isnull().sum()

Gender                     0
Age                        0
Occupation                 0
Sleep Duration             0
Quality of Sleep           0
Physical Activity Level    0
Stress Level               0
BMI Category               0
Blood Pressure             0
Heart Rate                 0
Daily Steps                0
Sleep Disorder             0
dtype: int64

In [9]:
df

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...
181,Female,43,Teacher,6.7,7,45,4,Overweight,135/90,65,6000,Insomnia
182,Male,43,Salesperson,6.5,6,45,7,Overweight,130/85,72,6000,Insomnia
183,Female,43,Teacher,6.7,7,45,4,Overweight,135/90,65,6000,Insomnia
184,Male,43,Salesperson,6.4,6,45,7,Overweight,130/85,72,6000,Insomnia


## Handling the 'Blood Pressure' column 
- Blood pressure has a systolic and a diastolic pressure in (high/low) format
- Hence, splitting it into Diastolic Pressure and Systolic Pressure

In [None]:
from src.pre_process import handle_bp

# Split the combined 'Blood Pressure' column (systolic/diastolic) via handle_bp.
# NOTE(review): df is rebound to the result, so re-running this cell passes the
# already-processed frame back into handle_bp - confirm it tolerates that.
df = handle_bp(df)
df

ModuleNotFoundError: No module named 'pre_process'

## Using LabelEncoders to Encode the Categorical Data
- We also store a dictionary of the fitted encoders so that unseen data can later be transformed into the same format

In [None]:
# Import through the `src` package, like the other cells: only the project root
# is on sys.path, so a bare `pre_process` import raises ModuleNotFoundError.
from src.pre_process import categorical_features, numeric_features, encode

In [None]:
catData = categorical_features(df)
print('categorical features are :',catData)

In [None]:
# Collect the numeric features of df and report them.
numData = numeric_features(df)
# Print numData here; the original printed catData (the categorical list) again.
print('numeric features are :',numData)

In [None]:
# encode() returns a sequence: item 0 is the encoded frame, item 1 the
# per-column encoder dictionary (later indexed by column name).
enc_res = encode(df)
df, label_encoding_dict = enc_res[0], enc_res[1]
df

# Distribution of target features

In [None]:
import matplotlib.pyplot as plt
from src.plotting import distr_tgt_feat

distr_tgt_feat(label_encoding_dict, df)

# Boxplot of Features

In [None]:
from src.plotting import boxplot

boxplot(df)

## Correlation analysis

In [None]:

from src.plotting import correlation_analysis

correlation_analysis(df)

## Analysis
The correlation heatmap provides insights into the relationships between different features in the dataset.
From the above heatmap, we can see that:
- Systolic and Diastolic Pressures are highly correlated (+ve)
- Quality of Sleep and Sleep Duration are correlated (+ve)
- Stress Level is correlated with Quality of Sleep / Sleep Duration (-ve)
- Daily Steps and Physical Activity Level are correlated (+ve)

# Standardisation of Data

In [None]:
# Standardize the data
# `standardise` is imported through the `src` package: only the project root is
# on sys.path, so a bare `pre_process` import raises ModuleNotFoundError.
from src.pre_process import standardise
from sklearn.preprocessing import StandardScaler  # used later in the precision-recall pipeline

df = standardise(df)
df

# Building Regression models for Stress level Prediction

In [None]:

# Regression setup: 'Stress Level' is the target, and both target columns
# ('Stress Level', 'Sleep Disorder') are removed from the feature matrix.
feat1 = df.drop(['Stress Level','Sleep Disorder'], axis=1)
tgt1 = df.loc[:, 'Stress Level']


In [None]:
from src.regression import get_training_scores

scoreList = get_training_scores(df)

In [None]:
from src.plotting import regression_results

regression_results(scoreList)

# From the above plot, we select the degree of polynomial features as 3

In [None]:
from src.regression import regress

regress(feat1, tgt1)

## Classification

In [None]:
# Classification setup: the label is 'Sleep Disorder'; the features exclude
# both target columns.
y = df['Sleep Disorder']
X = df.drop(['Stress Level','Sleep Disorder'], axis='columns')

# Building Models for Classification of Sleep Disorders

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from src.classification import classify_svc

classify_svc(X_train, y_train, X_test, y_test, kernel = 'linear')

# Using over sampling method SMOTE to balance the data

In [None]:
from collections import Counter

# Checking for imbalanced data
cc = Counter(y_train)
cc

In [None]:
# The class counts in the previous cell show that the training labels are imbalanced,
# so the training set is rebalanced with an imbalanced-learn style resampler before fitting.
# Using SMOTE: Synthetic Minority Oversampling Technique

from src.sampling import resample_smote

X_smote, y_smote = resample_smote(X_train, y_train)

In [None]:
cc = Counter(y_smote)
cc

## Using Undersampling Method

In [None]:
from src.sampling import resample_cc

X_clc, y_clc = resample_cc(X_train, y_train)

In [None]:
from src.sampling import resample_iht

# NOTE(review): this rebinds X_clc / y_clc, discarding the resample_cc result
# from the previous cell. Every later "under sampled" search therefore uses only
# the resample_iht output; use distinct names if both are meant to be compared.
X_clc, y_clc = resample_iht(X_train, y_train)

In [None]:
cc = Counter(y_clc)
cc

# Using Randomized search CV

## Initial Data

In [None]:
from scipy.stats import uniform
from src.hyperparameter_tuning import run_randomised_search_svc

# Search space sampled by the randomized search over SVC hyper-parameters.
gammaValues = ['scale', 'auto']
param_grid = dict(
    kernel=['linear', 'rbf', 'poly'],
    # scipy's uniform(loc, scale) draws from [loc, loc + scale]
    C=uniform(loc=1e-5, scale=10),
    degree=list(range(1, 20)),
    gamma=gammaValues,
    decision_function_shape=['ovo', 'ovr'],
)



In [None]:
rand_clf = run_randomised_search_svc(X_train, y_train, param_grid)

In [None]:
rand_clf.best_params_

In [None]:
# Printing the classification report for the randomized search hyper parameter tuned model
y_pred = rand_clf.predict(X_test)

print("The classification report after using randomized search CV\n", classification_report(y_test, y_pred))

In [None]:
# Plotting the confusion matrix for the randomized search hyper parameter tuned model
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Constructing ConfusionMatrixDisplay alone draws nothing (the cell would only
# show the object's repr); .plot() renders the figure. The trailing ';'
# suppresses the repr of the returned display object.
ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred)).plot();

# Randomized search CV for SMOTE

In [None]:
rand_clf = run_randomised_search_svc(X_smote, y_smote, param_grid)

In [None]:
rand_clf.best_params_

In [None]:
# Printing the classification report for the randomized search hyper parameter tuned model

y_pred = rand_clf.predict(X_test)

print("The classification report after using randomized search CV\n", classification_report(y_test, y_pred))

In [None]:
confusion_matrix(y_test, y_pred)

# RandomizedSearchCV for Under Sampled Data

In [None]:
rand_clf = run_randomised_search_svc(X_clc, y_clc, param_grid)

In [None]:
rand_clf.best_params_

In [None]:
y_pred = rand_clf.predict(X_test)

print("The classification report after using randomized search CV\n", classification_report(y_test, y_pred))

In [None]:
confusion_matrix(y_test, y_pred)

# Using GridSearchCV for hyper parameter tuning

In [None]:
from src.hyperparameter_tuning import run_grid_search_svc

# Exhaustive grid for the grid search: C sweeps powers of ten from 1e-5 to 10.
gammaValues = ['scale', 'auto']
cvals = [1e-5, 1e-4, 1e-3, 0.01, 0.1, 1, 10]
param_grid = dict(
    kernel=['linear', 'rbf', 'poly'],
    C=cvals,
    degree=list(range(1, 5)),
    gamma=gammaValues,
    decision_function_shape=['ovo', 'ovr'],
)

In [None]:
grid_clf = run_grid_search_svc(X_train, y_train, param_grid)

In [None]:
# Finding the best parameters for the gridsearch CV
grid_clf.best_params_

In [None]:
# Using the test data to make predictions on the input
y_pred = grid_clf.predict(X_test)
print('Classification Report after using grid search CV\n', classification_report(y_pred = y_pred, y_true = y_test))

In [None]:
print("The confusion matrix is as follows")
confusion_matrix(y_test, y_pred)

## Using SMOTE samples for GridSearchCV


In [None]:
grid_clf = run_grid_search_svc(X_smote, y_smote, param_grid)

In [None]:
grid_clf.best_params_

In [None]:
y_pred = grid_clf.predict(X_test)
print('Classification Report after using grid search CV on SMOTE data\n', classification_report(y_pred = y_pred, y_true = y_test))

In [None]:
confusion_matrix(y_pred = y_pred, y_true = y_test)

# Using GridSearchCV for Under Sampled data

In [None]:
grid_clf = run_grid_search_svc(X_clc, y_clc, param_grid)

In [None]:
grid_clf.best_params_

In [None]:
y_pred = grid_clf.predict(X_test)
# grid_clf was fitted on the under-sampled set (X_clc, y_clc) in this section,
# so the report is labelled accordingly (the original said "SMOTE data").
print('Classification Report after using grid search CV on under sampled data\n', classification_report(y_pred = y_pred, y_true = y_test))

In [None]:
confusion_matrix(y_true = y_test, y_pred = y_pred)

# Visualising clusters using Principal Component Analysis (PCA and Kernel PCA)

In [None]:
label_encoding_dict['Sleep Disorder'].classes_

In [None]:
from src.plotting import plot_pca, plot_variance_ratio
from src.decompose import do_pca

pca, df_full_pca = do_pca(df)

plot_pca(df_full_pca, df, label_encoding_dict)

## Visualising Explained Variance Ratio vs Number of Components, & finding the optimal number of components

In [None]:
plot_variance_ratio(pca)

## From here we can see that at Number of Components = 5, it has achieved 91% of cumulative variance ratio. So we would try the classification using the new 5 components

In [None]:
from src.decompose import get_new_components

new_pca, new_points = get_new_components(df)

In [None]:
from sklearn.model_selection import train_test_split
# NOTE(review): X is rebuilt here from the original feature columns, so the PCA
# output (`new_points`) computed in the previous cell is never used. The SVC
# below is therefore NOT trained on the 5 principal components that the heading
# describes - confirm the intent before relying on this result.
X = df.drop(columns=['Stress Level','Sleep Disorder'])
y = df['Sleep Disorder']


X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42)

# Hyper-parameters are hard-coded (presumably copied from an earlier search run - TODO confirm).
classify_svc(X_train, y_train, X_test, y_test, kernel='rbf',degree=4,C=8.430136277531556,gamma='auto',decision_function_shape='ovr')

# Using ensemble methods to improve accuracy and reduce overfitting


In [None]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [None]:
# y2: classification target ('Sleep Disorder'); y: regression target ('Stress Level').
# NOTE: `y` held 'Sleep Disorder' in the earlier classification cells; from this
# point on it refers to the regression target instead.
y2 = df['Sleep Disorder']
y = df['Stress Level']

# Ensembles for Regression

In [None]:
from sklearn.model_selection import train_test_split
from src.regression import regress_bagging

# NOTE: this rebinds X_train / X_test / y_train / y_test to a split whose target
# is y = df['Stress Level'] (set in the previous cell). Any later cell that
# reuses these names sees this regression split, not the classification one.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

regress_bagging(X_train, y_train, X_test, y_test)

In [None]:
from src.regression import regress_rf

# Calculate the differences (residuals)
residuals, y_pred_reg = regress_rf(X_train, y_train, X_test, y_test)

In [None]:
from src.plotting import plot_residual

plot_residual(y_test, y_pred_reg, residuals)

# classification using Ensembles

In [None]:
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X, y2, test_size=0.2, random_state=42)

In [None]:
from src.classification import classify_rf

cmd = classify_rf(X_train_clf, y_train_clf, X_test_clf, y_test_clf)
plt.title("Confusion Matrix for Sleep Disorder Classification")
plt.show()

## Finding the best n_estimators for Random Forest Classifier


In [None]:
from src.plotting import plot_rfc

plot_rfc()

In [None]:

from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor

from src.regression import compare_regs
from xgboost import XGBRegressor, XGBClassifier


# Hold out 20% of the rows for evaluating the regressors (fixed seed)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Candidate ensemble regressors, keyed by the name used when reporting results
regressors = dict([
    ("Random Forest", RandomForestRegressor(random_state=42)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
    ("AdaBoost", AdaBoostRegressor(random_state=42)),
    ("Bagging", BaggingRegressor(estimator=RandomForestRegressor(), random_state=42)),
    ("XGBoost", XGBRegressor(objective='reg:squarederror', random_state=42)),
])

compare_regs(X_train_reg, y_train_reg, X_test_reg, y_test_reg, regressors)


### The best Ensemble Regressor is XG Boost

In [None]:
from src.classification import compare_clf
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier

# 80/20 split on the classification target y2 (fixed seed)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X, y2, random_state=42, test_size=0.2
)

# Candidate ensemble classifiers, keyed by the name used when reporting results
classifiers = dict([
    ("Random Forest", RandomForestClassifier(n_estimators=40,random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("Bagging", BaggingClassifier(estimator=RandomForestClassifier(), random_state=42)),
    ("XGBoost", XGBClassifier(eval_metric='logloss', random_state=42)),
])

# Hand every candidate to compare_clf together with the shared split
compare_clf(X_train_clf, y_train_clf, X_test_clf, y_test_clf, classifiers)

## The best Ensemble Classifiers are Random Forest, Gradient Boosting and XG Boost

# precision recall graphs


In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from src.plotting import precision_recall_1, precision_recall_2

# Target for the precision-recall plots: the sleep-disorder label.
y = df['Sleep Disorder']
# Use label_binarize to be multi-label like settings: one indicator column per
# class listed in `classes`.
Y = label_binarize(y, classes=[0, 1, 2])
n_classes = Y.shape[1]

# Split into training and test
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

classifier = OneVsRestClassifier(
        make_pipeline(StandardScaler(), SVC(random_state= 42))
    )
# Fit on the split created in this cell. The original fitted on X_train / y_train,
# which at this point still hold the earlier regression split (target
# 'Stress Level'), not the binarized sleep-disorder labels.
classifier.fit(X_train2, Y_train2)



In [None]:
precision_recall_1(X_train2, Y_train2, X_test2, Y_test2, classifier)

In [None]:
# Evaluate on the held-out part of the binarized split made for this section.
# X_test / y_test still refer to the earlier regression split ('Stress Level'),
# which does not match n_classes sleep-disorder labels.
# NOTE(review): assumes precision_recall_2 expects the binarized Y_test2 - confirm against src.plotting.
precision_recall_2(X_test2, Y_test2, classifier, n_classes)