In [1]:
#Sleep Health and Lifestyle Data -- Capstone Project

%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

# Location of the Sleep Health and Lifestyle CSV used throughout the notebook.
DATA_URL = "https://raw.githubusercontent.com/QWelch008/DataScience/main/Sleep_health_and_lifestyle_dataset.csv"

shl_data = pd.read_csv(DATA_URL)

# pandas 2.0+ includes the literal token "None" in read_csv's default NA
# values, so the "no sleep disorder" rows can load as NaN depending on the
# installed version. Map missing values back to the string 'None' so the
# column is fully populated and the later `SleepDisorder == 'None'` filter
# selects those rows on every pandas version.
shl_data['SleepDisorder'] = shl_data['SleepDisorder'].fillna('None')

# Overview of the data: column dtypes / non-null counts, then the first rows.
shl_data.info()
shl_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Person ID      374 non-null    int64  
 1   Gender         374 non-null    object 
 2   Age            374 non-null    int64  
 3   Occupation     374 non-null    object 
 4   slpduration    374 non-null    float64
 5   qltsleep       374 non-null    int64  
 6   actlevel       374 non-null    int64  
 7   stresslevel    374 non-null    int64  
 8   BMICategory    374 non-null    object 
 9   BloodPressure  374 non-null    object 
 10  HeartRate      374 non-null    int64  
 11  DailySteps     374 non-null    int64  
 12  SleepDisorder  374 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


Unnamed: 0,Person ID,Gender,Age,Occupation,slpduration,qltsleep,actlevel,stresslevel,BMICategory,BloodPressure,HeartRate,DailySteps,SleepDisorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of interest -- a quick way to introduce the data to an audience.
shl_data.loc[
    :,
    ['slpduration', 'actlevel', 'qltsleep', 'stresslevel', 'Age', 'HeartRate', 'DailySteps'],
].describe()

Unnamed: 0,slpduration,actlevel,qltsleep,stresslevel,Age,HeartRate,DailySteps
count,374.0,374.0,374.0,374.0,374.0,374.0,374.0
mean,7.132086,59.171123,7.312834,5.385027,42.184492,70.165775,6816.84492
std,0.795657,20.830804,1.196956,1.774526,8.673133,4.135676,1617.915679
min,5.8,30.0,4.0,3.0,27.0,65.0,3000.0
25%,6.4,45.0,6.0,4.0,35.25,68.0,5600.0
50%,7.2,60.0,7.0,5.0,43.0,70.0,7000.0
75%,7.8,75.0,8.0,7.0,50.0,72.0,8000.0
max,8.5,90.0,9.0,8.0,59.0,86.0,10000.0


In [None]:
# Distinct values that appear in the BMICategory column
# (quick-glance information for the audience).
shl_data.loc[:, 'BMICategory'].unique()

In [None]:
# Distinct values that appear in the SleepDisorder column
# (quick-glance information for the audience).
shl_data.loc[:, 'SleepDisorder'].unique()

In [None]:
# Pairwise Pearson correlation matrix between sleep duration, quality of
# sleep, physical activity level and stress level.
shl_data.loc[:, ['slpduration', 'qltsleep', 'actlevel', 'stresslevel']].corr(method='pearson')

In [None]:
# Eyeball the distribution of each main variable, starting with stress level.
sns.histplot(data=shl_data, x='stresslevel')

In [None]:
# Distribution of quality of sleep. Author's note: because this rating is
# subjective, a slightly skewed histogram is expected.
sns.histplot(data=shl_data, x='qltsleep')

In [None]:
# Distribution of sleep duration.
sns.histplot(data=shl_data, x='slpduration')

In [None]:
# Distribution of physical activity level.
sns.histplot(data=shl_data, x='actlevel')

In [None]:
# Statistical significance of the correlations: Pearson correlation between
# quality of sleep and sleep duration. A p-value below 0.05 is treated as
# significant.
stats.pearsonr(x=shl_data['qltsleep'], y=shl_data['slpduration'])

In [None]:
# Pearson correlation between quality of sleep and physical activity level.
stats.pearsonr(x=shl_data['qltsleep'], y=shl_data['actlevel'])

In [None]:
# Pearson correlation between quality of sleep and stress level.
stats.pearsonr(x=shl_data['qltsleep'], y=shl_data['stresslevel'])

In [None]:
# Independent two-sample t-test between the quality-of-sleep and
# sleep-duration columns. Decision rule used in this notebook: reject the
# null hypothesis when |statistic| > 1.96 or the p-value is below 0.05.
# NOTE(review): ttest_ind compares the means of the two columns; it does not
# test their correlation, and both columns are measured on the same people.
# Confirm this is the intended test.
stats.ttest_ind(a=shl_data['qltsleep'], b=shl_data['slpduration'])

In [None]:
# Independent two-sample t-test between the quality-of-sleep and
# activity-level columns.
# NOTE(review): this compares the two column means, not their correlation --
# confirm this is the intended test.
stats.ttest_ind(a=shl_data['qltsleep'], b=shl_data['actlevel'])

In [None]:
# Independent two-sample t-test between the quality-of-sleep and
# stress-level columns.
# NOTE(review): this compares the two column means, not their correlation --
# confirm this is the intended test.
stats.ttest_ind(a=shl_data['qltsleep'], b=shl_data['stresslevel'])

In [None]:
# Scatterplot of quality of sleep against sleep duration, coloured by stress
# level. Author's interpretation: longer sleep duration often goes with higher
# perceived quality of sleep, which in turn goes with lower stress levels.
sns.scatterplot(
    data=shl_data,
    x='slpduration',
    y='qltsleep',
    hue='stresslevel',
)

In [None]:
# Scatterplot of quality of sleep against activity level, coloured by stress
# level. Author's interpretation: higher activity levels do not seem to have
# a significant impact on quality of sleep.
sns.scatterplot(
    data=shl_data,
    x='actlevel',
    y='qltsleep',
    hue='stresslevel',
)
# Place the legend at the bottom centre of the axes.
plt.legend(loc='lower center')

In [None]:
# Boxplot of sleep duration for each quality-of-sleep rating. Author's
# interpretation: more sleep goes with a better perceived quality of sleep
# (positive correlation).
sns.boxplot(data=shl_data, x='qltsleep', y='slpduration')

In [None]:
# Boxplot of stress level for each quality-of-sleep rating. Author's
# interpretation: lower perceived stress goes with higher perceived quality
# of sleep, and vice versa (negative correlation).
sns.boxplot(data=shl_data, x='qltsleep', y='stresslevel')

In [None]:
# Split the dataset by sleep disorder, mostly for audience purposes.
# Author's interpretation: the histograms suggest that people with insomnia
# or sleep apnea are at risk of shorter sleep durations, which in turn
# correlates with perceived stress levels.

# pandas 2.0+ parses the literal "None" in a CSV as a missing value, so treat
# missing entries as the 'None' (no disorder) category; otherwise the
# no-disorder group could silently come out empty.
disorder = shl_data['SleepDisorder'].fillna('None')

sleepdisorder_no = shl_data[disorder == 'None']
sleepdisorder_no.info()

sleepdisorder_yes = shl_data[disorder == 'Insomnia']
sleepdisorder_yes.info()

sleepdisorder_yes2 = shl_data[disorder == 'Sleep Apnea']
sleepdisorder_yes2.info()

# One figure per disorder: sleep duration of that group overlaid on the
# sleep duration of the no-disorder group.
for group, name in ((sleepdisorder_yes, 'Insomnia'), (sleepdisorder_yes2, 'Sleep Apnea')):
    plt.hist(group['slpduration'], alpha=.5, label=name)
    plt.hist(sleepdisorder_no['slpduration'], alpha=.5, label='None')
    plt.xlabel('slpduration')
    plt.ylabel('Count')
    plt.title(f'Sleep duration: {name} vs. no sleep disorder')
    plt.legend(loc="upper right")
    plt.show()

# Descriptive statistics of stress level for each group, labelled so the
# three printed results can be told apart.
print('Insomnia:', stats.describe(sleepdisorder_yes['stresslevel']))
print('Sleep Apnea:', stats.describe(sleepdisorder_yes2['stresslevel']))
print('None:', stats.describe(sleepdisorder_no['stresslevel']))

In [None]:
# Split the dataset at a sleep duration of 7 (close to the column mean) and
# compare the stress-level distribution of the two groups.
# Author's interpretation: lower stress levels tend to go with longer sleep
# durations.
sleepduration_high = shl_data[shl_data['slpduration'] >= 7]
sleepduration_high.info()

sleepduration_low = shl_data[shl_data['slpduration'] < 7]
sleepduration_low.info()

# The legend names the group each histogram is drawn from. The groups are
# defined by sleep duration, not by stress, so they are labelled by the split.
plt.hist(sleepduration_high['stresslevel'], alpha=.5, label='slpduration >= 7')
plt.hist(sleepduration_low['stresslevel'], alpha=.5, label='slpduration < 7')
plt.xlabel('stresslevel')
plt.ylabel('Count')
plt.title('Stress level by sleep duration group')
plt.legend(loc='upper center')
plt.show()

# Descriptive statistics of stress level: high-duration group, then low.
print(stats.describe(sleepduration_high['stresslevel']))
print(stats.describe(sleepduration_low['stresslevel']))

In [None]:
# Split the dataset at an activity level of 59 (close to the column mean) and
# compare the stress-level distribution of the two groups.
# Author's interpretation: there is little correlation between activity level
# and stress level.
actlevel_high = shl_data[shl_data['actlevel'] >= 59]
actlevel_high.info()

actlevel_low = shl_data[shl_data['actlevel'] < 59]
actlevel_low.info()

# The legend names the group each histogram is drawn from. The groups are
# defined by activity level, not by stress, so they are labelled by the split.
plt.hist(actlevel_high['stresslevel'], alpha=.5, label='actlevel >= 59')
plt.hist(actlevel_low['stresslevel'], alpha=.5, label='actlevel < 59')
plt.xlabel('stresslevel')
plt.ylabel('Count')
plt.title('Stress level by activity level group')
plt.legend(loc='upper right')
plt.show()

# Descriptive statistics of stress level: high-activity group, then low.
print(stats.describe(actlevel_high['stresslevel']))
print(stats.describe(actlevel_low['stresslevel']))

In [None]:
# Split the dataset at a quality-of-sleep rating of 7 (close to the column
# mean) and compare the stress-level distribution of the two groups.
# Author's interpretation: lower stress levels seem to go with higher quality
# of sleep.
qltsleep_high = shl_data[shl_data['qltsleep'] >= 7]
qltsleep_high.info()

qltsleep_low = shl_data[shl_data['qltsleep'] < 7]
qltsleep_low.info()

# The legend names the group each histogram is drawn from. The groups are
# defined by quality of sleep, not by stress, so they are labelled by the split.
plt.hist(qltsleep_high['stresslevel'], alpha=.5, label='qltsleep >= 7')
plt.hist(qltsleep_low['stresslevel'], alpha=.5, label='qltsleep < 7')
plt.xlabel('stresslevel')
plt.ylabel('Count')
plt.title('Stress level by quality-of-sleep group')
plt.legend(loc='upper right')
plt.show()

# Descriptive statistics of stress level: high-quality group, then low.
print(stats.describe(qltsleep_high['stresslevel']))
print(stats.describe(qltsleep_low['stresslevel']))

In [None]:
#Overall, our initial correlations supported our hypothesis, in the sense that increased sleep duration and sleep quality seem to correlate strongly with lower overall perceived stress.
#However, physical activity level did not seem to show the strong correlation with stress levels that we initially expected.
#After running Pearson and t-tests on each individual variable compared to stress levels, we received an acceptable p-value (below 0.05) for ALL tests run, allowing us to reject the null hypothesis and confirming that the intended variables, such as increased sleep duration and physical activity, do normally lead to higher levels of perceived quality of sleep, with sleep duration having a much more significant impact than physical activity.
#To also show that an improved quality of sleep leads to lower levels of stress, we produced multiple visuals (histograms, boxplots and scatterplots) as evidence of our findings, with ALL graph information showing skewness within +-3 and kurtosis within +-8 (other than the basic stress level histogram, as expected), indicating symmetry in our graphs; stress level was used as a hue to make the visuals easier to read.
#Our conclusion is to recommend a focus on proper sleep duration to increase overall perceived quality of sleep, which in turn strongly correlates with lower overall levels of stress.