In [1]:
import plotly
import matplotlib.pyplot as plt
import pandas as pd

# Load the raw sleep-health survey data from the working directory.
# NOTE(review): 'Sleep Disorder' ends up with missing values; presumably the CSV
# stores the literal text "None", which read_csv treats as NA by default — confirm.
df = pd.read_csv("Sleep_health_and_lifestyle_dataset.csv")

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB
None


In [2]:
# Plain-text preview of the first five rows of the raw data.
print(df.head(n=5))

   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   

  Blood Pressure  Heart Rate  Daily Steps Sleep Disorder  
0         126/83          77         4200            NaN  
1         125/80          75      

In [3]:
# Split 'Blood Pressure' ("systolic/diastolic") into two numeric columns.
# The guard keeps the cell re-runnable: once 'Blood Pressure' has been dropped,
# running the cell a second time would otherwise raise a KeyError.
if 'Blood Pressure' in df.columns:
    pressure_parts = df['Blood Pressure'].str.split('/', expand=True)

    # errors='coerce' turns any non-numeric piece into NaN instead of raising.
    df['Systolic Pressure'] = pd.to_numeric(pressure_parts[0], errors='coerce')
    df['Diastolic Pressure'] = pd.to_numeric(pressure_parts[1], errors='coerce')

    # The combined text column is no longer needed.
    df = df.drop(columns=['Blood Pressure'])

# info() prints its own report and returns None, so it is not wrapped in print().
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Heart Rate               374 non-null    int64  
 10  Daily Steps              374 non-null    int64  
 11  Sleep Disorder           155 non-null    object 
 12  Systolic Pressure        374 non-null    int64  
 13  Diastolic Pressure       374 non-null    int64  
dtypes: float64(1), int64(9), o

In [4]:
# IQR-based outlier filter.
def handle_outliers_iqr(column, data=None, k=1.5):
    """Return the rows whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` (both inclusive), where
    Q1 and Q3 are the 25th and 75th percentiles of ``column``.

    Parameters
    ----------
    column : str
        Name of the numeric column to test.
    data : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level ``df`` is used, which
        matches the original behaviour. Passing a frame explicitly lets several
        filters be chained instead of each one restarting from ``df``.
    k : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of rows inside the fences; the input frame is not modified.
    """
    frame = df if data is None else data
    values = frame[column]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr

    # between() is inclusive on both ends, equivalent to (>= lower) & (<= upper).
    return frame[values.between(lower_bound, upper_bound)]

# Rows that are not outliers with respect to 'Sleep Duration'.
sleep_duration_inliers = handle_outliers_iqr('Sleep Duration')

# Rows that are not outliers with respect to 'Quality of Sleep'.
sleep_quality_inliers = handle_outliers_iqr('Quality of Sleep')

# Each call filters the full df on its own, so assigning both results to the
# same name would keep only the last filter. Keep the rows that pass both.
new_df = sleep_duration_inliers[
    sleep_duration_inliers.index.isin(sleep_quality_inliers.index)
]

# Preview the outlier-filtered DataFrame.
print(new_df.head())

   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   

   Heart Rate  Daily Steps Sleep Disorder  Systolic Pressure  \
0          77         4200            NaN                126   
1          75        100

In [5]:
# Plain-text preview of the last five rows of the outlier-filtered data.
print(new_df.tail(n=5))

     Person ID  Gender  Age Occupation  Sleep Duration  Quality of Sleep  \
369        370  Female   59      Nurse             8.1                 9   
370        371  Female   59      Nurse             8.0                 9   
371        372  Female   59      Nurse             8.1                 9   
372        373  Female   59      Nurse             8.1                 9   
373        374  Female   59      Nurse             8.1                 9   

     Physical Activity Level  Stress Level BMI Category  Heart Rate  \
369                       75             3   Overweight          68   
370                       75             3   Overweight          68   
371                       75             3   Overweight          68   
372                       75             3   Overweight          68   
373                       75             3   Overweight          68   

     Daily Steps Sleep Disorder  Systolic Pressure  Diastolic Pressure  
369         7000    Sleep Apnea            

In [6]:
# Work on a copy so the original df keeps its missing values.
noMV_df = df.copy()

# Replace missing 'Sleep Disorder' values with the string 'None'.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection is chained assignment, which
# emits a FutureWarning on pandas 2.x and leaves the frame unchanged under
# Copy-on-Write.
noMV_df['Sleep Disorder'] = noMV_df['Sleep Disorder'].fillna('None')

# Preview the DataFrame with the missing values replaced.
print(noMV_df.head())

   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   

   Heart Rate  Daily Steps Sleep Disorder  Systolic Pressure  \
0          77         4200           None                126   
1          75        100

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Min-Max scaling of 'Physical Activity Level'.
scaler = MinMaxScaler()

# The double brackets select a one-column DataFrame (2-D input for the scaler).
activity_level = df[['Physical Activity Level']]
scaled_data = scaler.fit_transform(activity_level)

# Store the scaled values as a new column next to the original one.
noMV_df['Scaled Physical Activity Level'] = scaled_data

In [8]:
# Plain-text preview of the first five rows after adding the scaled column.
print(noMV_df.head(n=5))

   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   

   Heart Rate  Daily Steps Sleep Disorder  Systolic Pressure  \
0          77         4200           None                126   
1          75        100

In [9]:
# Show the raw 'BMI Category' labels before any clean-up.
print(df.loc[:, 'BMI Category'])

0      Overweight
1          Normal
2          Normal
3           Obese
4           Obese
          ...    
369    Overweight
370    Overweight
371    Overweight
372    Overweight
373    Overweight
Name: BMI Category, Length: 374, dtype: object


In [10]:
import plotly.express as px

# Number of rows per raw 'BMI Category' label.
bmi_counts = df.loc[:, 'BMI Category'].value_counts()

# Pie chart of the label shares.
fig = px.pie(
    title='BMI Category Distribution',
    names=bmi_counts.index,
    values=bmi_counts.values,
)

# Render the chart.
fig.show()

In [11]:
# Fold the label 'Normal Weight' into 'Normal' so both count as one category.
noMV_df['BMI Category'] = noMV_df['BMI Category'].replace({'Normal Weight': 'Normal'})

# Recount the rows per category after the relabelling.
bmi_counts = noMV_df.loc[:, 'BMI Category'].value_counts()

# Pie chart of the cleaned category shares.
fig = px.pie(
    title='BMI Category Distribution',
    names=bmi_counts.index,
    values=bmi_counts.values,
)

# Render the chart.
fig.show()

In [12]:
# Ordinal codes for the BMI labels: 'Normal' -> 0, 'Overweight' -> 1, 'Obese' -> 2.
bmi_label_encoding = {
    label: code
    for code, label in enumerate(('Normal', 'Overweight', 'Obese'))
}

# Map each label to its code and store the result in a new column.
# Labels missing from the mapping become NaN.
encoded_bmi = noMV_df['BMI Category'].map(bmi_label_encoding)
noMV_df['BMI Category Encoded'] = encoded_bmi

# Check the result.
print(noMV_df.head())

   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   

   Heart Rate  Daily Steps Sleep Disorder  Systolic Pressure  \
0          77         4200           None                126   
1          75        100

### 3.2.1	측정 대상자들의 직업과 수면의 관계를 살펴보기

In [13]:
# Number of rows per occupation.
occupation_counts = noMV_df['Occupation'].value_counts()

fig = px.bar(
    x=occupation_counts.index,
    y=occupation_counts.values,
    title="Occupation Bar Charts",
)

# Bar colour.
fig.update_traces(marker_color='purple')

# Layout: axis titles, tick rotation, bar spacing, backgrounds and font.
layout_options = dict(
    xaxis_title='Occupation',
    yaxis_title='Counts',
    xaxis_tickangle=-45,
    bargap=0.1,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Arial', size=12, color='black'),
)
fig.update_layout(**layout_options)

fig.show()

In [14]:
# Distribution of 'Sleep Duration' within each occupation.
fig = px.box(
    noMV_df,
    y="Sleep Duration",
    x='Occupation',
)
fig.show()

In [15]:
# Mean 'Sleep Duration' per occupation, with 'Occupation' kept as a regular column.
occupation_sleep_duration = (
    noMV_df
    .groupby('Occupation', as_index=False)['Sleep Duration']
    .mean()
)

# Bar chart of the per-occupation means, one colour per occupation.
fig = px.bar(
    occupation_sleep_duration,
    x='Occupation',
    y='Sleep Duration',
    color='Occupation',
    title='Average Sleep Duration by Occupation',
    labels={'Occupation': 'Occupation', 'Sleep Duration': 'Average Sleep Duration'},
)

# Render the chart.
fig.show()

In [16]:
# Keep only the occupation, stress-level and sleep-quality columns.
occupation_stress_sleep_df = noMV_df.loc[:, ['Occupation', 'Stress Level', 'Quality of Sleep']]

# Mean stress level and mean sleep quality per occupation.
occupation_stress_sleep_avg = (
    occupation_stress_sleep_df
    .groupby('Occupation', as_index=False)
    .mean()
)

# Scatter plot with one point per occupation.
fig = px.scatter(
    occupation_stress_sleep_avg,
    x='Stress Level',
    y='Quality of Sleep',
    color='Occupation',
    title='Stress Level vs Quality of Sleep by Occupation',
    labels={'Stress Level': 'Stress Level', 'Quality of Sleep': 'Quality of Sleep', 'Occupation': 'Occupation'},
)

# Render the chart.
fig.show()

In [17]:
# Drop rows with a missing occupation or sleep-disorder label.
# Duplicate (Occupation, Sleep Disorder) pairs are kept on purpose: dropping
# them before counting leaves exactly one row per pair, so every 'Count' in the
# heatmap would be 1 regardless of how many people fall into the cell.
cleaned_df = noMV_df.dropna(subset=['Occupation', 'Sleep Disorder'])

# Number of rows for each (Occupation, Sleep Disorder) pair.
occupation_sleep_disorder_counts = cleaned_df.groupby(['Occupation', 'Sleep Disorder']).size().reset_index(name='Count')

# Occupation x Sleep Disorder table; pairs that never occur are filled with 0.
pivot_table = pd.pivot_table(occupation_sleep_disorder_counts, values='Count',
                             index='Occupation', columns='Sleep Disorder', fill_value=0)

# Heatmap of the counts.
fig = px.imshow(pivot_table,
                labels=dict(color='Count'), x=pivot_table.columns, y=pivot_table.index,
                title='Sleep Disorder by Occupation (Heatmap)')

# Render the chart.
fig.show()

In [18]:
# Number of rows for each (Occupation, Sleep Disorder) pair, in long format.
occupation_sleep_disorder_counts = (
    noMV_df
    .groupby(['Occupation', 'Sleep Disorder'])
    .size()
    .reset_index(name='Count')
)

# Reshape to an Occupation x Sleep Disorder table; absent pairs become 0.
pivot_table = occupation_sleep_disorder_counts.pivot_table(
    values='Count',
    index='Occupation',
    columns='Sleep Disorder',
    fill_value=0,
)

# Heatmap of the counts.
fig = px.imshow(
    pivot_table,
    x=pivot_table.columns,
    y=pivot_table.index,
    labels=dict(color='Count'),
    title='Sleep Disorder by Occupation (Heatmap)',
)

# Render the chart.
fig.show()

### 3.2.2	수면의 질에 영향 미치는 요인 분석을 위한 데이터 시각화

In [19]:
# Scaled physical activity against sleep quality, coloured by BMI category.
fig = px.scatter(
    noMV_df,
    x='Scaled Physical Activity Level',
    y='Quality of Sleep',
    color='BMI Category',
)
fig.show()

In [20]:
# Sleep quality per sleep-disorder group, split by BMI category, with an inner
# box plot and every individual observation drawn.
# hover_data takes its columns from the plotted frame (noMV_df) rather than
# from df, so the hover fields always match the data actually shown.
fig = px.violin(noMV_df, y="Quality of Sleep", x="Sleep Disorder", color="BMI Category", box=True, points="all",
          hover_data=noMV_df.columns)
fig.show()

In [21]:
# info() prints its own report and returns None; wrapping it in print() would
# only append a stray "None" line.
noMV_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Person ID                       374 non-null    int64  
 1   Gender                          374 non-null    object 
 2   Age                             374 non-null    int64  
 3   Occupation                      374 non-null    object 
 4   Sleep Duration                  374 non-null    float64
 5   Quality of Sleep                374 non-null    int64  
 6   Physical Activity Level         374 non-null    int64  
 7   Stress Level                    374 non-null    int64  
 8   BMI Category                    374 non-null    object 
 9   Heart Rate                      374 non-null    int64  
 10  Daily Steps                     374 non-null    int64  
 11  Sleep Disorder                  374 non-null    object 
 12  Systolic Pressure               374 

In [22]:
# Line plot: sleep duration against daily steps, one line per sleep-quality score.
# px.line joins points in row order, so the rows are sorted by the x column
# first; unsorted, the line jumps back and forth along the x axis.
# The data has no time column, so the title no longer says "Time Series".
steps_sorted = noMV_df.sort_values('Daily Steps', kind='stable')
fig = px.line(steps_sorted, x='Daily Steps', y='Sleep Duration', color='Quality of Sleep',
              title='Correlation between Sleep Duration and Daily Steps',
              labels={'Sleep Duration': 'Sleep Duration (hours)', 'Daily Steps': 'Daily Steps'})

# Render the chart.
fig.show()

In [23]:
# Line plot: sleep quality against daily steps.
# px.line joins points in row order, so the rows are sorted by the x column
# first; unsorted, the line jumps back and forth along the x axis.
# The data has no time column, so the title no longer says "Time Series".
steps_sorted = noMV_df.sort_values('Daily Steps', kind='stable')
fig = px.line(steps_sorted, x='Daily Steps', y='Quality of Sleep',
              title='Correlation between Quality of Sleep and Daily Steps',
              labels={'Quality of Sleep': 'Quality of Sleep', 'Daily Steps': 'Daily Steps'})

# Render the chart.
fig.show()

In [24]:
import plotly.graph_objects as go

def _gender_box_figure(data, value_column, title, yaxis_title):
    """Build a figure with one box trace of ``value_column`` per gender.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'Gender' column and ``value_column``.
    value_column : str
        Column whose distribution is drawn on the y axis.
    title : str
        Figure title.
    yaxis_title : str
        Label for the y axis.

    Returns
    -------
    plotly.graph_objects.Figure
        Each box shows the mean and standard deviation (boxmean='sd') and all
        individual points (boxpoints='all').
    """
    figure = go.Figure()
    for gender in data['Gender'].unique():
        # Select this gender's values once and reuse them for y and for len().
        values = data.loc[data['Gender'] == gender, value_column]
        figure.add_trace(go.Box(y=values,
                                x=[gender] * len(values),
                                name=gender, boxmean='sd', boxpoints='all'))

    figure.update_layout(title=title,
                         xaxis=dict(title='Gender'),
                         yaxis=dict(title=yaxis_title))
    return figure


# Box plot with all points shown (Sleep Duration vs. Gender).
fig1 = _gender_box_figure(noMV_df, 'Sleep Duration',
                          title='Sleep Duration by Gender',
                          yaxis_title='Sleep Duration (hours)')

# Render the chart.
fig1.show()

# Box plot with all points shown (Quality of Sleep vs. Gender).
fig2 = _gender_box_figure(noMV_df, 'Quality of Sleep',
                          title='Quality of Sleep by Gender',
                          yaxis_title='Quality of Sleep')

# Render the chart.
fig2.show()


In [25]:
# Columns compared pairwise in the scatter matrix.
health_dimensions = [
    'Heart Rate',
    'Systolic Pressure',
    'Diastolic Pressure',
    'BMI Category Encoded',
    'Sleep Duration',
    'Quality of Sleep',
]

# Pair plot of the health indicators, coloured by gender.
fig = px.scatter_matrix(
    noMV_df,
    dimensions=health_dimensions,
    color='Gender',
    title='Pair Plot of Health Indicators',
)

# Render the chart.
fig.show()


In [26]:
import plotly.express as px

# Box plot: sleep duration per BMI category, split by gender.
fig1 = px.box(
    noMV_df,
    x='BMI Category',
    y='Sleep Duration',
    color='Gender',
    labels={'BMI Category': 'BMI Category', 'Sleep Duration': 'Sleep Duration (hours)'},
    title='Sleep Duration by BMI Category',
)

# Render the chart.
fig1.show()

# Box plot: sleep quality per BMI category, split by gender.
fig2 = px.box(
    noMV_df,
    x='BMI Category',
    y='Quality of Sleep',
    color='Gender',
    labels={'BMI Category': 'BMI Category', 'Quality of Sleep': 'Quality of Sleep'},
    title='Quality of Sleep by BMI Category',
)

# Render the chart.
fig2.show()


In [29]:
# Number of rows for each (Sleep Duration, Quality of Sleep) combination.
grouped_data = noMV_df.groupby(['Sleep Duration', 'Quality of Sleep']).size().reset_index(name='count')

# Bubble scatter: marker size encodes how many rows share the same pair of values.
fig = px.scatter(grouped_data, x='Sleep Duration', y='Quality of Sleep', size='count',
                 title='Correlation between Sleep Duration and Quality of Sleep with Linear Fit',
                 labels={'Sleep Duration': 'Sleep Duration (hours)', 'Quality of Sleep': 'Quality of Sleep'},
                 color_discrete_sequence=['green'])  # green markers

# Fit one OLS line on all rows. The helper figure is built without
# color='Gender': with a colour split, Plotly Express fits one trendline per
# gender, and taking a single trace from it would show only one gender's fit.
ols_fig = px.scatter(noMV_df, x='Sleep Duration', y='Quality of Sleep',
                     trendline='ols',  # Ordinary Least Squares regression line
                     trendline_color_override='pink',  # draw the fitted line in pink
                     labels={'Sleep Duration': 'Sleep Duration (hours)', 'Quality of Sleep': 'Quality of Sleep'})

# Pick the trendline by its drawing mode rather than by position in .data.
trendline_trace = next(trace for trace in ols_fig.data if trace.mode == 'lines')
fig.add_trace(trendline_trace)

# Render the chart.
fig.show()