# 生活习惯与睡眠质量关联分析

## 环境配置与数据准备

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

# Global plot styling: seaborn's whitegrid theme with slightly enlarged fonts,
# plus a default figure size and resolution for every figure in the notebook.
sns.set(style="whitegrid", font_scale=1.1)
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['figure.dpi'] = 100

# The figures in this notebook use Chinese titles and axis labels. The font
# list left behind by sns.set() has no CJK glyphs, so those strings would be
# drawn as empty boxes. List common CJK-capable fonts first and keep
# DejaVu Sans as the final fallback.
# NOTE(review): assumes at least one of the CJK fonts below is installed on
# the machine running the notebook -- confirm for the target environment.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
    'DejaVu Sans',
]
# Draw the minus sign as an ASCII hyphen; CJK fonts often lack U+2212.
plt.rcParams['axes.unicode_minus'] = False

# Directory that receives every CSV and PNG written by this notebook.
# exist_ok=True makes creation idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
output_dir = "analysis_results"
os.makedirs(output_dir, exist_ok=True)


## 数据导入

In [None]:
# Load the raw dataset; the path is relative to the current working directory.
df = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')

# Quick structural overview: shape and the first five rows.
print("数据形状:", df.shape)
print("\n前5行:")
display(df.head())

print("\n数据类型 缺失值:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() would only append a stray "None" to the cell output.
df.info()

print("\n基本统计:")
# include='all' summarises non-numeric columns too; transpose so that each
# original column becomes one row of the summary.
display(df.describe(include='all').T)

# Per-column missing-value counts; only columns that actually contain
# missing values are shown, and nothing is shown when there are none.
missing_values = df.isnull().sum()
if missing_values.sum() > 0:
    display(missing_values[missing_values > 0])

## 数据清洗与预处理


In [None]:
# Work on a copy so the raw DataFrame `df` stays untouched.
df_clean = df.copy()

# Split the "systolic/diastolic" blood-pressure string into two integer
# columns, then drop the original text column.
bp_split = df_clean['Blood Pressure'].str.split('/', expand=True)
df_clean['Systolic_BP'] = bp_split[0].astype(int)
df_clean['Diastolic_BP'] = bp_split[1].astype(int)
df_clean.drop('Blood Pressure', axis=1, inplace=True)

# Categorical columns that get an additional integer-coded companion column.
categorical_columns = ['Gender', 'Occupation', 'BMI Category', 'Sleep Disorder']

# Add a "<column>_Code" column for each categorical column that is present.
# pd.factorize numbers the categories in order of first appearance and, by
# default, assigns -1 to missing values.
for col in categorical_columns:
    if col in df_clean.columns:
        df_clean[f'{col}_Code'] = pd.factorize(df_clean[col])[0]

# Persist the cleaned data (without the index) to the output directory.
clean_data_path = os.path.join(output_dir, 'cleaned_sleep_data.csv')
df_clean.to_csv(clean_data_path, index=False)

## 描述性统计与单变量分析

In [None]:
def _plot_distribution(column, bins, color, title, xlabel, filename):
    """Draw a histogram with KDE for one column of df_clean.

    A dashed red vertical line marks the column mean. The figure is saved
    as `filename` inside `output_dir` and then shown.
    """
    values = df_clean[column]
    mean_value = values.mean()

    plt.figure(figsize=(10, 6))
    sns.histplot(values, bins=bins, kde=True, color=color)
    plt.title(title, fontsize=16)
    plt.xlabel(xlabel, fontsize=12)
    plt.ylabel('频数', fontsize=12)
    plt.axvline(
        mean_value,
        color='red',
        linestyle='--',
        label=f'平均值: {mean_value:.2f}',
    )
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, filename))
    plt.show()


# 1. Distribution of the sleep-quality score
_plot_distribution(
    column='Quality of Sleep',
    bins=10,
    color='skyblue',
    title='睡眠质量分布',
    xlabel='睡眠质量评分 (1-10)',
    filename='sleep_quality_distribution.png',
)

# 2. Distribution of sleep duration
_plot_distribution(
    column='Sleep Duration',
    bins=15,
    color='lightgreen',
    title='睡眠时长分布',
    xlabel='睡眠时长 (小时)',
    filename='sleep_duration_distribution.png',
)

# 3. Share of each BMI category, drawn as a pie chart
bmi_counts = df_clean['BMI Category'].value_counts()
# One pastel colour per category, taken from the front of the palette.
colors = sns.color_palette('pastel')[:len(bmi_counts)]

plt.figure(figsize=(10, 6))
plt.pie(
    bmi_counts,
    labels=bmi_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
plt.title('BMI类别分布', fontsize=16)
plt.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.tight_layout()
bmi_figure_path = os.path.join(output_dir, 'bmi_distribution.png')
plt.savefig(bmi_figure_path)
plt.show()

# Write the summary statistics produced by describe() to a CSV file in the
# output directory.
desc_stats = df_clean.describe()
desc_stats_path = os.path.join(output_dir, 'descriptive_statistics.csv')
desc_stats.to_csv(desc_stats_path)