In [2]:
import os

import numpy as np
import pandas as pd
from google.colab import files
from sklearn.preprocessing import StandardScaler

# Read the sleep health CSV straight from its raw GitHub URL
DATA_URL = 'https://raw.githubusercontent.com/Salina-Huang/Programming-for-AI-Sleep-Health-and-Lifestyle-Model/refs/heads/main/cleaned_data_sleep.csv'
sleep_data = pd.read_csv(DATA_URL)

# Banner, then a quick overview of the frame as loaded: shape, column names, first rows
banner_rule = "=" * 70
print("SLEEP HEALTH DATA PROCESSING FOR MACHINE LEARNING")
print(banner_rule)
print(f"Original data shape: {sleep_data.shape}")
print(f"Original columns: {sleep_data.columns.tolist()}")
print("\nFirst 5 rows of data:")
print(sleep_data.head())

# Columns excluded from modelling: 'Person ID' is only an identifier, while
# 'bp_category' (recorded importance 0.0267) and 'Gender' were judged to be of
# low importance in the feature importance analysis.
columns_to_drop = ['Person ID', 'bp_category', 'Gender']

# Work on a copy so the loaded frame stays untouched. errors='ignore' means a
# listed column that is absent is skipped instead of raising.
sleep_data_encoded = sleep_data.copy().drop(columns=columns_to_drop, errors='ignore')

print(f"\nAfter removing {columns_to_drop}, data shape: {sleep_data_encoded.shape}")

# Missing target labels become the explicit class 'None'.
# NOTE(review): presumably the NaNs come from literal 'None' entries being parsed
# as missing values when the CSV is read - confirm against the raw file.
disorder_labels = sleep_data_encoded['Sleep Disorder']
sleep_data_encoded['Sleep Disorder'] = disorder_labels.fillna('None')

# Candidates for one-hot encoding. 'Sleep Disorder' is the target variable and
# is encoded together with the feature columns.
encoding_candidates = ('Occupation', 'BMI Category', 'Sleep Disorder')

# Restrict the candidates to columns that are actually present in the frame
available_columns = set(sleep_data_encoded.columns)
categorical_columns = [name for name in encoding_candidates if name in available_columns]

print("\nCategorical columns for one-hot encoding:")
for name in categorical_columns:
    n_levels = sleep_data_encoded[name].nunique()
    print(f"  {name}: {n_levels} unique values")

# One indicator column per category level
sleep_data_final = pd.get_dummies(sleep_data_encoded, columns=categorical_columns)

shape_before, shape_after = sleep_data_encoded.shape, sleep_data_final.shape
print(f"\nData shape before encoding: {shape_before}")
print(f"Data shape after encoding: {shape_after}")

# Safety net: any object-dtype column that was not in the list above is
# one-hot encoded as well, so no object columns remain.
remaining_non_numeric = sleep_data_final.select_dtypes(include=['object']).columns
if not remaining_non_numeric.empty:
    print(f"\nAdditional non-numeric columns found: {list(remaining_non_numeric)}")
    sleep_data_final = pd.get_dummies(sleep_data_final, columns=remaining_non_numeric)
    print(f"Data shape after second encoding: {sleep_data_final.shape}")

# Numeric features to be standardized
scaling_candidates = [
    'Age',
    'Sleep Duration',
    'Quality of Sleep',
    'Physical Activity Level',
    'Stress Level',
    'Heart Rate',
    'Daily Steps',
]

# Only scale the candidates that are present in the encoded frame
numeric_columns_to_scale = [name for name in scaling_candidates if name in sleep_data_final.columns]

print(f"\nNumerical columns to standardize ({len(numeric_columns_to_scale)} columns):")
for name in numeric_columns_to_scale:
    values = sleep_data_final[name]
    print(f"  {name}: Mean={values.mean():.2f}, Std={values.std():.2f}")

if not numeric_columns_to_scale:
    print("\nNo numerical columns to standardize")
else:
    # NOTE(review): the scaler is fitted on every row of the frame. If a
    # train/test split is made later, fit on the training rows only - confirm.
    scaler = StandardScaler()
    # Snapshot of the unscaled values; not read again in this cell
    original_values = sleep_data_final[numeric_columns_to_scale].copy()
    scaled_values = scaler.fit_transform(sleep_data_final[numeric_columns_to_scale])
    sleep_data_final[numeric_columns_to_scale] = scaled_values

    # pandas .std() uses ddof=1 while StandardScaler normalizes with ddof=0,
    # so the Std printed here is close to, but not exactly, 1.
    print("\nStatistics after standardization:")
    for name in numeric_columns_to_scale:
        values = sleep_data_final[name]
        print(f"  {name}: Mean={values.mean():.2f}, Std={values.std():.2f}")

section_rule = "=" * 70

# Numbered listing of every column in the encoded frame
print("\n" + section_rule)
print("ALL COLUMN NAMES AFTER ENCODING:")
print(section_rule)
for position, column_name in enumerate(sleep_data_final.columns, start=1):
    print(f"{position:3d}. {column_name}")

# Class balance of the one-hot encoded target
print("\n" + section_rule)
print("TARGET VARIABLE DISTRIBUTION (Sleep Disorder):")
print(section_rule)

# Target columns are recognised by the 'Sleep Disorder' text in their name
target_columns = [name for name in sleep_data_final.columns if 'Sleep Disorder' in name]
print("Target variable columns:", target_columns)

if target_columns:
    print("\nSample count for each class:")
    n_rows = len(sleep_data_final)
    # Summing an indicator column gives the number of rows in that class
    class_counts = sleep_data_final[target_columns].sum()
    for target_col, count in class_counts.items():
        percentage = (count / n_rows) * 100
        class_name = target_col.replace('Sleep Disorder_', '')
        print(f"  {class_name}: {int(count)} samples ({percentage:.1f}%)")

# Data summary
# The encoded frame still holds the one-hot target columns. They are subtracted
# here so the feature counts describe predictors only and agree with the
# feature matrix that is built from this frame by dropping target_columns.
n_target_columns = len(target_columns)
n_feature_columns = len(sleep_data_final.columns) - n_target_columns
n_numeric_features = len(numeric_columns_to_scale)

print("\n" + "=" * 70)
print("DATA PREPROCESSING SUMMARY")
print("=" * 70)
print(f"Total samples: {len(sleep_data_final)}")
print(f"Total features: {n_feature_columns}")
print(f"Numerical features: {n_numeric_features}")
# Whatever is neither target nor scaled numeric is a one-hot feature column
print(f"Encoded categorical features: {n_feature_columns - n_numeric_features}")
print(f"Target classes (one-hot columns): {n_target_columns}")

# Separate features and target variable for machine learning
if target_columns:
    # Feature matrix: every column except the one-hot target columns
    X = sleep_data_final.drop(columns=target_columns)
    y = sleep_data_final[target_columns]

    print(f"\nFeature matrix X shape: {X.shape}")
    print(f"Target matrix y shape: {y.shape}")

    # Collapse the one-hot target into a single label column. This is done for
    # any number of target columns so that y_single always exists when the
    # target file is written below; defining it only for more than one target
    # column would leave a NameError path at the save step.
    # regex=False makes the prefix removal a literal replacement regardless of
    # the pandas version's default.
    y_single = y.idxmax(axis=1).str.replace('Sleep Disorder_', '', regex=False)
    print(f"\nSingle column target variable shape: {y_single.shape}")
    print("First 5 values of single target variable:")
    print(y_single.head())

print("\n" + "=" * 70)
print("PROCESSED DATA SAMPLE (First 5 rows):")
print("=" * 70)
print(sleep_data_final.head())

print("\n" + "=" * 70)
print("DATA STATISTICS:")
print("=" * 70)
print(sleep_data_final.describe())

# Save processed data
output_filename = 'sleep_health_encoded_onehot.csv'
sleep_data_final.to_csv(output_filename, index=False)
print(f"\nProcessed data saved as: {output_filename}")
# Report the size of the file actually written rather than serializing the
# whole frame a second time just to measure it.
print(f"File size: {os.path.getsize(output_filename) / 1024:.2f} KB")

# Download file in Google Colab
files.download(output_filename)

# Optionally save separated features and target variables
if target_columns:
    # Save features
    X_filename = 'sleep_health_features.csv'
    X.to_csv(X_filename, index=False)

    # Save the single-column target under the header 'Sleep_Disorder'
    y_single_filename = 'sleep_health_target.csv'
    y_single.to_csv(y_single_filename, index=False, header=['Sleep_Disorder'])


# Static report: which features were kept or removed, with the stated reason
retained_notes = [
    "1. Age - Important for sleep pattern analysis",
    "2. Sleep Duration - Direct sleep metric",
    "3. Quality of Sleep - Subjective sleep assessment",
    "4. Physical Activity Level - Impacts sleep quality",
    "5. Stress Level - Affects sleep disorders",
    "6. BMI Category - Obesity linked to sleep apnea",
    "7. Heart Rate - Sleep quality indicator",
    "8. Daily Steps - Activity level measurement",
    "9. Occupation - Lifestyle and stress factors",
]
removed_notes = [
    "1. bp_category - Low feature importance (0.0267)",
    "2. Gender - Low predictive power for sleep disorders",
    "3. Person ID - Identifier only (not predictive)",
]

print("\n" + "=" * 70)
print("FEATURE SELECTION RATIONALE")
print("=" * 70)
print("Features RETAINED based on domain knowledge and importance:")
for note in retained_notes:
    print(note)
print("\nFeatures REMOVED:")
for note in removed_notes:
    print(note)

SLEEP HEALTH DATA PROCESSING FOR MACHINE LEARNING
Original data shape: (374, 13)
Original columns: ['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'BMI Category', 'Heart Rate', 'Daily Steps', 'Sleep Disorder', 'bp_category']

First 5 rows of data:
   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60        

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


FEATURE SELECTION RATIONALE
Features RETAINED based on domain knowledge and importance:
1. Age - Important for sleep pattern analysis
2. Sleep Duration - Direct sleep metric
3. Quality of Sleep - Subjective sleep assessment
4. Physical Activity Level - Impacts sleep quality
5. Stress Level - Affects sleep disorders
6. BMI Category - Obesity linked to sleep apnea
7. Heart Rate - Sleep quality indicator
8. Daily Steps - Activity level measurement
9. Occupation - Lifestyle and stress factors

Features REMOVED:
1. bp_category - Low feature importance (0.0267)
2. Gender - Low predictive power for sleep disorders
3. Person ID - Identifier only (not predictive)
