In [19]:
# Cell 1: Import Libraries and Load Data
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv('../data/raw/mental_health_lifestyle_raw.csv')

print("Dataset loaded successfully")
print(f"Initial shape: {df.shape[0]} rows, {df.shape[1]} columns")

Dataset loaded successfully
Initial shape: 50000 rows, 17 columns


In [35]:
# Cell 2: Handle Missing Values - CORRECTED
print("=" * 80)
print("HANDLING MISSING VALUES")
print("=" * 80)

# Check missing values before
print("\nMissing values before removal:")
missing_before = df.isnull().sum()
print(missing_before[missing_before > 0])

# Select ONLY the features we need for our study
selected_features = [
    'Sleep_Hours',
    'Work_Hours',
    'Physical_Activity_Hours',
    'Social_Media_Usage',
    'Diet_Quality',
    'Smoking_Habit',
    'Alcohol_Consumption',
    'Age',
    'Gender',
    'Occupation',
    'Country',
    'Stress_Level'
]

print(f"\nSelecting {len(selected_features)} columns for our study:")
print("  7 Lifestyle variables")
print("  4 Demographic variables")
print("  1 Target variable")

# Select ONLY these columns from the original dataframe
df_clean = df[selected_features].copy()

# Check missing values only in selected features
missing_in_selected = df_clean.isnull().sum()
print(f"\nMissing values in selected features:")
print(missing_in_selected[missing_in_selected > 0])

# Remove rows only if missing values exist
df_clean = df_clean.dropna()

print(f"\nRows removed: {len(df) - len(df_clean)}")
print(f"Shape after removal: {df_clean.shape[0]} rows, {df_clean.shape[1]} columns")

# Verify no missing values and only selected columns remain
print(f"\nTotal missing values after removal: {df_clean.isnull().sum().sum()}")
print(f"\nFinal columns in dataset:")
print(df_clean.columns.tolist())

HANDLING MISSING VALUES

Missing values before removal:
Severity    25002
dtype: int64

Selecting 12 columns for our study:
  7 Lifestyle variables
  4 Demographic variables
  1 Target variable

Missing values in selected features:
Series([], dtype: int64)

Rows removed: 0
Shape after removal: 50000 rows, 12 columns

Total missing values after removal: 0

Final columns in dataset:
['Sleep_Hours', 'Work_Hours', 'Physical_Activity_Hours', 'Social_Media_Usage', 'Diet_Quality', 'Smoking_Habit', 'Alcohol_Consumption', 'Age', 'Gender', 'Occupation', 'Country', 'Stress_Level']


In [40]:
# Cell 3: Remove Duplicate Rows
print("=" * 80)
print("REMOVING DUPLICATE ROWS")
print("=" * 80)

# Check duplicates before
duplicates_before = df_clean.duplicated().sum()
print(f"\nDuplicate rows found: {duplicates_before}")

# Remove duplicates
df_clean = df_clean.drop_duplicates()

print(f"Shape after duplicate removal: {df_clean.shape[0]} rows, {df_clean.shape[1]} columns")
print(f"Remaining duplicates: {df_clean.duplicated().sum()}")

REMOVING DUPLICATE ROWS

Duplicate rows found: 0
Shape after duplicate removal: 50000 rows, 12 columns
Remaining duplicates: 0


In [44]:
# Cell 4: Encode Categorical Variables (Label Encoding)
print("=" * 80)
print("ENCODING CATEGORICAL VARIABLES")
print("=" * 80)

# Define ordinal mappings for lifestyle categorical variables
diet_mapping = {'Healthy': 0, 'Average': 1, 'Unhealthy': 2}
smoking_mapping = {'Non-Smoker': 0, 'Occasional Smoker': 1, 'Regular Smoker': 2, 'Heavy Smoker': 3}
alcohol_mapping = {'Non-Drinker': 0, 'Social Drinker': 1, 'Regular Drinker': 2, 'Heavy Drinker': 3}

# Apply label encoding to lifestyle variables
df_encoded = df_clean.copy()
df_encoded['Diet_Quality'] = df_encoded['Diet_Quality'].map(diet_mapping)
df_encoded['Smoking_Habit'] = df_encoded['Smoking_Habit'].map(smoking_mapping)
df_encoded['Alcohol_Consumption'] = df_encoded['Alcohol_Consumption'].map(alcohol_mapping)

print("\nLifestyle categorical variables encoded:")
print("\nDiet_Quality:")
for key, value in diet_mapping.items():
    print(f"  {key} -> {value}")

print("\nSmoking_Habit:")
for key, value in smoking_mapping.items():
    print(f"  {key} -> {value}")

print("\nAlcohol_Consumption:")
for key, value in alcohol_mapping.items():
    print(f"  {key} -> {value}")

# Note: Demographic variables (Gender, Occupation, Country) will be one-hot encoded later for classification
print("\n" + "-" * 80)
print("Note: Gender, Occupation, and Country will be one-hot encoded during classification stage")

# Verify encoding
print("\n" + "=" * 80)
print("ENCODED DATA VERIFICATION")
print("=" * 80)
print("\nData types after encoding:")
print(df_encoded.dtypes)

print("\nFirst 5 rows after encoding:")
display(df_encoded.head())

ENCODING CATEGORICAL VARIABLES

Lifestyle categorical variables encoded:

Diet_Quality:
  Healthy -> 0
  Average -> 1
  Unhealthy -> 2

Smoking_Habit:
  Non-Smoker -> 0
  Occasional Smoker -> 1
  Regular Smoker -> 2
  Heavy Smoker -> 3

Alcohol_Consumption:
  Non-Drinker -> 0
  Social Drinker -> 1
  Regular Drinker -> 2
  Heavy Drinker -> 3

--------------------------------------------------------------------------------
Note: Gender, Occupation, and Country will be one-hot encoded during classification stage

ENCODED DATA VERIFICATION

Data types after encoding:
Sleep_Hours                float64
Work_Hours                   int64
Physical_Activity_Hours      int64
Social_Media_Usage         float64
Diet_Quality                 int64
Smoking_Habit                int64
Alcohol_Consumption          int64
Age                          int64
Gender                      object
Occupation                  object
Country                     object
Stress_Level                object
dtype: obj

Unnamed: 0,Sleep_Hours,Work_Hours,Physical_Activity_Hours,Social_Media_Usage,Diet_Quality,Smoking_Habit,Alcohol_Consumption,Age,Gender,Occupation,Country,Stress_Level
0,7.6,46,8,2.2,0,2,2,36,Male,Education,Australia,Low
1,6.8,74,2,3.4,2,3,1,48,Male,Engineering,Other,Low
2,7.1,77,9,5.9,0,3,1,18,Prefer not to say,Sales,India,Medium
3,6.9,57,4,5.4,1,2,2,30,Non-binary,Engineering,Australia,Low
4,4.7,45,10,3.3,2,2,0,58,Male,IT,USA,High


In [47]:
# Cell 5: Normalize and Encode All Features
print("=" * 80)
print("FEATURE NORMALIZATION AND ENCODING")
print("=" * 80)

# Lifestyle features to normalize (already label encoded)
lifestyle_features = [
    'Sleep_Hours',
    'Work_Hours',
    'Physical_Activity_Hours',
    'Social_Media_Usage',
    'Diet_Quality',
    'Smoking_Habit',
    'Alcohol_Consumption'
]

print(f"\n1. Normalizing {len(lifestyle_features)} lifestyle features using Min-Max scaling [0, 1]:")
for feature in lifestyle_features:
    print(f"  - {feature}")

# Apply Min-Max scaling to lifestyle features
scaler_lifestyle = MinMaxScaler(feature_range=(0, 1))
df_normalized = df_encoded.copy()
df_normalized[lifestyle_features] = scaler_lifestyle.fit_transform(df_encoded[lifestyle_features])

# Normalize Age
print(f"\n2. Normalizing Age using Min-Max scaling [0, 1]:")
scaler_age = MinMaxScaler(feature_range=(0, 1))
df_normalized['Age'] = scaler_age.fit_transform(df_encoded[['Age']])
print(f"  - Age")

# One-hot encode Gender, Occupation, Country
print(f"\n3. One-hot encoding categorical demographic variables:")
print(f"  - Gender (unique values: {df_encoded['Gender'].nunique()})")
print(f"  - Occupation (unique values: {df_encoded['Occupation'].nunique()})")
print(f"  - Country (unique values: {df_encoded['Country'].nunique()})")

# Apply one-hot encoding
df_normalized = pd.get_dummies(df_normalized, columns=['Gender', 'Occupation', 'Country'], drop_first=True)

print("\n" + "=" * 80)
print("NORMALIZATION AND ENCODING STATISTICS")
print("=" * 80)

print("\nNormalized lifestyle features - Min and Max values:")
print(df_normalized[lifestyle_features].describe().loc[['min', 'max']])

print(f"\nNormalized Age - Min and Max values:")
print(f"  Min: {df_normalized['Age'].min():.6f}")
print(f"  Max: {df_normalized['Age'].max():.6f}")

print(f"\nOne-hot encoded columns created: {len(df_normalized.columns) - len(df_encoded.columns) + 3}")
print(f"Total columns after encoding: {len(df_normalized.columns)}")

print("\n" + "=" * 80)
print("FIRST 5 ROWS AFTER NORMALIZATION AND ENCODING")
print("=" * 80)
display(df_normalized.head())

FEATURE NORMALIZATION AND ENCODING

1. Normalizing 7 lifestyle features using Min-Max scaling [0, 1]:
  - Sleep_Hours
  - Work_Hours
  - Physical_Activity_Hours
  - Social_Media_Usage
  - Diet_Quality
  - Smoking_Habit
  - Alcohol_Consumption

2. Normalizing Age using Min-Max scaling [0, 1]:
  - Age

3. One-hot encoding categorical demographic variables:
  - Gender (unique values: 4)
  - Occupation (unique values: 7)
  - Country (unique values: 7)

NORMALIZATION AND ENCODING STATISTICS

Normalized lifestyle features - Min and Max values:
     Sleep_Hours  Work_Hours  Physical_Activity_Hours  Social_Media_Usage  \
min          0.0         0.0                      0.0                 0.0   
max          1.0         1.0                      1.0                 1.0   

     Diet_Quality  Smoking_Habit  Alcohol_Consumption  
min           0.0            0.0                  0.0  
max           1.0            1.0                  1.0  

Normalized Age - Min and Max values:
  Min: 0.000000
  

Unnamed: 0,Sleep_Hours,Work_Hours,Physical_Activity_Hours,Social_Media_Usage,Diet_Quality,Smoking_Habit,Alcohol_Consumption,Age,Stress_Level,Gender_Male,Gender_Non-binary,Gender_Prefer not to say,Occupation_Engineering,Occupation_Finance,Occupation_Healthcare,Occupation_IT,Occupation_Other,Occupation_Sales,Country_Canada,Country_Germany,Country_India,Country_Other,Country_UK,Country_USA
0,0.6,0.32,0.8,0.309091,0.0,0.666667,0.666667,0.382979,Low,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,0.466667,0.88,0.2,0.527273,1.0,1.0,0.333333,0.638298,Low,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False
2,0.516667,0.94,0.9,0.981818,0.0,1.0,0.333333,0.0,Medium,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False
3,0.483333,0.54,0.4,0.890909,0.5,0.666667,0.666667,0.255319,Low,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False
4,0.116667,0.3,1.0,0.509091,1.0,0.666667,0.0,0.851064,High,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True


In [50]:
# Cell 6: Create Final Preprocessed Datasets
print("=" * 80)
print("CREATING PREPROCESSED DATASETS")
print("=" * 80)

# Dataset 1: Lifestyle features only (for clustering)
lifestyle_columns = [
    'Sleep_Hours',
    'Work_Hours',
    'Physical_Activity_Hours',
    'Social_Media_Usage',
    'Diet_Quality',
    'Smoking_Habit',
    'Alcohol_Consumption'
]

df_lifestyle = df_normalized[lifestyle_columns].copy()

print("\n1. LIFESTYLE DATASET (for clustering):")
print(f"   Shape: {df_lifestyle.shape[0]} rows, {df_lifestyle.shape[1]} columns")
print(f"   Columns: {df_lifestyle.columns.tolist()}")

# Dataset 2: Lifestyle features + Target (for main classification)
df_lifestyle_target = df_normalized[lifestyle_columns + ['Stress_Level']].copy()

print("\n2. LIFESTYLE + TARGET DATASET (for main classification):")
print(f"   Shape: {df_lifestyle_target.shape[0]} rows, {df_lifestyle_target.shape[1]} columns")
print(f"   Columns: {df_lifestyle_target.columns.tolist()}")

# Dataset 3: Lifestyle + Demographics + Target (for additional analysis)
# Get all demographic columns (Age + one-hot encoded Gender, Occupation, Country)
demographic_columns = [col for col in df_normalized.columns 
                      if col not in lifestyle_columns and col != 'Stress_Level']
df_full = df_normalized[lifestyle_columns + demographic_columns + ['Stress_Level']].copy()

print("\n3. FULL DATASET (for additional analysis with demographics):")
print(f"   Shape: {df_full.shape[0]} rows, {df_full.shape[1]} columns")
print(f"   Lifestyle columns: {len(lifestyle_columns)}")
print(f"   Demographic columns: {len(demographic_columns)}")
print(f"     - Age: 1")
print(f"     - Gender (one-hot): {len([c for c in demographic_columns if c.startswith('Gender_')])}")
print(f"     - Occupation (one-hot): {len([c for c in demographic_columns if c.startswith('Occupation_')])}")
print(f"     - Country (one-hot): {len([c for c in demographic_columns if c.startswith('Country_')])}")
print(f"   Target: 1 (Stress_Level)")

# Verify no extra columns leaked through
print("\n" + "=" * 80)
print("COLUMN VERIFICATION")
print("=" * 80)
expected_cols = set(lifestyle_columns + demographic_columns + ['Stress_Level'])
actual_cols = set(df_full.columns)
print(f"Expected columns match actual columns: {expected_cols == actual_cols}")
print(f"Total columns in full dataset: {len(df_full.columns)}")

print("\n" + "=" * 80)
print("PREVIEW OF DATASETS")
print("=" * 80)

print("\nLifestyle Dataset (first 5 rows):")
display(df_lifestyle.head())

print("\nLifestyle + Target Dataset (first 5 rows):")
display(df_lifestyle_target.head())

print("\nFull Dataset with Demographics (first 5 rows, first 15 columns):")
display(df_full.iloc[:5, :15])

CREATING PREPROCESSED DATASETS

1. LIFESTYLE DATASET (for clustering):
   Shape: 50000 rows, 7 columns
   Columns: ['Sleep_Hours', 'Work_Hours', 'Physical_Activity_Hours', 'Social_Media_Usage', 'Diet_Quality', 'Smoking_Habit', 'Alcohol_Consumption']

2. LIFESTYLE + TARGET DATASET (for main classification):
   Shape: 50000 rows, 8 columns
   Columns: ['Sleep_Hours', 'Work_Hours', 'Physical_Activity_Hours', 'Social_Media_Usage', 'Diet_Quality', 'Smoking_Habit', 'Alcohol_Consumption', 'Stress_Level']

3. FULL DATASET (for additional analysis with demographics):
   Shape: 50000 rows, 24 columns
   Lifestyle columns: 7
   Demographic columns: 16
     - Age: 1
     - Gender (one-hot): 3
     - Occupation (one-hot): 6
     - Country (one-hot): 6
   Target: 1 (Stress_Level)

COLUMN VERIFICATION
Expected columns match actual columns: True
Total columns in full dataset: 24

PREVIEW OF DATASETS

Lifestyle Dataset (first 5 rows):


Unnamed: 0,Sleep_Hours,Work_Hours,Physical_Activity_Hours,Social_Media_Usage,Diet_Quality,Smoking_Habit,Alcohol_Consumption
0,0.6,0.32,0.8,0.309091,0.0,0.666667,0.666667
1,0.466667,0.88,0.2,0.527273,1.0,1.0,0.333333
2,0.516667,0.94,0.9,0.981818,0.0,1.0,0.333333
3,0.483333,0.54,0.4,0.890909,0.5,0.666667,0.666667
4,0.116667,0.3,1.0,0.509091,1.0,0.666667,0.0



Lifestyle + Target Dataset (first 5 rows):


Unnamed: 0,Sleep_Hours,Work_Hours,Physical_Activity_Hours,Social_Media_Usage,Diet_Quality,Smoking_Habit,Alcohol_Consumption,Stress_Level
0,0.6,0.32,0.8,0.309091,0.0,0.666667,0.666667,Low
1,0.466667,0.88,0.2,0.527273,1.0,1.0,0.333333,Low
2,0.516667,0.94,0.9,0.981818,0.0,1.0,0.333333,Medium
3,0.483333,0.54,0.4,0.890909,0.5,0.666667,0.666667,Low
4,0.116667,0.3,1.0,0.509091,1.0,0.666667,0.0,High



Full Dataset with Demographics (first 5 rows, first 15 columns):


Unnamed: 0,Sleep_Hours,Work_Hours,Physical_Activity_Hours,Social_Media_Usage,Diet_Quality,Smoking_Habit,Alcohol_Consumption,Age,Gender_Male,Gender_Non-binary,Gender_Prefer not to say,Occupation_Engineering,Occupation_Finance,Occupation_Healthcare,Occupation_IT
0,0.6,0.32,0.8,0.309091,0.0,0.666667,0.666667,0.382979,True,False,False,False,False,False,False
1,0.466667,0.88,0.2,0.527273,1.0,1.0,0.333333,0.638298,True,False,False,True,False,False,False
2,0.516667,0.94,0.9,0.981818,0.0,1.0,0.333333,0.0,False,False,True,False,False,False,False
3,0.483333,0.54,0.4,0.890909,0.5,0.666667,0.666667,0.255319,False,True,False,True,False,False,False
4,0.116667,0.3,1.0,0.509091,1.0,0.666667,0.0,0.851064,True,False,False,False,False,False,True


In [51]:
# Cell 7: Quality Verification - Lifestyle Features
print("=" * 80)
print("DATA QUALITY VERIFICATION - LIFESTYLE FEATURES")
print("=" * 80)

# Check for missing values
print("\n1. Missing Values:")
print(f"   Total missing values: {df_lifestyle_target.isnull().sum().sum()}")

# Check data types
print("\n2. Data Types:")
print(df_lifestyle_target.dtypes)

# Check value ranges for normalized lifestyle features
print("\n3. Lifestyle Feature Ranges (should be [0, 1]):")
for col in lifestyle_columns:
    min_val = df_lifestyle_target[col].min()
    max_val = df_lifestyle_target[col].max()
    print(f"   {col}: [{min_val:.6f}, {max_val:.6f}]")

# Check target variable distribution
print("\n4. Target Variable Distribution:")
stress_counts = df_lifestyle_target['Stress_Level'].value_counts().sort_index()
stress_percent = df_lifestyle_target['Stress_Level'].value_counts(normalize=True).sort_index() * 100
for level in stress_counts.index:
    print(f"   {level}: {stress_counts[level]} ({stress_percent[level]:.2f}%)")

# Check for duplicates
print(f"\n5. Duplicate Rows: {df_lifestyle_target.duplicated().sum()}")

# Summary statistics
print("\n6. Summary Statistics - Lifestyle Features:")
display(df_lifestyle_target[lifestyle_columns].describe())

DATA QUALITY VERIFICATION - LIFESTYLE FEATURES

1. Missing Values:
   Total missing values: 0

2. Data Types:
Sleep_Hours                float64
Work_Hours                 float64
Physical_Activity_Hours    float64
Social_Media_Usage         float64
Diet_Quality               float64
Smoking_Habit              float64
Alcohol_Consumption        float64
Stress_Level                object
dtype: object

3. Lifestyle Feature Ranges (should be [0, 1]):
   Sleep_Hours: [0.000000, 1.000000]
   Work_Hours: [0.000000, 1.000000]
   Physical_Activity_Hours: [0.000000, 1.000000]
   Social_Media_Usage: [0.000000, 1.000000]
   Diet_Quality: [0.000000, 1.000000]
   Smoking_Habit: [0.000000, 1.000000]
   Alcohol_Consumption: [0.000000, 1.000000]

4. Target Variable Distribution:
   High: 16707 (33.41%)
   Low: 16446 (32.89%)
   Medium: 16847 (33.69%)

5. Duplicate Rows: 6

6. Summary Statistics - Lifestyle Features:


Unnamed: 0,Sleep_Hours,Work_Hours,Physical_Activity_Hours,Social_Media_Usage,Diet_Quality,Smoking_Habit,Alcohol_Consumption
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.501656,0.501257,0.498204,0.498756,0.50091,0.498067,0.499813
std,0.288779,0.293832,0.316176,0.288225,0.408457,0.372712,0.372568
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.24,0.2,0.254545,0.0,0.0,0.0
50%,0.5,0.5,0.5,0.490909,0.5,0.333333,0.666667
75%,0.75,0.76,0.8,0.745455,1.0,0.666667,0.666667
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [52]:
# Cell 8: Quality Verification - Full Dataset with Demographics
print("=" * 80)
print("DATA QUALITY VERIFICATION - FULL DATASET")
print("=" * 80)

# Verify ONLY expected columns exist
print("\n1. Column Verification:")
print(f"   Total columns: {len(df_full.columns)}")
print(f"   Lifestyle features: {len(lifestyle_columns)}")
print(f"   Demographic features: {len(demographic_columns)}")
print(f"   Target variable: 1")
print(f"   Sum matches: {len(lifestyle_columns) + len(demographic_columns) + 1 == len(df_full.columns)}")

# Check for missing values
print("\n2. Missing Values:")
print(f"   Total missing values: {df_full.isnull().sum().sum()}")

# Check Age normalization
print("\n3. Age Normalization:")
print(f"   Min: {df_full['Age'].min():.6f}")
print(f"   Max: {df_full['Age'].max():.6f}")

# Check one-hot encoded columns
onehot_columns = [col for col in df_full.columns if col.startswith(('Gender_', 'Occupation_', 'Country_'))]
print(f"\n4. One-Hot Encoded Columns:")
print(f"   Total one-hot columns: {len(onehot_columns)}")
gender_cols = [col for col in onehot_columns if col.startswith('Gender_')]
occupation_cols = [col for col in onehot_columns if col.startswith('Occupation_')]
country_cols = [col for col in onehot_columns if col.startswith('Country_')]
print(f"   Gender columns: {len(gender_cols)}")
print(f"   Occupation columns: {len(occupation_cols)}")
print(f"   Country columns: {len(country_cols)}")

# Verify binary values in one-hot columns
print(f"\n5. One-Hot Encoding Verification (should only contain 0 and 1):")
unique_values = df_full[onehot_columns].apply(lambda x: x.unique())
all_binary = all(len(vals) <= 2 and all(v in [0, 1, 0.0, 1.0] for v in vals) for vals in unique_values)
print(f"   All one-hot columns are binary: {all_binary}")

# Check for duplicates
print(f"\n6. Duplicate Rows: {df_full.duplicated().sum()}")

# Confirm no unwanted columns (like Severity, Medication, etc.)
print("\n7. Unwanted Columns Check:")
unwanted = ['Severity', 'Medication', 'Consultation', 'Mental_Health_Diagnosis', 'User_ID']
found_unwanted = [col for col in df_full.columns if any(uw.lower() in col.lower() for uw in unwanted)]
print(f"   Unwanted columns found: {len(found_unwanted)}")
if found_unwanted:
    print(f"   WARNING: {found_unwanted}")
else:
    print(f"   Clean: No unwanted columns detected")

DATA QUALITY VERIFICATION - FULL DATASET

1. Column Verification:
   Total columns: 24
   Lifestyle features: 7
   Demographic features: 16
   Target variable: 1
   Sum matches: True

2. Missing Values:
   Total missing values: 0

3. Age Normalization:
   Min: 0.000000
   Max: 1.000000

4. One-Hot Encoded Columns:
   Total one-hot columns: 15
   Gender columns: 3
   Occupation columns: 6
   Country columns: 6

5. One-Hot Encoding Verification (should only contain 0 and 1):
   All one-hot columns are binary: False

6. Duplicate Rows: 0

7. Unwanted Columns Check:
   Unwanted columns found: 0
   Clean: No unwanted columns detected


In [53]:
# Cell 9: Save Preprocessed Datasets
print("=" * 80)
print("SAVING PREPROCESSED DATASETS")
print("=" * 80)

# Define save directory
save_dir = '../data/'

# Save lifestyle dataset (for clustering)
df_lifestyle.to_csv(f'{save_dir}preprocessed_lifestyle_features.csv', index=False)
print(f"\n1. Lifestyle features saved as: {save_dir}preprocessed_lifestyle_features.csv")
print(f"   Shape: {df_lifestyle.shape[0]} rows, {df_lifestyle.shape[1]} columns")
print(f"   Use: K-Means clustering")

# Save lifestyle + target dataset (for main classification)
df_lifestyle_target.to_csv(f'{save_dir}preprocessed_lifestyle_target.csv', index=False)
print(f"\n2. Lifestyle + target saved as: {save_dir}preprocessed_lifestyle_target.csv")
print(f"   Shape: {df_lifestyle_target.shape[0]} rows, {df_lifestyle_target.shape[1]} columns")
print(f"   Use: Main classification (7 lifestyle features + cluster labels)")

# Save full dataset with demographics (for additional analysis)
df_full.to_csv(f'{save_dir}preprocessed_full_dataset.csv', index=False)
print(f"\n3. Full dataset saved as: {save_dir}preprocessed_full_dataset.csv")
print(f"   Shape: {df_full.shape[0]} rows, {df_full.shape[1]} columns")
print(f"   Use: Additional classification (lifestyle + demographics + cluster labels)")

SAVING PREPROCESSED DATASETS

1. Lifestyle features saved as: ../data/preprocessed_lifestyle_features.csv
   Shape: 50000 rows, 7 columns
   Use: K-Means clustering

2. Lifestyle + target saved as: ../data/preprocessed_lifestyle_target.csv
   Shape: 50000 rows, 8 columns
   Use: Main classification (7 lifestyle features + cluster labels)

3. Full dataset saved as: ../data/preprocessed_full_dataset.csv
   Shape: 50000 rows, 24 columns
   Use: Additional classification (lifestyle + demographics + cluster labels)


In [58]:
# Cell 10: Train-Validation-Test Split (70-20-10)
print("=" * 80)
print("TRAIN-VALIDATION-TEST SPLIT")
print("=" * 80)

from sklearn.model_selection import train_test_split

# Split ratios: 70% train, 20% validation, 10% test
print("\nSplit ratios:")
print("  - Training: 70%")
print("  - Validation: 20%")
print("  - Testing: 10%")

# First split: 70% train, 30% temp (validation + test)
train_data, temp_data = train_test_split(df_lifestyle_target, test_size=0.30, random_state=42, stratify=df_lifestyle_target['Stress_Level'])

# Second split: 20% validation, 10% test (from the 30% temp)
val_data, test_data = train_test_split(temp_data, test_size=(10/30), random_state=42, stratify=temp_data['Stress_Level'])

print("\n" + "=" * 80)
print("SPLIT VERIFICATION")
print("=" * 80)
print(f"\nOriginal dataset: {len(df_lifestyle_target)} rows")
print(f"Training set: {len(train_data)} rows ({len(train_data)/len(df_lifestyle_target)*100:.1f}%)")
print(f"Validation set: {len(val_data)} rows ({len(val_data)/len(df_lifestyle_target)*100:.1f}%)")
print(f"Test set: {len(test_data)} rows ({len(test_data)/len(df_lifestyle_target)*100:.1f}%)")

# Verify stress level distribution in each set
print("\n" + "=" * 80)
print("STRESS LEVEL DISTRIBUTION ACROSS SPLITS")
print("=" * 80)

print("\nTraining Set:")
train_dist = train_data['Stress_Level'].value_counts(normalize=True).sort_index() * 100
for level in train_dist.index:
    print(f"  {level}: {train_dist[level]:.2f}%")

print("\nValidation Set:")
val_dist = val_data['Stress_Level'].value_counts(normalize=True).sort_index() * 100
for level in val_dist.index:
    print(f"  {level}: {val_dist[level]:.2f}%")

print("\nTest Set:")
test_dist = test_data['Stress_Level'].value_counts(normalize=True).sort_index() * 100
for level in test_dist.index:
    print(f"  {level}: {test_dist[level]:.2f}%")

# Extract lifestyle features only (for clustering)
print("\n" + "=" * 80)
print("EXTRACTING LIFESTYLE FEATURES (FOR CLUSTERING)")
print("=" * 80)

lifestyle_columns = [
    'Sleep_Hours',
    'Work_Hours',
    'Physical_Activity_Hours',
    'Social_Media_Usage',
    'Diet_Quality',
    'Smoking_Habit',
    'Alcohol_Consumption'
]

train_lifestyle = train_data[lifestyle_columns].copy()
val_lifestyle = val_data[lifestyle_columns].copy()
test_lifestyle = test_data[lifestyle_columns].copy()

print(f"\nLifestyle features extracted (no target variable):")
print(f"  Training: {train_lifestyle.shape[0]} rows, {train_lifestyle.shape[1]} columns")
print(f"  Validation: {val_lifestyle.shape[0]} rows, {val_lifestyle.shape[1]} columns")
print(f"  Test: {test_lifestyle.shape[0]} rows, {test_lifestyle.shape[1]} columns")

TRAIN-VALIDATION-TEST SPLIT

Split ratios:
  - Training: 70%
  - Validation: 20%
  - Testing: 10%

SPLIT VERIFICATION

Original dataset: 50000 rows
Training set: 35000 rows (70.0%)
Validation set: 10000 rows (20.0%)
Test set: 5000 rows (10.0%)

STRESS LEVEL DISTRIBUTION ACROSS SPLITS

Training Set:
  High: 33.41%
  Low: 32.89%
  Medium: 33.69%

Validation Set:
  High: 33.41%
  Low: 32.90%
  Medium: 33.69%

Test Set:
  High: 33.42%
  Low: 32.88%
  Medium: 33.70%

EXTRACTING LIFESTYLE FEATURES (FOR CLUSTERING)

Lifestyle features extracted (no target variable):
  Training: 35000 rows, 7 columns
  Validation: 10000 rows, 7 columns
  Test: 5000 rows, 7 columns


In [61]:
# Cell 11: Split Full Dataset (with demographics) - Same indices
print("=" * 80)
print("SPLITTING FULL DATASET (WITH DEMOGRAPHICS)")
print("=" * 80)

# Use the same indices from the lifestyle split to maintain consistency
train_full = df_full.loc[train_data.index]
val_full = df_full.loc[val_data.index]
test_full = df_full.loc[test_data.index]

print(f"\nFull dataset splits (with demographics):")
print(f"  Training: {len(train_full)} rows, {train_full.shape[1]} columns")
print(f"  Validation: {len(val_full)} rows, {val_full.shape[1]} columns")
print(f"  Test: {len(test_full)} rows, {test_full.shape[1]} columns")

# Verify indices match
print(f"\nIndex alignment verification:")
print(f"  Train indices match: {train_data.index.equals(train_full.index)}")
print(f"  Val indices match: {val_data.index.equals(val_full.index)}")
print(f"  Test indices match: {test_data.index.equals(test_full.index)}")

SPLITTING FULL DATASET (WITH DEMOGRAPHICS)

Full dataset splits (with demographics):
  Training: 35000 rows, 24 columns
  Validation: 10000 rows, 24 columns
  Test: 5000 rows, 24 columns

Index alignment verification:
  Train indices match: True
  Val indices match: True
  Test indices match: True


In [63]:
# Cell 12: Save Split Datasets
print("=" * 80)
print("SAVING SPLIT DATASETS")
print("=" * 80)

save_dir = '../data/'

# Save lifestyle features only (for clustering)
train_lifestyle.to_csv(f'{save_dir}train_lifestyle_features.csv', index=False)
val_lifestyle.to_csv(f'{save_dir}val_lifestyle_features.csv', index=False)
test_lifestyle.to_csv(f'{save_dir}test_lifestyle_features.csv', index=False)

print("\n1. Lifestyle features (for clustering) saved:")
print(f"   - {save_dir}train_lifestyle_features.csv ({len(train_lifestyle)} rows, 7 columns)")
print(f"   - {save_dir}val_lifestyle_features.csv ({len(val_lifestyle)} rows, 7 columns)")
print(f"   - {save_dir}test_lifestyle_features.csv ({len(test_lifestyle)} rows, 7 columns)")

# Save lifestyle + target (for classification)
train_data.to_csv(f'{save_dir}train_lifestyle_target.csv', index=False)
val_data.to_csv(f'{save_dir}val_lifestyle_target.csv', index=False)
test_data.to_csv(f'{save_dir}test_lifestyle_target.csv', index=False)

print("\n2. Lifestyle + target (for classification) saved:")
print(f"   - {save_dir}train_lifestyle_target.csv ({len(train_data)} rows)")
print(f"   - {save_dir}val_lifestyle_target.csv ({len(val_data)} rows)")
print(f"   - {save_dir}test_lifestyle_target.csv ({len(test_data)} rows)")

# Save full splits (with demographics)
train_full.to_csv(f'{save_dir}train_full.csv', index=False)
val_full.to_csv(f'{save_dir}val_full.csv', index=False)
test_full.to_csv(f'{save_dir}test_full.csv', index=False)

print("\n3. Full datasets (with demographics) saved:")
print(f"   - {save_dir}train_full.csv ({len(train_full)} rows)")
print(f"   - {save_dir}val_full.csv ({len(val_full)} rows)")
print(f"   - {save_dir}test_full.csv ({len(test_full)} rows)")

print("\n" + "=" * 80)
print("PREPROCESSING AND SPLITTING COMPLETE")
print("=" * 80)
print("\nDatasets Created:")
print("\n1. For Clustering (Notebook 03):")
print(f"   - train_lifestyle_features.csv (7 features, no target)")
print(f"   - val_lifestyle_features.csv (7 features, no target)")
print(f"   - test_lifestyle_features.csv (7 features, no target)")
print("\n2. For Main Classification (Notebook 04):")
print(f"   - train_lifestyle_target.csv (7 features + target)")
print(f"   - val_lifestyle_target.csv (7 features + target)")
print(f"   - test_lifestyle_target.csv (7 features + target)")
print("\n3. For Additional Analysis (Notebook 05):")
print(f"   - train_full.csv (lifestyle + demographics + target)")
print(f"   - val_full.csv (lifestyle + demographics + target)")
print(f"   - test_full.csv (lifestyle + demographics + target)")
print("\nNext Steps:")
print("  1. Fit K-Means on train_lifestyle_features.csv ONLY")
print("  2. Transform val and test sets using fitted model")
print("  3. Add cluster labels to all datasets")
print("  4. Proceed to classification")

SAVING SPLIT DATASETS

1. Lifestyle features (for clustering) saved:
   - ../data/train_lifestyle_features.csv (35000 rows, 7 columns)
   - ../data/val_lifestyle_features.csv (10000 rows, 7 columns)
   - ../data/test_lifestyle_features.csv (5000 rows, 7 columns)

2. Lifestyle + target (for classification) saved:
   - ../data/train_lifestyle_target.csv (35000 rows)
   - ../data/val_lifestyle_target.csv (10000 rows)
   - ../data/test_lifestyle_target.csv (5000 rows)

3. Full datasets (with demographics) saved:
   - ../data/train_full.csv (35000 rows)
   - ../data/val_full.csv (10000 rows)
   - ../data/test_full.csv (5000 rows)

PREPROCESSING AND SPLITTING COMPLETE

Datasets Created:

1. For Clustering (Notebook 03):
   - train_lifestyle_features.csv (7 features, no target)
   - val_lifestyle_features.csv (7 features, no target)
   - test_lifestyle_features.csv (7 features, no target)

2. For Main Classification (Notebook 04):
   - train_lifestyle_target.csv (7 features + target)
   - val