# Phase 2 — Preprocessing

This notebook will:
1) Load the Phase 1 clean CSV  
2) Normalize text labels (underscores → spaces, trim, Title Case)  
3) Encode categories (ordinal features → integer ranks, target label → 0/1/2, nominal features → one-hot)  
4) Scale numeric features (Min-Max 0–1)  
5) Split Train / Val / Test (70 / 20 / 10)  
6) Save processed files + scaler


In [1]:
# Load the cleaned Phase 1 output and declare the column groups used by later cells
from pathlib import Path
import pandas as pd

# If the kernel was started inside notebooks/, the project root is one level up;
# otherwise the current working directory is treated as the project root.
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()

CLEAN_CSV_PATH = PROJECT_ROOT / "outputs" / "mh_lifestyle_phase1_clean.csv"
df = pd.read_csv(CLEAN_CSV_PATH)
print("Shape:", df.shape)

#  Define column groups
num_cols   = ["Sleep_Hours","Work_Hours","Physical_Activity_Hours","Social_Media_Usage","Age"]
ord_cols   = ["Diet_Quality","Smoking_Habit","Alcohol_Consumption"]   # ordinal categories
nom_cols   = [c for c in ["Gender","Occupation","Country"] if c in df.columns]  # nominal (optional)
target_col = "Stress_Level"

# Preview goes last: a notebook only renders the final expression of a cell,
# so a bare `df.head(3)` in the middle of the cell was silently discarded.
df.head(3)


Shape: (50000, 12)


In [2]:
# Normalize text labels and inspect category values

# Clean up text columns so categories are consistent.
# Columns that are absent from df are skipped.
for c in ord_cols + nom_cols + [target_col]:
    if c in df.columns:
        cleaned = (
            df[c].astype(str)                        # ensure string
                 .str.replace("_", " ", regex=False)  # 'non_smoker' -> 'non smoker'
                 .str.strip()                         # remove extra spaces
                 .str.title()                         # e.g., 'low' -> 'Low'
        )
        # astype(str) turns missing values into the literal text "nan", which
        # .title() would then promote to a bogus "Nan" category. Keep genuinely
        # missing entries missing instead.
        df[c] = cleaned.where(df[c].notna())

# Peek at unique values to know exactly what to encode next.
# Missing values are dropped first: sorted() cannot compare NaN with strings.
for c in ord_cols + [target_col]:
    if c in df.columns:
        print(f"\n{c} unique values:")
        print(sorted(df[c].dropna().unique()))



Diet_Quality unique values:
['Average', 'Healthy', 'Unhealthy']

Smoking_Habit unique values:
['Heavy Smoker', 'Non-Smoker', 'Occasional Smoker', 'Regular Smoker']

Alcohol_Consumption unique values:
['Heavy Drinker', 'Non-Drinker', 'Regular Drinker', 'Social Drinker']

Stress_Level unique values:
['High', 'Low', 'Medium']


In [3]:
# Ordinal encode (minimal, exact to YOUR labels)

# One mapping per ordinal column; labels are listed from lowest to highest level.
ordinal_maps = {
    # Diet_Quality: Unhealthy < Average < Healthy
    "Diet_Quality": {
        "Unhealthy": 0,
        "Average":   1,
        "Healthy":   2,
    },
    # Smoking_Habit: Non-Smoker < Occasional < Regular < Heavy
    "Smoking_Habit": {
        "Non-Smoker":        0,
        "Occasional Smoker": 1,
        "Regular Smoker":    2,
        "Heavy Smoker":      3,
    },
    # Alcohol_Consumption: Non-Drinker < Social < Regular < Heavy
    "Alcohol_Consumption": {
        "Non-Drinker":     0,
        "Social Drinker":  1,
        "Regular Drinker": 2,
        "Heavy Drinker":   3,
    },
}

for col, mapping in ordinal_maps.items():
    # Series.map() silently turns every label missing from the dict into NaN
    # (this also happens if the cell is run a second time on already-encoded
    # integers). Fail loudly instead of corrupting the column.
    unexpected = sorted(set(df[col].dropna().unique()) - set(mapping), key=str)
    if unexpected:
        raise ValueError(
            f"{col}: cannot encode unexpected labels {unexpected}; "
            f"expected only {list(mapping)}. If this column is already encoded, "
            "re-run the notebook from the load cell."
        )
    df[col] = df[col].map(mapping)

# Quick verification
for c in ordinal_maps:
    na = df[c].isna().sum()
    print(f"{c}: dtype={df[c].dtype}, NaNs={na}")


Diet_Quality: dtype=int64, NaNs=0
Smoking_Habit: dtype=int64, NaNs=0
Alcohol_Consumption: dtype=int64, NaNs=0


In [None]:
# encode target (Low=0, Medium=1, High=2)
y_map = {"Low":0, "Medium":1, "High":2}

# Series.map() would silently turn any label outside y_map into NaN (including
# the integers left behind by a previous run of this cell), so validate first.
unknown_labels = sorted(set(df[target_col].dropna().unique()) - set(y_map), key=str)
if unknown_labels:
    raise ValueError(
        f"{target_col}: cannot encode unexpected labels {unknown_labels}; "
        f"expected only {list(y_map)}. If the target is already encoded, "
        "re-run the notebook from the load cell."
    )

df[target_col] = df[target_col].map(y_map)
print("Target counts (encoded):")
print(df[target_col].value_counts().sort_index())


Target counts (encoded):
Stress_Level
0    16446
1    16847
2    16707
Name: count, dtype: int64


In [5]:
# One-hot encode whichever of Gender / Occupation / Country are present in df
import pandas as pd

nom_cols = [
    name
    for name in ["Gender", "Occupation", "Country"]
    if name in df.columns
]

# drop_first=True omits the first level of each nominal column, so that level
# is represented by all of the column's dummies being False.
df_enc = pd.get_dummies(
    df,
    columns=nom_cols,
    drop_first=True,
)

print("Encoded shape:", df_enc.shape)
df_enc.head(2)


Encoded shape: (50000, 24)


Unnamed: 0,Sleep_Hours,Work_Hours,Physical_Activity_Hours,Social_Media_Usage,Diet_Quality,Smoking_Habit,Alcohol_Consumption,Stress_Level,Age,Gender_Male,...,Occupation_Healthcare,Occupation_It,Occupation_Other,Occupation_Sales,Country_Canada,Country_Germany,Country_India,Country_Other,Country_Uk,Country_Usa
0,7.6,46,8,2.2,2,2,2,0,36,True,...,False,False,False,False,False,False,False,False,False,False
1,6.8,74,2,3.4,0,3,1,0,48,True,...,False,False,False,False,False,False,False,True,False,False


In [6]:
# Report which nominal columns were one-hot encoded and what they produced
print("Nominal cols detected:", nom_cols)


def _is_dummy(col_name):
    """Return True when col_name starts with '<nominal column>_' for any entry of nom_cols."""
    return any(col_name.startswith(prefix + "_") for prefix in nom_cols)


# Up to ten distinct raw values per nominal column (taken from the pre-encoding frame)
for c in nom_cols:
    sample_values = list(df[c].unique())[:10]
    print(f"\n{c} uniques (sample):", sample_values)

# Columns that are new in df_enc, or that carry a nominal-column prefix
new_cols = [
    col for col in df_enc.columns
    if col not in df.columns or _is_dummy(col)
]
print("\nNew dummy columns (first 30):", new_cols[:30])
print("Total new dummy cols:", len(new_cols))

# Only the dummy columns, for a quick visual check
dummy_view_cols = [col for col in df_enc.columns if _is_dummy(col)]
print("\nExample rows (dummy columns only):")
display(df_enc[dummy_view_cols].head(3))


Nominal cols detected: ['Gender', 'Occupation', 'Country']

Gender uniques (sample): ['Male', 'Prefer Not To Say', 'Non-Binary', 'Female']

Occupation uniques (sample): ['Education', 'Engineering', 'Sales', 'It', 'Healthcare', 'Other', 'Finance']

Country uniques (sample): ['Australia', 'Other', 'India', 'Usa', 'Germany', 'Canada', 'Uk']

New dummy columns (first 30): ['Gender_Male', 'Gender_Non-Binary', 'Gender_Prefer Not To Say', 'Occupation_Engineering', 'Occupation_Finance', 'Occupation_Healthcare', 'Occupation_It', 'Occupation_Other', 'Occupation_Sales', 'Country_Canada', 'Country_Germany', 'Country_India', 'Country_Other', 'Country_Uk', 'Country_Usa']
Total new dummy cols: 15

Example rows (dummy columns only):


Unnamed: 0,Gender_Male,Gender_Non-Binary,Gender_Prefer Not To Say,Occupation_Engineering,Occupation_Finance,Occupation_Healthcare,Occupation_It,Occupation_Other,Occupation_Sales,Country_Canada,Country_Germany,Country_India,Country_Other,Country_Uk,Country_Usa
0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False
2,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False


In [7]:
# Scale numeric features (Min-Max) and save the fitted scaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from pathlib import Path
import pickle

num_cols_actual = [c for c in ["Sleep_Hours","Work_Hours","Physical_Activity_Hours","Social_Media_Usage","Age"] if c in df_enc.columns]

# Fit the scaler on the TRAINING rows only. Fitting on the full frame would
# leak the min/max of the validation and test rows into the training features
# and into the saved scaler.
# The training rows are selected with the same parameters as the train/temp
# split performed later in this notebook (train_size=0.70, random_state=42,
# stratified on the target), so the same rows are picked both times.
# Keep the two calls in sync if either is changed.
train_numeric, _ = train_test_split(
    df_enc[num_cols_actual],
    train_size=0.70,
    random_state=42,
    stratify=df_enc[target_col],
)

scaler = MinMaxScaler()
scaler.fit(train_numeric)

# Apply the train-fitted scaling to every row. Validation/test values may fall
# slightly outside [0, 1] if they exceed the range seen in training.
df_enc[num_cols_actual] = scaler.transform(df_enc[num_cols_actual])

# save scaler for reuse
ART_DIR = (Path.cwd().parent if Path.cwd().name=="notebooks" else Path.cwd()) / "outputs" / "artifacts"
ART_DIR.mkdir(parents=True, exist_ok=True)
with open(ART_DIR / "minmax_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

df_enc[num_cols_actual].head(3)


Unnamed: 0,Sleep_Hours,Work_Hours,Physical_Activity_Hours,Social_Media_Usage,Age
0,0.6,0.32,0.8,0.309091,0.382979
1,0.466667,0.88,0.2,0.527273,0.638298
2,0.516667,0.94,0.9,0.981818,0.0


In [9]:
# Build X/y and carve out a stratified 70 / 20 / 10 train / val / test split
from sklearn.model_selection import train_test_split

target_col = "Stress_Level"  # same as before

# Features are every column of the encoded+scaled frame except the target
y = df_enc[target_col]
X = df_enc.drop(columns=[target_col])

# Stage 1: 70% train, remaining 30% held back for val+test
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    train_size=0.70,
    random_state=42,
    stratify=y,
)

# Stage 2: split the held-back 30% so that 1/3 becomes test (10% overall)
# and 2/3 becomes validation (20% overall)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=1/3,
    random_state=42,
    stratify=y_temp,
)

# Quick report
print("Shapes:")
for label, frame in (("  Train:", X_train), ("  Val:  ", X_val), ("  Test: ", X_test)):
    print(label, frame.shape)

print("\nClass balance (%):")
for name, arr in {"train": y_train, "val": y_val, "test": y_test}.items():
    shares = arr.value_counts(normalize=True).sort_index().mul(100).round(2)
    print(f"  {name}: {shares.to_dict()}")


Shapes:
  Train: (35000, 23)
  Val:   (10000, 23)
  Test:  (5000, 23)

Class balance (%):
  train: {0: 32.89, 1: 33.69, 2: 33.41}
  val: {0: 32.89, 1: 33.69, 2: 33.42}
  test: {0: 32.9, 1: 33.7, 2: 33.4}


In [11]:
# Final look at the encoded + scaled frame
print("Final dataset shape:", df_enc.shape)
print("\nColumn samples:", list(df_enc.columns)[:15], "...")

# Show every column for this preview only. option_context restores the previous
# display setting on exit, whereas pd.set_option would change it for the rest
# of the kernel session. display() is needed because the frame is no longer the
# cell's last top-level expression.
with pd.option_context("display.max_columns", None):
    display(df_enc.head(5))



Final dataset shape: (50000, 24)

Column samples: ['Sleep_Hours', 'Work_Hours', 'Physical_Activity_Hours', 'Social_Media_Usage', 'Diet_Quality', 'Smoking_Habit', 'Alcohol_Consumption', 'Stress_Level', 'Age', 'Gender_Male', 'Gender_Non-Binary', 'Gender_Prefer Not To Say', 'Occupation_Engineering', 'Occupation_Finance', 'Occupation_Healthcare'] ...


Unnamed: 0,Sleep_Hours,Work_Hours,Physical_Activity_Hours,Social_Media_Usage,Diet_Quality,Smoking_Habit,Alcohol_Consumption,Stress_Level,Age,Gender_Male,Gender_Non-Binary,Gender_Prefer Not To Say,Occupation_Engineering,Occupation_Finance,Occupation_Healthcare,Occupation_It,Occupation_Other,Occupation_Sales,Country_Canada,Country_Germany,Country_India,Country_Other,Country_Uk,Country_Usa
0,0.6,0.32,0.8,0.309091,2,2,2,0,0.382979,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,0.466667,0.88,0.2,0.527273,0,3,1,0,0.638298,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False
2,0.516667,0.94,0.9,0.981818,2,3,1,1,0.0,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False
3,0.483333,0.54,0.4,0.890909,1,2,2,0,0.255319,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False
4,0.116667,0.3,1.0,0.509091,0,2,0,2,0.851064,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True


In [12]:
from pathlib import Path

# outputs/ lives under the project root (one level up when running from notebooks/)
here = Path.cwd()
OUT_DIR = (here.parent if here.name == "notebooks" else here) / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# File name -> split to write; every file is saved without the index column
split_files = {
    "X_train.csv": X_train,
    "X_val.csv":   X_val,
    "X_test.csv":  X_test,
    "y_train.csv": y_train,
    "y_val.csv":   y_val,
    "y_test.csv":  y_test,
}
for filename, split in split_files.items():
    split.to_csv(OUT_DIR / filename, index=False)

# Feature names in a one-column ("feature") CSV, handy when modeling later
feature_names = X.columns.to_series().to_frame("feature")
feature_names.to_csv(OUT_DIR / "feature_columns.csv", index=False)

print("✅ All splits saved to:", OUT_DIR)


✅ All splits saved to: /Users/rykelle/Documents/GitHub/Thesis/outputs
