In [1]:
try:
    import xgboost
except ImportError:
    !pip install xgboost
    import xgboost
    !pip install --upgrade xgboost

In [2]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Read the raw dataset and discard the columns that are not model inputs ---
df = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv').drop(
    columns=['Sleep Disorder', 'Occupation', 'Person ID', 'Daily Steps']
)
# Placeholder: feature engineering (e.g. splitting blood pressure) would go here.

# --- Separate predictors from the target column ---
X = df.drop(columns='Stress Level')
y = df['Stress Level']

# --- Encode the target once, using the complete label column ---
# Fitting on all of y means the encoder has seen every class before the split.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# --- Stratified 80/20 hold-out split on the already-encoded labels ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# y_train / y_test are already encoded; later cells must not encode them again.

In [4]:
# Preview the first five rows of the trimmed frame
df.head(n=5)

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate
0,Male,27,6.1,6,42,6,Overweight,126/83,77
1,Male,28,6.2,6,60,8,Normal,125/80,75
2,Male,28,6.2,6,60,8,Normal,125/80,75
3,Male,28,5.9,4,30,8,Obese,140/90,85
4,Male,28,5.9,4,30,8,Obese,140/90,85


In [5]:
# Missing-value count per column
print(df.isna().sum())

# Column dtypes as inferred by pandas
print(df.dtypes)

# Distinct values of the two text columns and of the target
for name in ('Gender', 'BMI Category', 'Stress Level'):
    distinct_values = df[name].unique()
    print(f'{name}:', distinct_values)


Gender                     0
Age                        0
Sleep Duration             0
Quality of Sleep           0
Physical Activity Level    0
Stress Level               0
BMI Category               0
Blood Pressure             0
Heart Rate                 0
dtype: int64
Gender                      object
Age                          int64
Sleep Duration             float64
Quality of Sleep             int64
Physical Activity Level      int64
Stress Level                 int64
BMI Category                object
Blood Pressure              object
Heart Rate                   int64
dtype: object
Gender: ['Male' 'Female']
BMI Category: ['Overweight' 'Normal' 'Obese' 'Normal Weight']
Stress Level: [6 8 7 4 3 5]


In [5]:
# Build the preprocessing step from X_train, the frame the models are fitted
# on (created in the data-loading cell). X and y are deliberately not
# re-assigned here: they already exist, and re-creating the raw `y` after the
# encoded split only invites mixing encoded and unencoded labels.

# Non-numeric columns are treated as categorical, everything else as numerical.
# select_dtypes classifies by "numeric or not", so text columns are picked up
# whether pandas stores them as `object` or as a dedicated string dtype,
# which a `dtype == 'O'` comparison would miss.
numerical_cols = X_train.select_dtypes(include='number').columns.tolist()
categorical_cols = X_train.select_dtypes(exclude='number').columns.tolist()

# Numerical features: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' encodes categories not seen during fit as
# all zeros instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])




In [6]:
def print_evaluation(name, y_true, y_pred):
    """Print the classification report and confusion matrix for one model.

    Parameters
    ----------
    name : str
        Display name used as the prefix of both headings.
    y_true : array-like
        Ground-truth labels (encoded the same way as the training labels).
    y_pred : array-like
        Labels predicted by the model for the same rows.
    """
    print(f'{name} Classification Report:')
    print(classification_report(y_true, y_pred))
    print(f'{name} Confusion Matrix:')
    print(confusion_matrix(y_true, y_pred))


# Define tuned classifiers
rf = RandomForestClassifier(
    class_weight='balanced',
    max_depth=3,
    min_samples_leaf=5,
    n_estimators=50,
    random_state=21
)

xgb = XGBClassifier(
    max_depth=5,
    min_child_weight=10,
    reg_alpha=1,
    reg_lambda=1,
    random_state=42
)

# Each pipeline gets its own unfitted copy of the preprocessor. Pipeline.fit
# fits its steps in place, so sharing one instance would make the second fit
# overwrite the transformer that the first pipeline (possibly the one that
# gets saved) depends on.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', rf)
])

xgb_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', xgb)
])

# The random forest balances classes through class_weight='balanced';
# XGBoost is given the equivalent through per-sample weights.
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Train both models; the `classifier__` prefix routes the weights to the
# final pipeline step.
rf_pipeline.fit(X_train, y_train)
xgb_pipeline.fit(X_train, y_train, classifier__sample_weight=sample_weights)

# Evaluate on the held-out split
rf_pred = rf_pipeline.predict(X_test)
xgb_pred = xgb_pipeline.predict(X_test)

rf_acc = accuracy_score(y_test, rf_pred)
xgb_acc = accuracy_score(y_test, xgb_pred)

print(f'Random Forest Accuracy: {rf_acc:.4f}')
print(f'XGBoost Accuracy: {xgb_acc:.4f}')

print_evaluation('Random Forest', y_test, rf_pred)
print_evaluation('XGBoost', y_test, xgb_pred)

# Select best (ties go to the random forest).
# NOTE(review): the winner is picked on the same test split that is used to
# report its accuracy, so the reported score is optimistic; consider
# cross-validation or a separate validation split for model selection.
if rf_acc >= xgb_acc:
    best_model, best_name, best_pred = rf_pipeline, 'Random Forest', rf_pred
else:
    best_model, best_name, best_pred = xgb_pipeline, 'XGBoost', xgb_pred

print(f'\nBest Model: {best_name}')

Random Forest Accuracy: 0.9600
XGBoost Accuracy: 0.9467
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.93      1.00      0.97        14
           2       1.00      0.93      0.96        14
           3       0.89      0.89      0.89         9
           4       0.91      1.00      0.95        10
           5       1.00      1.00      1.00        14

    accuracy                           0.96        75
   macro avg       0.96      0.96      0.96        75
weighted avg       0.96      0.96      0.96        75

Random Forest Confusion Matrix:
[[13  0  0  1  0  0]
 [ 0 14  0  0  0  0]
 [ 0  0 13  0  1  0]
 [ 0  1  0  8  0  0]
 [ 0  0  0  0 10  0]
 [ 0  0  0  0  0 14]]
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.93      1.00      0.97        14
           

In [7]:
# Re-state the held-out metrics for the selected model: each heading is
# paired with the metric function that produces the value printed after it.
for heading, metric in (
    ('Accuracy:', accuracy_score),
    ('Confusion Matrix:\n', confusion_matrix),
    ('Classification Report:\n', classification_report),
):
    print(heading, metric(y_test, best_pred))

Accuracy: 0.96
Confusion Matrix:
 [[13  0  0  1  0  0]
 [ 0 14  0  0  0  0]
 [ 0  0 13  0  1  0]
 [ 0  1  0  8  0  0]
 [ 0  0  0  0 10  0]
 [ 0  0  0  0  0 14]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.93      1.00      0.97        14
           2       1.00      0.93      0.96        14
           3       0.89      0.89      0.89         9
           4       0.91      1.00      0.95        10
           5       1.00      1.00      1.00        14

    accuracy                           0.96        75
   macro avg       0.96      0.96      0.96        75
weighted avg       0.96      0.96      0.96        75



In [8]:
# Artifact file names are defined once so the dump target and the
# confirmation message cannot drift apart.
MODEL_PATH = 'stress_level_pred_v2.pkl'
ENCODER_PATH = 'stress_level_label_encoder.pkl'

# Save the best model pipeline (preprocessing + classifier in one object)
joblib.dump(best_model, MODEL_PATH)
print(f'Model saved as {MODEL_PATH}')

# The pipeline was trained on encoded labels, so its predictions are encoded
# class indices; the label encoder is what maps them back to the original
# stress levels. Saving stays best-effort, but a skip is now reported instead
# of passing silently.
if 'le' in locals():
    joblib.dump(le, ENCODER_PATH)
    print(f'Label encoder saved as {ENCODER_PATH}')
else:
    print('Warning: label encoder `le` is not defined; '
          'predictions from the saved model cannot be decoded without it.')


Model saved as stress_level_pred_v2.pkl
Label encoder saved as stress_level_label_encoder.pkl
