In [4]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

# Step 2: Load Dataset
df = pd.read_csv("Sleep_health_and_lifestyle_dataset.csv")

# Step 3: Clean & Preprocess Data

# Remove non-informative ID column
df.drop('Person ID', axis=1, inplace=True)

# Replace missing sleep disorder values with 'None'
df['Sleep Disorder'].fillna('None', inplace=True)

# Split 'Blood Pressure' into systolic and diastolic
df['systolic_bp'] = df['Blood Pressure'].apply(lambda x: int(x.split('/')[0]))
df['diastolic_bp'] = df['Blood Pressure'].apply(lambda x: int(x.split('/')[1]))
df.drop('Blood Pressure', axis=1, inplace=True)

# Standardize BMI category text
df['BMI Category'] = df['BMI Category'].replace('Normal Weight', 'Normal')

# Label encode categorical features
label_columns = ['Gender', 'Occupation', 'BMI Category', 'Sleep Disorder']
le = LabelEncoder()
for col in label_columns:
    df[col] = le.fit_transform(df[col])

# Step 4: Prepare Training Data
X = df.drop('Sleep Disorder', axis=1)
y = df['Sleep Disorder']

# Step 5: Split into Training & Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Train Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

# Step 7: Save Model & Feature Columns
joblib.dump(rfc, "sleep_disorder_model.pkl")
joblib.dump(X.columns.tolist(), "feature_columns.pkl")

print("Model trained and saved successfully without 'Person ID'.")

joblib.dump(X.columns.tolist(), "feature_columns.pkl")


Model trained and saved successfully without 'Person ID'.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sleep Disorder'].fillna('None', inplace=True)


['feature_columns.pkl']

In [3]:
import joblib

# Save feature column names after training
joblib.dump(X.columns.tolist(), "feature_columns.pkl")


['feature_columns.pkl']