In [1]:
import pandas as pd

# Load the CLEANED data directly
df = pd.read_csv('cleaned_sleep_data.csv')

# Display the first few rows to make sure it loaded correctly
df.head()

Unnamed: 0,Person_ID,Gender,Age,Occupation,Sleep_Duration,Quality_of_Sleep,Physical_Activity_Level,Stress_Level,BMI_Category,Heart_Rate,Daily_Steps,Sleep_Disorder,Systolic_BP,Diastolic_BP
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,77,4200,,126,83
1,2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
2,3,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create a copy to keep your original df intact
df_ml = df.copy()

# Convert categorical variables into numerical ones
df_ml = pd.get_dummies(df_ml, columns=['Gender', 'Occupation', 'BMI_Category', 'Sleep_Disorder'], drop_first=True)

In [7]:
# Define your features (X) and the target variable (y)
X = df_ml.drop(['Person_ID', 'Quality_of_Sleep'], axis=1)
y = df_ml['Quality_of_Sleep']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Classifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [9]:
# Make predictions on the test data
y_pred = model.predict(X_test)

# Check the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Get a detailed report of precision, recall, and f1-score
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00        26
           7       1.00      1.00      1.00        16
           8       1.00      1.00      1.00        18
           9       1.00      1.00      1.00        12

    accuracy                           1.00        75
   macro avg       1.00      1.00      1.00        75
weighted avg       1.00      1.00      1.00        75



In [11]:
# Get feature importances from the model
feature_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

# Display the most important features
print("\nTop 5 Most Important Features for Predicting Sleep Quality:")
print(feature_importances.head())


Top 5 Most Important Features for Predicting Sleep Quality:
          feature  importance
1  Sleep_Duration    0.189073
3    Stress_Level    0.183551
0             Age    0.126376
4      Heart_Rate    0.082022
5     Daily_Steps    0.068398


In [15]:
import joblib

# Define the filename for your model
filename = 'sleep_quality_model.joblib'

# Save the model to the file
joblib.dump(model, filename)

print(f"Model saved successfully to '{filename}'")

Model saved successfully to 'sleep_quality_model.joblib'
