# Model Trainer Notebook

Modifying file path.

In [1]:
import os

In [2]:
%pwd

'c:\\Users\\RaviB\\GitHub\\SleepEfficiencyML\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\RaviB\\GitHub\\SleepEfficiencyML'

## Model Tuning

In [5]:
import pandas as pd
from sklearn.linear_model import ElasticNet

In [6]:
train_data = pd.read_csv("artifacts/data_transformation/train.csv")
test_data = pd.read_csv("artifacts/data_transformation/test.csv")

train_x = train_data.drop(['Sleep efficiency'], axis=1)
test_x = test_data.drop(['Sleep efficiency'], axis=1)
train_y = train_data[['Sleep efficiency']]
test_y = test_data[['Sleep efficiency']]

In [7]:
lr = ElasticNet(alpha=0.4, l1_ratio=0.8, random_state=42)
lr.fit(train_x, train_y)

In [8]:
data = test_x.iloc[5]
data

Age                     30.0
Gender                   0.0
Bedtime                  2.0
Wakeup time             11.0
Sleep duration           9.0
Awakenings               0.0
Caffeine consumption     1.0
Alcohol consumption      0.0
Smoking status           0.0
Exercise frequency       0.0
Name: 5, dtype: float64

In [9]:
feature_names = list(test_x.columns)
feature_names

['Age',
 'Gender',
 'Bedtime',
 'Wakeup time',
 'Sleep duration',
 'Awakenings',
 'Caffeine consumption',
 'Alcohol consumption',
 'Smoking status',
 'Exercise frequency']

In [10]:
data_df = pd.DataFrame([data], columns=feature_names)

In [11]:
lr.predict(data_df)

array([0.78707965])

In [12]:
for i in range(0, 15):
    data = test_x.iloc[i]
    data_df = pd.DataFrame([data], columns=feature_names)
    print(lr.predict(data_df))

[0.78707965]
[0.78707965]
[0.78707965]
[0.78707965]
[0.78707965]
[0.78707965]
[0.78707965]
[0.78707965]
[0.78707965]
[0.78707965]
[0.78707965]
[0.78707965]
[0.78707965]
[0.78707965]
[0.78707965]


ElasticNet is terrible, let's try XGBoost.

In [13]:
import xgboost as xgb

In [15]:
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

In [17]:
model.fit(train_x, train_y)

# Predict using the test data
predictions = model.predict(test_x)

print("First 10 predictions:", predictions[:10])

First 10 predictions: [0.9200317  0.74777484 0.83594507 0.7912201  0.6201089  0.9048254
 0.9760935  0.8131884  0.81347674 0.6990114 ]


In [18]:
test_y[:10]

Unnamed: 0,Sleep efficiency
0,0.92
1,0.8
2,0.87
3,0.92
4,0.58
5,0.93
6,0.91
7,0.72
8,0.82
9,0.78


This already looks so much better. Let's do grid search and run with the best parameters from that.

In [19]:
from sklearn.model_selection import GridSearchCV

In [31]:
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [None, 50, 100],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [None, 3, 5],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.2, 0.4, 0.6, 0.8]
}

# Set up the GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(train_x, train_y)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [21]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [49]:
best_model = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}


In [33]:
predictions = best_model.predict(test_x)
print("First 10 predictions:", predictions[:10])
print("First 10 true values:", test_y[:10])

First 10 predictions: [0.8458716  0.75178003 0.8839355  0.8746178  0.65838325 0.90464395
 0.93039197 0.79013485 0.7279593  0.7422841 ]
First 10 true values:    Sleep efficiency
0              0.92
1              0.80
2              0.87
3              0.92
4              0.58
5              0.93
6              0.91
7              0.72
8              0.82
9              0.78


In [34]:
mse = mean_squared_error(test_y, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(test_y, predictions)

print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", rmse)
print("R² Score: ", r2)

Mean Squared Error:  0.008233361549997629
Root Mean Squared Error:  0.09073787274340098
R² Score:  0.5623822780046016


R2 score is not great but not bad. We will run with it for now.

## Modularizing Model Training

In [40]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    n_estimators: float
    learning_rate: float
    max_depth: float
    subsample: float
    colsample_bytree: float
    target_column: str

In [41]:
from sleep_efficiency.constants import *
from sleep_efficiency.utils.common import read_yaml, create_directories

In [42]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
    
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.XGBoost
        schema = self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir = config.root_dir,
            train_data_path = config.train_data_path,
            test_data_path = config.test_data_path,
            model_name = config.model_name,
            n_estimators = params.n_estimators,
            learning_rate = params.learning_rate,
            max_depth = params.max_depth,
            subsample = params.subsample,
            colsample_bytree = params.colsample_bytree,
            target_column = schema.name
        )
        
        return model_trainer_config

In [43]:
import pandas as pd
import os
from sleep_efficiency import logger
import xgboost as xgb
import joblib

In [46]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self):
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        train_x = train_data.drop([self.config.target_column], axis=1)
        test_x = test_data.drop([self.config.target_column], axis=1)
        train_y = train_data[[self.config.target_column]]
        test_y = test_data[[self.config.target_column]]

        model = xgb.XGBRegressor(
            objective='reg:squarederror',
            random_state=42,
            n_estimators=self.config.n_estimators,
            learning_rate=self.config.learning_rate,
            max_depth=self.config.max_depth,
            subsample=self.config.subsample,
            colsample_bytree=self.config.colsample_bytree
        )
        model.fit(train_x, train_y)

        joblib.dump(model, os.path.join(self.config.root_dir, self.config.model_name))


In [47]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer_config = ModelTrainer(config=model_trainer_config)
    model_trainer_config.train()
except Exception as e:
    raise e

[2024-06-29 12:55:53,143: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-29 12:55:53,146: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-29 12:55:53,149: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-06-29 12:55:53,152: INFO: common: created directory at artifacts]
[2024-06-29 12:55:53,152: INFO: common: created directory at artifacts/model_trainer]
