In [3]:
import os

In [1]:
%pwd

'c:\\Users\\vishw\\Documents\\college_projects\\Obesity-Predictor\\research'

In [4]:
os.chdir("../")

In [5]:
%pwd

'c:\\Users\\vishw\\Documents\\college_projects\\Obesity-Predictor'

In [6]:
from dataclasses import dataclass
from pathlib import Path
from typing import Dict

@dataclass(frozen=True)
class ModelTrainingConfig:
    input_file: Path  # Path to the cleaned dataset
    models_dir: Path  # Directory where trained models will be saved
    params: Dict  # Parameters for train-test split and models


In [7]:
from ObesityPredictor.constants import *
from ObesityPredictor.utils.common import read_yaml, create_directories

# Configuration Manager for Model Training
class ConfigurationManager:
    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

    def get_model_training_config(self) -> ModelTrainingConfig:
        config = self.config["model_training"]
        return ModelTrainingConfig(
            input_file=Path(config["input_file"]),
            models_dir=Path(config["models_dir"]),
            params=self.params
        )

In [None]:
import pandas as pd
import os
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from ObesityPredictor import logger
from ObesityPredictor.utils.common import read_yaml


class ModelTraining:
    def __init__(self, config: ModelTrainingConfig):
        self.config = config

    def load_data(self):
        """Load cleaned and encoded dataset."""
        try:
            logger.info(f"Loading data from {self.config.input_file}")
            df = pd.read_csv(self.config.input_file)
            X = df.drop(columns=["Obesity"])
            y = df["Obesity"]
            return X, y
        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            raise e

    def split_data(self, X, y):
        """Split dataset into train and test sets."""
        try:
            test_size = self.config.params["train_test_split"]["test_size"]
            random_state = self.config.params["train_test_split"]["random_state"]
            stratify = y if self.config.params["train_test_split"]["stratify"] else None

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=random_state, stratify=stratify
            )
            logger.info(f"Data split complete: Train size={X_train.shape}, Test size={X_test.shape}")
            return X_train, X_test, y_train, y_test
        except Exception as e:
            logger.error(f"Error during data splitting: {str(e)}")
            raise e

    def train_and_save_model(self, X_train, y_train, model, model_name):
        """Train a given model and save it."""
        try:
            logger.info(f"Training {model_name}...")
            model.fit(X_train, y_train)
            os.makedirs(self.config.models_dir / model_name, exist_ok=True)
            model_path = self.config.models_dir / model_name / "model.pkl"
            with open(model_path, "wb") as f:
                pickle.dump(model, f)
            logger.info(f"{model_name} saved at {model_path}")
        except Exception as e:
            logger.error(f"Error training {model_name}: {str(e)}")
            raise e

    def run(self):
        """Execute the model training pipeline."""
        X, y = self.load_data()
        X_train, X_test, y_train, y_test = self.split_data(X, y)

        # Train Logistic Regression
        log_reg_params = self.config.params["models"]["logistic_regression"]
        log_reg = LogisticRegression(**log_reg_params)
        self.train_and_save_model(X_train, y_train, log_reg, "logistic_regression")

        # Train Decision Tree
        dt_params = self.config.params["models"]["decision_tree"]
        decision_tree = DecisionTreeClassifier(**dt_params)
        self.train_and_save_model(X_train, y_train, decision_tree, "decision_tree")

In [10]:
# Running the Model Training Pipeline
try:
    config = ConfigurationManager()
    model_training_config = config.get_model_training_config()
    model_training = ModelTraining(config=model_training_config)
    model_training.run()
except Exception as e:
    raise e

[2025-02-14 13:33:14,465: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-02-14 13:33:14,468: INFO: common: yaml file: params.yaml loaded successfully]
[2025-02-14 13:33:14,469: INFO: 2130521663: Loading data from artifacts\data_cleaning_encoded\cleaned_encoded_data.csv]
[2025-02-14 13:33:14,470: ERROR: 2130521663: Error loading data: [Errno 2] No such file or directory: 'artifacts\\data_cleaning_encoded\\cleaned_encoded_data.csv']


FileNotFoundError: [Errno 2] No such file or directory: 'artifacts\\data_cleaning_encoded\\cleaned_encoded_data.csv'