In [1]:
%pwd

'c:\\Users\\vishw\\Documents\\college_projects\\Obesity-Predictor\\research'

In [2]:
import os

In [3]:
# Step up from the notebook folder ("research") to the project root.
# Guarded so that re-running this cell (or starting the kernel from the
# project root) does not climb one directory too far.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\vishw\\Documents\\college_projects\\Obesity-Predictor'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataCleaningEncodingConfig:
    """Immutable pair of paths describing one data cleaning/encoding run.

    NOTE: a later cell in this notebook defines a plain (non-dataclass) class
    with the same name; once that cell has run, it shadows this definition.
    """
    # Path of the file the step reads from.
    input_file: Path
    # Path of the file the step writes to.
    output_file: Path


In [None]:

from ObesityPredictor.utils.common import read_yaml, create_directories

In [6]:
from ObesityPredictor.constants import *
from ObesityPredictor.utils.common import read_yaml, create_directories
# Configuration Manager for Data Cleaning and Encoding
class ConfigurationManager:
    """Reads the project YAML configuration and builds per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        # Parse the YAML once; the getter below reads from the parsed result.
        self.config = read_yaml(config_filepath)

    def get_data_cleaning_encoding_config(self) -> DataCleaningEncodingConfig:
        """Build the config object for the data cleaning and encoding stage.

        Returns:
            A ``DataCleaningEncodingConfig`` whose ``input_file`` and
            ``output_file`` are ``Path`` objects taken from the
            ``data_cleaning_encoding`` section of the loaded configuration.
        """
        section = self.config.data_cleaning_encoding
        source_path = Path(section.input_file)
        target_path = Path(section.output_file)
        return DataCleaningEncodingConfig(
            input_file=source_path,
            output_file=target_path,
        )

In [7]:
import pandas as pd
import os
from pathlib import Path
from ObesityPredictor import logger

class DataCleaningEncodingConfig:
    """Paths used by the data cleaning and encoding step.

    Args:
        input_file: Location of the file to read.
        output_file: Location the processed file is written to.

    Both values are stored as ``Path`` objects. Strings are accepted and
    converted, because ``DataCleaningEncoding.save_data`` relies on
    ``output_file.parent``, which a plain ``str`` does not provide.
    """

    def __init__(self, input_file: Path, output_file: Path):
        # Path(Path(...)) is a no-op, so callers already passing Path are unaffected.
        self.input_file = Path(input_file)
        self.output_file = Path(output_file)

class DataCleaningEncoding:
    """Pipeline step: load a CSV, derive BMI, encode features/target, save a CSV.

    The step is driven by a config object exposing ``input_file`` and
    ``output_file``. Each stage logs its progress through the project
    ``logger`` and re-raises any exception after logging it.
    """

    # Columns that are one-hot encoded (first level dropped) by ``encode_data``.
    CATEGORICAL_COLUMNS = ['Gender', 'family_history', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']

    # Integer codes assigned to the ``Obesity`` target labels.
    OBESITY_MAPPING = {
        "Insufficient_Weight": 0,
        "Normal_Weight": 1,
        "Overweight_Level_I": 2,
        "Overweight_Level_II": 3,
        "Obesity_Type_I": 4,
        "Obesity_Type_II": 5,
        "Obesity_Type_III": 6,
    }

    def __init__(self, config: "DataCleaningEncodingConfig"):
        # The annotation is a string so the class does not depend on the
        # config class being defined before this one.
        self.config = config

    def load_data(self) -> pd.DataFrame:
        """
        Load the ingested data from ``config.input_file``.

        Returns:
            The CSV contents as a DataFrame.

        Raises:
            Exception: whatever ``pd.read_csv`` raises, re-raised after logging.
        """
        try:
            logger.info(f"Loading data from {self.config.input_file}")
            df = pd.read_csv(self.config.input_file)
            logger.info(f"Data loaded successfully with shape {df.shape}")
            return df
        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            raise

    def clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Replace the ``Height`` and ``Weight`` columns with a derived ``BMI`` column.

        BMI is computed as ``Weight / Height ** 2``. No missing-value handling
        is performed here.

        Args:
            df: Frame containing ``Height`` and ``Weight`` columns.

        Returns:
            A new DataFrame; ``df`` itself is left untouched so the method can
            be re-run on the same input.

        Raises:
            Exception: e.g. ``KeyError`` when a required column is missing,
                re-raised after logging.
        """
        try:
            logger.info("Cleaning data...")
            # assign/drop both return new frames, so the caller's df is not mutated.
            cleaned = df.assign(
                BMI=df['Weight'] / (df['Height'] ** 2)
            ).drop(columns=['Height', 'Weight'])
            logger.info(f"Data cleaned, new shape: {cleaned.shape}")
            return cleaned
        except Exception as e:
            logger.error(f"Error during data cleaning: {str(e)}")
            raise

    def encode_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        One-hot encode the categorical features and integer-encode the target.

        Args:
            df: Frame containing the columns in ``CATEGORICAL_COLUMNS`` and an
                ``Obesity`` column of string labels.

        Returns:
            A new DataFrame with dummy columns in place of the categorical
            columns and ``Obesity`` replaced by its code from ``OBESITY_MAPPING``.

        Raises:
            ValueError: if ``Obesity`` contains a non-null label that is not in
                ``OBESITY_MAPPING`` (``Series.map`` would otherwise turn it
                into NaN silently).
            Exception: any other error, re-raised after logging.
        """
        try:
            logger.info("Encoding categorical features...")
            encoded = pd.get_dummies(df, columns=self.CATEGORICAL_COLUMNS, drop_first=True)  # One-hot encoding

            # Encode the target variable, refusing labels we have no code for.
            labels = encoded['Obesity']
            unmapped = labels[labels.notna() & ~labels.isin(list(self.OBESITY_MAPPING))]
            if not unmapped.empty:
                unknown = sorted(str(value) for value in unmapped.unique())
                raise ValueError(f"Unmapped Obesity labels: {unknown}")
            encoded['Obesity'] = labels.map(self.OBESITY_MAPPING)

            logger.info(f"Data encoding complete, new shape: {encoded.shape}")
            return encoded
        except Exception as e:
            logger.error(f"Error during data encoding: {str(e)}")
            raise

    def save_data(self, df: pd.DataFrame):
        """
        Write ``df`` as CSV (without the index) to ``config.output_file``.

        The parent directory is created if it does not exist.

        Raises:
            Exception: any filesystem or serialization error, re-raised after logging.
        """
        try:
            # Path(...) makes this work whether the config holds a str or a Path.
            output_path = Path(self.config.output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(output_path, index=False)
            logger.info(f"Cleaned and encoded data saved to {self.config.output_file}")
        except Exception as e:
            logger.error(f"Error saving cleaned data: {str(e)}")
            raise

    def run(self):
        """
        Execute the pipeline: load -> clean -> encode -> save.
        """
        df = self.load_data()
        df = self.clean_data(df)
        df = self.encode_data(df)
        self.save_data(df)

In [8]:
# Running the Data Cleaning and Encoding Pipeline
# No try/except wrapper: the previous one only re-raised the same exception,
# so letting errors propagate directly gives the same outcome with a cleaner traceback.
config = ConfigurationManager()
data_cleaning_encoding_config = config.get_data_cleaning_encoding_config()
data_cleaning_encoding = DataCleaningEncoding(config=data_cleaning_encoding_config)
data_cleaning_encoding.run()


[2025-02-14 13:04:39,757: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-02-14 13:04:39,761: INFO: 1094122421: Loading data from artifacts\data_ingestion\Obesity_prediction.csv]
[2025-02-14 13:04:39,797: INFO: 1094122421: Data loaded successfully with shape (2111, 17)]
[2025-02-14 13:04:39,798: INFO: 1094122421: Cleaning data...]
[2025-02-14 13:04:39,811: INFO: 1094122421: Data cleaned, new shape: (2111, 16)]
[2025-02-14 13:04:39,812: INFO: 1094122421: Encoding categorical features...]
[2025-02-14 13:04:39,826: INFO: 1094122421: Data encoding complete, new shape: (2111, 23)]
[2025-02-14 13:04:39,852: INFO: 1094122421: Cleaned and encoded data saved to artifacts\data_cleaning_encoded\cleaned_encoded_data.csv]
