In [1]:
import os

In [2]:
%pwd

'd:\\Projects\\MlOps\\End-to-end-Machine-Learning-Project-with-MLflow\\research'

In [3]:
# Move from the notebook's `research/` folder up to the project root, which is
# where the `src.mlProject` package imported below and the relative config
# paths are expected to live. The guard keeps this cell idempotent: running it
# a second time no longer climbs one more directory level.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'd:\\Projects\\MlOps\\End-to-end-Machine-Learning-Project-with-MLflow'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of the locations used by the data-validation stage.

    The dataclass is frozen, so fields cannot be reassigned once an instance
    has been created.
    """

    # Directory that is created for this stage before processing starts.
    root_dir: Path
    # Location handed to `pd.read_csv` as the input dataset.
    unzip_data_dir: Path
    # Destination handed to `DataFrame.to_csv` for the processed output.
    preprocessed_data: Path

In [6]:
from src.mlProject.constants import *
from src.mlProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """
        Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Build the data-validation settings from the `data_validation` section.

        The section's `root_dir` is created first; the values are then copied
        into a `DataValidationConfig` exactly as they appear in the loaded
        configuration (no type conversion is applied here).
        """
        section = self.config.data_validation
        create_directories([section.root_dir])

        return DataValidationConfig(
            root_dir=section.root_dir,
            unzip_data_dir=section.unzip_data_dir,
            preprocessed_data=section.preprocessed_data,
        )

In [8]:
import os
from src.mlProject import logger
import pandas as pd
import numpy as np

In [9]:
import logging
from abc import ABC, abstractmethod
from typing import Union

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


class DataStrategy(ABC):
    """
    Interface shared by every data-handling step.

    Concrete strategies override `handle_data`; the base class itself cannot be
    instantiated because that method is abstract.
    """

    @abstractmethod
    def handle_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """Process `data` and return the result; subclasses must implement this."""


class DataPreprocessStrategy(DataStrategy):
    """
    Data preprocessing strategy: drops unused columns, casts count-like
    columns to integers and imputes missing prices.
    """

    def handle_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Clean `data` in place and return it.

        Steps:
          1. Drop the "street", "date", "country" and "yr_renovated" columns.
          2. Cast "floors", "bathrooms" and "bedrooms" to int (the cast
             truncates any fractional part).
          3. Treat a "price" of 0 as missing and replace it, together with any
             NaN already present, with the mean of the remaining prices.

        Args:
            data: Frame containing every column named above. It is modified
                in place.

        Returns:
            The same DataFrame object that was passed in.

        Raises:
            Exception: Any error raised while processing (for example a
                KeyError for a missing column) is logged and re-raised.
        """
        try:
            data.drop(
                columns=["street", "date", "country", "yr_renovated"],
                inplace=True,
            )

            count_columns = ["floors", "bathrooms", "bedrooms"]
            data[count_columns] = data[count_columns].astype("int")

            # Build the cleaned column and assign it back explicitly. Calling
            # replace()/fillna() with inplace=True on `data['price']` is a
            # chained in-place update: it warns on recent pandas and leaves
            # the frame untouched once Copy-on-Write is active.
            price = data["price"].replace(0, np.nan)
            data["price"] = price.fillna(price.mean())

            return data
        except Exception as e:
            logging.error(e)
            raise


class DataCatToNumeric(DataStrategy):
    """
    Strategy that encodes the categorical columns as integer codes.
    """

    def handle_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Replace the "city" and "statezip" columns with their factorized codes.

        The frame is modified in place and the same object is returned. Any
        exception is logged and re-raised.
        """
        try:
            for column in ("city", "statezip"):
                # pd.factorize yields (codes, uniques); only the codes are kept.
                codes, _uniques = pd.factorize(data[column])
                data[column] = codes
            return data
        except Exception as exc:
            logging.error(exc)
            raise exc


# class DataDivideStrategy(DataStrategy):
    # """
    # Data dividing strategy which divides the data into train and test data.
    # """

    # def handle_data(self, data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]:
    #     """
    #     Divides the data into train and test data.
    #     """
    #     try:
    #         X = data.drop("price", axis=1)
    #         y = data["price"]
    #         X_train, X_test, y_train, y_test = train_test_split(
    #             X, y, test_size=0.2, random_state=42
    #         )
    #         return X_train, X_test, y_train, y_test
    #     except Exception as e:
    #         logging.error(e)
    #         raise e


class DataOutlierHandlingStrategy(DataStrategy):
    """
    Data outlier handling strategy which replaces outliers with NaN and fills NaN with means.
    """

    def handle_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Blank out IQR outliers in "price", "sqft_lot" and "sqft_basement",
        then fill the gaps with each column's mean.

        The means are computed after the outliers have been set to NaN, so the
        outliers do not influence them. NaNs that were already present in
        these three columns are filled as well. `data` is modified in place.

        Args:
            data: Frame containing the three numeric columns named above.

        Returns:
            The same DataFrame object that was passed in.

        Raises:
            Exception: Any error raised while processing is logged and
                re-raised.
        """
        try:
            feature_with_outlier = ["price", "sqft_lot", "sqft_basement"]
            for feature in feature_with_outlier:
                self.replace_outliers_with_nan_iqr(data, feature)

            # Per-column means of the values that survived outlier removal.
            feature_means = data[feature_with_outlier].mean()

            # Filling with a Series only touches the columns it is indexed by.
            data.fillna(feature_means, inplace=True)

            return data
        except Exception as e:
            logging.error(e)
            raise

    @staticmethod
    def replace_outliers_with_nan_iqr(df, feature, inplace=True):
        """
        Replace values of `df[feature]` outside the 1.5 * IQR fences with NaN.

        Args:
            df: DataFrame holding the column.
            feature: Name of a numeric column in `df`.
            inplace: When True (default) the cleaned column is written back
                into `df` and None is returned. When False `df` is left
                untouched and the cleaned Series is returned.

        Returns:
            None when `inplace` is True, otherwise the cleaned Series.
        """
        desired_feature = df[feature]

        q1, q3 = desired_feature.quantile([0.25, 0.75])
        iqr = q3 - q1
        upper_bound = q3 + 1.5 * iqr
        lower_bound = q1 - 1.5 * iqr

        is_outlier = (desired_feature > upper_bound) | (desired_feature < lower_bound)
        # mask() returns a new Series with NaN wherever the condition holds.
        cleaned = desired_feature.mask(is_outlier)

        if not inplace:
            return cleaned

        # Assign the column back explicitly: an in-place replace() on the
        # Series obtained from `df[feature]` does not reach `df` once pandas
        # Copy-on-Write is active, which would silently skip outlier removal.
        df[feature] = cleaned
        return None



class DataCleaning:
    """
    Thin wrapper that pairs a DataFrame with the strategy used to process it.
    """

    def __init__(self, data: pd.DataFrame, strategy: DataStrategy) -> None:
        """Store the frame and the strategy that will be applied to it."""
        self.strategy = strategy
        self.df = data

    def handle_data(self) -> Union[pd.DataFrame, pd.Series]:
        """Run the stored strategy on the stored frame and return its result."""
        processed = self.strategy.handle_data(self.df)
        return processed


In [10]:
class DataValiadtion:
    """Runs the cleaning strategies over the input CSV and saves the result."""

    def __init__(self, config: DataValidationConfig):
        """Keep the stage configuration for later use."""
        self.config = config

    def advanced_processing(self)-> bool:
        """
        Read the CSV at `config.unzip_data_dir`, apply outlier handling,
        preprocessing and categorical encoding in that order, and write the
        result to `config.preprocessed_data` without the index column.

        Returns:
            True once the file has been written. Any exception raised along
            the way propagates to the caller.
        """
        frame = pd.read_csv(self.config.unzip_data_dir)

        # Each strategy receives the output of the previous one.
        pipeline = (
            DataOutlierHandlingStrategy,
            DataPreprocessStrategy,
            DataCatToNumeric,
        )
        for strategy_cls in pipeline:
            frame = DataCleaning(frame, strategy_cls()).handle_data()
        logger.info("Advanced pre processing is done")

        frame.to_csv(self.config.preprocessed_data, index=False)
        logger.info("data file saved to given path")

        return True



In [11]:
# Run the stage end to end: load the configuration with its default file
# paths, build the data-validation config, and execute the processing step.
# Any exception is re-raised unchanged.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValiadtion(config=data_validation_config)
    data_validation.advanced_processing()
except Exception as e:
    raise e

[2023-12-18 22:31:54,927: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-18 22:31:54,931: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-18 22:31:54,934: INFO: common: created directory at: artifacts]
[2023-12-18 22:31:54,935: INFO: common: created directory at: artifacts/data_validation]


[2023-12-18 22:31:55,029: INFO: 3030762188: Advanced pre processing is done]
[2023-12-18 22:31:55,066: INFO: 3030762188: data file saved to given path]
