In [1]:
import os
%pwd

'd:\\Python Hub\\MLOPS\\End-to-End-ML-Approach-to-Wine-Quality-Prediction-with-MLflow\\research'

In [2]:
# Step up one directory so the working directory becomes the project root
# (compare the %pwd outputs before and after this cell).
# NOTE: the move is relative, so re-running this cell climbs one more level
# each time; restart the kernel before running the notebook again.
os.chdir("../")

In [3]:
%pwd

'd:\\Python Hub\\MLOPS\\End-to-End-ML-Approach-to-Wine-Quality-Prediction-with-MLflow'

In [4]:
from dataclasses import dataclass 
from pathlib import Path 

@dataclass(frozen=True)
class DataTransformationConfig:
    """Read-only bundle of the paths used by the data transformation stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Directory that receives the outputs of the transformation stage.
    root_dir: Path
    # CSV file holding the full dataset that gets split.
    data_path: Path
    # CSV file holding the training split.
    train: Path
    # CSV file holding the test split.
    test: Path

In [5]:
from MLproject.constants import * 
from MLproject.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Loads the project YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the config and schema files and prepare the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root named in the config is handed to
        # create_directories before any stage asks for its own folder.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data transformation stage.

        Passes the stage's ``root_dir`` to ``create_directories`` and copies
        the ``data_transformation`` section of the loaded config into a
        ``DataTransformationConfig``.

        Returns:
            DataTransformationConfig: Populated from ``root_dir``,
            ``data_path``, ``train`` and ``test`` of that section.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            train=section.train,
            test=section.test,
        )

In [7]:
import os
from MLproject import logger
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [11]:
class DataTransformation:
    """Split the dataset into train/test CSVs and standard-scale its features.

    The ``config`` object is expected to expose ``root_dir``, ``data_path``,
    ``train`` and ``test`` (see ``DataTransformationConfig``).
    """

    # Columns handed to the StandardScaler; every other input column is
    # dropped by the ColumnTransformer (its default ``remainder``).
    NUMERICAL_FEATURES = (
        "fixed acidity",
        "volatile acidity",
        "citric acid",
        "residual sugar",
        "chlorides",
        "free sulfur dioxide",
        "total sulfur dioxide",
        "density",
        "pH",
        "sulphates",
        "alcohol",
    )

    # Label column; it is separated before scaling and appended back unscaled.
    TARGET_FEATURE = "quality"

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration.

        Args:
            config: Paths for this stage. It is a required argument: the
                previous ``config=DataTransformationConfig`` form used the
                class itself as a default value, which has none of the
                path attributes the methods read.
        """
        self.config = config

    @staticmethod
    def _load_split(source, default_path):
        """Return a DataFrame for ``source``, falling back to ``default_path``.

        ``source`` may be a DataFrame (used as is), a CSV path, or ``None``
        (in which case ``default_path`` is read).
        """
        if source is None:
            source = default_path
        if isinstance(source, pd.DataFrame):
            return source
        return pd.read_csv(source)

    def get_data_transformation_object(self):
        """Build the preprocessing pipeline.

        Returns:
            Pipeline: A single-step pipeline wrapping a ColumnTransformer that
            applies ``StandardScaler`` to ``NUMERICAL_FEATURES``.
        """
        try:
            data_transformer = ColumnTransformer(
                transformers=[
                    (
                        "standard_scaler",
                        StandardScaler(),
                        list(self.NUMERICAL_FEATURES),
                    ),
                ]
            )

            return Pipeline(steps=[("data_transformer", data_transformer)])

        except Exception as e:
            # Log at error level with the traceback, then re-raise unchanged.
            logger.exception(f"Error in creating data transformation object: {e}")
            raise

    def train_test_split(self, test_size=0.25, random_state=42):
        """Split the dataset at ``config.data_path`` and persist both parts.

        The splits are written to ``config.train`` and ``config.test`` -- the
        same paths that are returned and later read back -- instead of
        hard-coded file names under ``root_dir``.

        Args:
            test_size: Share of rows placed in the test split. 0.25 matches
                the scikit-learn default that applied before.
            random_state: Seed for the shuffle so that re-running the stage
                yields the same split. Pass ``None`` for a random split.

        Returns:
            tuple: ``(config.train, config.test)``.
        """
        try:
            data = pd.read_csv(self.config.data_path)
            logger.info("Read the dataset as DataFrame")

            logger.info("train test split initiated")
            # Inside a method this name resolves to the module-level
            # scikit-learn function, not to this method.
            train_set, test_set = train_test_split(
                data, test_size=test_size, random_state=random_state
            )

            for frame, destination in (
                (train_set, self.config.train),
                (test_set, self.config.test),
            ):
                # Make sure the target folder exists before writing.
                os.makedirs(os.path.dirname(destination) or ".", exist_ok=True)
                frame.to_csv(destination, index=False)

            logger.info("Splited data into training and test sets")
            logger.info(train_set.shape)
            logger.info(test_set.shape)

            return (
                self.config.train,
                self.config.test,
            )

        except Exception as e:
            logger.exception(f"Error in splitting train test sets: {e}")
            raise

    def inititate_data_transformation(self, train_set, test_set):
        """Scale the features of both splits and save them as ``.npy`` arrays.

        Args:
            train_set: CSV path of the training split, an already loaded
                DataFrame, or ``None`` to read ``config.train``.
            test_set: CSV path of the test split, an already loaded
                DataFrame, or ``None`` to read ``config.test``.

        Returns:
            tuple: ``(train_arr, test_arr)``; each array holds the scaled
            feature columns followed by the unscaled target as last column.
            The arrays are also saved as ``train_arr.npy`` and
            ``test_arr.npy`` under ``config.root_dir``.
        """
        try:
            # Honour the arguments; they used to be overwritten immediately.
            train_frame = self._load_split(train_set, self.config.train)
            test_frame = self._load_split(test_set, self.config.test)

            logger.info("Reading train and test dataset completed")
            logger.info("Obtaining preprocessor object")

            preprocessor_obj = self.get_data_transformation_object()

            # Separate features and target
            input_train_df = train_frame.drop(columns=[self.TARGET_FEATURE])
            target_train_df = train_frame[self.TARGET_FEATURE]

            input_test_df = test_frame.drop(columns=[self.TARGET_FEATURE])
            target_test_df = test_frame[self.TARGET_FEATURE]

            logger.info(
                "Applying preprocessing object on training dataframe and test dataframe."
            )

            # Fit on the training split only, then reuse the fitted
            # statistics for the test split.
            train_features = preprocessor_obj.fit_transform(input_train_df)
            test_features = preprocessor_obj.transform(input_test_df)

            logger.info(
                "successfully applied preprocessor object on train and test data"
            )

            # Combine features and target
            train_arr = np.c_[train_features, np.array(target_train_df)]
            test_arr = np.c_[test_features, np.array(target_test_df)]

            logger.info("Saving train and test set in numpy file")
            np.save(os.path.join(self.config.root_dir, "train_arr.npy"), train_arr)
            np.save(os.path.join(self.config.root_dir, "test_arr.npy"), test_arr)

            logger.info("returning train_array and test_array")
            return (
                train_arr,
                test_arr,
            )

        except Exception as e:
            logger.exception(f"Error in initiating data transformation: {e}")
            raise

    # Correctly spelled alias; the original (misspelled) name stays available
    # for existing callers.
    initiate_data_transformation = inititate_data_transformation

In [13]:
# Run the data transformation stage end to end. No try/except wrapper: the
# previous one only re-raised the exception it caught, so any failure now
# surfaces directly with its original traceback.
config = ConfigurationManager()
data_tranformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_tranformation_config)

# Write the train/test CSV splits and get back the paths they are read from.
train_data, test_data = data_transformation.train_test_split()

# Scale the features and save both splits as .npy arrays.
train_arr, test_arr = data_transformation.inititate_data_transformation(
    train_data, test_data
)

[ 2023-12-27 14:39:46,740:  INFO:  common: 33:  YAML file 'config\config.yaml' loaded successfully]
[ 2023-12-27 14:39:46,746:  INFO:  common: 33:  YAML file 'schema.yaml' loaded successfully]
[ 2023-12-27 14:39:46,748:  INFO:  common: 53:  Created directory at : artifacts]
[ 2023-12-27 14:39:46,750:  INFO:  common: 53:  Created directory at : artifacts/data_transformation]
[ 2023-12-27 14:39:46,760:  INFO:  3471270155: 38:  Read the dataset as DataFrame]
[ 2023-12-27 14:39:46,761:  INFO:  3471270155: 40:  train test split initiated]
[ 2023-12-27 14:39:46,789:  INFO:  3471270155: 48:  Splited data into training and test sets]
[ 2023-12-27 14:39:46,790:  INFO:  3471270155: 49:  (1199, 12)]
[ 2023-12-27 14:39:46,791:  INFO:  3471270155: 50:  (400, 12)]
[ 2023-12-27 14:39:46,802:  INFO:  3471270155: 66:  Reading train and test dataset completed]
[ 2023-12-27 14:39:46,807:  INFO:  3471270155: 67:  Obtaining preprocessor object]
[ 2023-12-27 14:39:46,815:  INFO:  3471270155: 94:  Applying p

In [16]:
# Sanity check: read back the test array written by
# DataTransformation.inititate_data_transformation and print it.
file_path = "artifacts/data_transformation/test_arr.npy"
loaded_array = np.load(file=file_path)
print(loaded_array)

[[-3.72164793e-01  1.85349917e-01 -1.02193966e+00 ... -2.83907298e-03
  -7.54136143e-01  6.00000000e+00]
 [-3.72164793e-01  2.42651010e+00 -9.17190298e-01 ... -9.58361460e-01
  -8.47687933e-01  5.00000000e+00]
 [-8.28369867e-01 -2.02011103e-01 -9.17190298e-01 ... -4.80600266e-01
  -1.92825402e-01  6.00000000e+00]
 ...
 [-3.72164793e-01  4.06699072e-01 -1.12668902e+00 ... -2.41719670e-01
  -2.86377193e-01  6.00000000e+00]
 [-4.86216062e-01 -7.00046700e-01 -4.98192853e-01 ... -4.80600266e-01
  -9.41239724e-01  5.00000000e+00]
 [-7.14318599e-01  1.01540925e+00 -1.44093710e+00 ... -6.59760714e-01
  -9.41239724e-01  5.00000000e+00]]
