In [1]:
import os
os.chdir("../")
%pwd

'/home/anonymous/PycharmProjects/MLOps_heart_disease'

In [2]:
from dataclasses import dataclass
from pathlib import Path

In [3]:
@dataclass(frozen=True)
class FeatureEngineeringConfig:
    root_dir: Path
    data_path: Path
    target_column: str


In [4]:
from mlhrtds.constants import *
from mlhrtds.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    def get_data_transformation_config(self) -> FeatureEngineeringConfig:
        config = self.config.data_transformation
        schema =  self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        data_transformation_config = FeatureEngineeringConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            target_column = schema.name
        )

        return data_transformation_config
    

In [6]:
import os
from mlhrtds import logger
from sklearn.model_selection import train_test_split
import pandas as pd

In [11]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from mlhrtds import logger
from mlhrtds.utils.common import read_yaml, create_directories

# Assuming FeatureEngineeringConfig is defined elsewhere
# from mlds.entity.config_entity import FeatureEngineeringConfig

class DataTransformation:
    def __init__(self, config: FeatureEngineeringConfig):
        self.config = config

    def train_test_splitting(self):
        # Read the data from the specified path
        data = pd.read_csv(self.config.data_path)

        

        # Split the data into X (features) and y (target)
        X = data.drop([self.config.target_column], axis=1)
        y = data[[self.config.target_column]]
        
        # Define categorical and numerical columns
        categorical_columns = X.select_dtypes(include="object").columns
        numerical_columns = X.select_dtypes(exclude="object").columns

        # Perform one-hot encoding for categorical columns
        X = pd.get_dummies(X, columns=categorical_columns)
        print(X)
        # Define categorical and numerical columns
        categorical_columns = X.select_dtypes(include="object").columns
        numerical_columns = X.select_dtypes(exclude="object").columns

        # Split the data into training and test sets (75% train, 25% test)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
        # X_train.shape
        # X_train.shape, X_test.shape, y_train.shape, y_test.shape
        # Define transformers for numerical features (scaling in this case) .reshape(-1, 1)
        numerical_transformer = Pipeline(steps=[
            ('scaler', StandardScaler())
        ])

        # Define transformers for categorical features (one-hot encoding)
        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder())  # You can customize options for encoding here
        ])

        # Combine transformers using ColumnTransformer
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_columns),
                ('cat', categorical_transformer, categorical_columns)
            ])

        # Create the final data preprocessing pipeline
        pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

        # Fit and transform the training data
        X_train_transformed = pipeline.fit_transform(X_train.shape)

        # Transform the test data
        X_test_transformed = pipeline.transform(X_test)

        # Save the transformed data
        pd.DataFrame(X_train_transformed).to_csv(os.path.join(self.config.root_dir, "X_train.csv"), index=False)
        pd.DataFrame(y).to_csv(os.path.join(self.config.root_dir, "y_train.csv"), index=False)
        pd.DataFrame(X_test_transformed).to_csv(os.path.join(self.config.root_dir, "X_test.csv"), index=False)
        pd.DataFrame(y_test).to_csv(os.path.join(self.config.root_dir, "y_test.csv"), index=False)
        print("Splitted data into training and test sets and performed feature engineering.")


In [12]:
try:
    config =  ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transform = DataTransformation(config=data_transformation_config)
    data_transform.train_test_splitting()
except Exception as e:
    raise e

[2023-09-20 21:04:12,358: INFO: common: yaml file: config/config.yaml loaded successfully]
[2023-09-20 21:04:12,361: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-20 21:04:12,364: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-09-20 21:04:12,366: INFO: common: created directory at: artifacts]
[2023-09-20 21:04:12,368: INFO: common: created directory at: artifacts/data_transformation]
      age     bmi  children  sex_female  sex_male  smoker_no  smoker_yes  \
0      19  27.900         0        True     False      False        True   
1      18  33.770         1       False      True       True       False   
2      28  33.000         3       False      True       True       False   
3      33  22.705         0       False      True       True       False   
4      32  28.880         0       False      True       True       False   
...   ...     ...       ...         ...       ...        ...         ...   
1333   50  30.970         3       False    

ValueError: Expected 2D array, got 1D array instead:
array=[1003 11].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [85]:
pd.read_csv("/home/anonymous/PycharmProjects/MLOps_heart_disease/artifacts/data_transformation/X_train.csv")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-1.087167,-1.140875,-0.917500,-0.991067,0.991067,0.508399,-0.508399,-0.577734,1.754205,-0.590015,-0.571594
1,-0.802106,-0.665842,0.743605,1.009014,-1.009014,0.508399,-0.508399,-0.577734,-0.570059,1.694871,-0.571594
2,0.836992,1.528794,-0.086947,-0.991067,0.991067,0.508399,-0.508399,-0.577734,-0.570059,-0.590015,1.749494
3,0.551932,0.926476,-0.086947,-0.991067,0.991067,-1.966960,1.966960,-0.577734,-0.570059,1.694871,-0.571594
4,0.480667,-0.268178,0.743605,1.009014,-1.009014,0.508399,-0.508399,-0.577734,-0.570059,-0.590015,1.749494
...,...,...,...,...,...,...,...,...,...,...,...
998,-1.514757,0.139468,2.404710,1.009014,-1.009014,0.508399,-0.508399,1.730900,-0.570059,-0.590015,-0.571594
999,-0.018189,-1.105101,3.235263,1.009014,-1.009014,0.508399,-0.508399,-0.577734,-0.570059,1.694871,-0.571594
1000,1.335848,-0.887967,-0.917500,-0.991067,0.991067,0.508399,-0.508399,1.730900,-0.570059,-0.590015,-0.571594
1001,-0.160720,2.843247,0.743605,1.009014,-1.009014,-1.966960,1.966960,-0.577734,-0.570059,-0.590015,1.749494
