In [5]:
import os
# Move the working directory up one level. Presumably this makes the relative
# paths used later (config files, artifacts/) resolve from the project root --
# confirm against the repository layout.
# NOTE(review): this is a relative chdir, so it is NOT idempotent: every
# re-run of this cell moves up one more directory. Restart the kernel before
# re-running the notebook.
os.chdir("../")
# Display the resulting working directory.
%pwd

'/Users/tapankheni/Data_Science/Data Science Projects/Credit_Card_Fault_Prediction'

In [4]:
from dataclasses import dataclass
from pathlib import Path
from typing import List

@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the project configuration.
    """
    # Stage directory; created by ConfigurationManager before this object is built.
    root_dir: Path
    # Input CSV paths; DataTransformation reads only the first entry.
    data_path: List[str]
    # Output directories; DataTransformation writes only to the first entry.
    preprocessed_data_path: List[str]
    # File name under which the fitted preprocessor is dumped (inside the
    # first preprocessed_data_path directory).
    preprocessor_name: str

In [6]:
from CreditCardFraudDetection.constants import (PARAMS_YAML_FILE_PATH, CONFIG_YAML_FILE_PATH, SCHEMA_YAML_FILE_PATH)
from CreditCardFraudDetection.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    def __init__(self,
                 params_yaml_file_path = PARAMS_YAML_FILE_PATH,
                 config_yaml_file_path = CONFIG_YAML_FILE_PATH,
                 schema_yaml_file_path = SCHEMA_YAML_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            params_yaml_file_path: Location of the parameters YAML file.
            config_yaml_file_path: Location of the main configuration YAML file.
            schema_yaml_file_path: Location of the schema YAML file.

        The objects returned by ``read_yaml`` are accessed by attribute
        (e.g. ``self.config.artifacts_root``), so ``read_yaml`` must return
        something that supports that.
        """
        self.params = read_yaml(params_yaml_file_path)
        self.config = read_yaml(config_yaml_file_path)
        self.schema = read_yaml(schema_yaml_file_path)

        # Every stage writes below this directory, so make sure it exists up front.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the main configuration.
        """
        config = self.config.data_transformation

        create_directories([config.root_dir])

        return DataTransformationConfig(
            root_dir = config.root_dir,
            data_path = config.data_path,
            preprocessed_data_path = config.preprocessed_data_path,
            preprocessor_name = config.preprocessor_name
        )

In [8]:
from CreditCardFraudDetection import logger
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

In [9]:
class DataTransformation:
    """Split the raw dataset, fit a preprocessor on the training part, and
    persist the transformed arrays together with the fitted preprocessor."""

    def __init__(self, config: DataTransformationConfig):
        """Keep the stage configuration for later use.

        Args:
            config: Paths and file names used by ``perform_data_transformation``.
        """
        self.config = config

    def split_data(self, data: pd.DataFrame, stratify_column: str,
                   test_size: float = 0.3, random_state: int = 42):
        """Create a stratified train/test split and separate features from target.

        ``stratify_column`` is used both to stratify the split and as the
        target column that is removed from the feature frames.

        Args:
            data: Full dataset, including ``stratify_column``.
            stratify_column: Name of the target column.
            test_size: Fraction of rows assigned to the test set.
            random_state: Seed passed to ``train_test_split`` for reproducibility.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.
        """
        train_data, test_data = train_test_split(
            data,
            stratify=data[stratify_column],
            test_size=test_size,
            random_state=random_state,
        )
        logger.info("Data split into train and test sets successfully!")

        X_train = train_data.drop(stratify_column, axis=1)
        y_train = train_data[stratify_column]
        X_test = test_data.drop(stratify_column, axis=1)
        y_test = test_data[stratify_column]
        logger.info("Features and target variable retrieved successfully!")

        return X_train, X_test, y_train, y_test

    def get_skewed_features(self, data: pd.DataFrame) -> List[str]:
        """Return the numeric columns flagged as skewed.

        A column is flagged when ``mean > median > mode`` or
        ``mean < median < mode``, using the first value returned by
        ``Series.mode()``.

        NOTE(review): for continuous columns with few repeated values the
        mode is not a stable statistic, so this rule is only a rough
        heuristic; a threshold on ``Series.skew()`` may be worth considering.

        Args:
            data: Frame whose numeric columns are inspected.

        Returns:
            Names of the columns flagged as skewed, in column order.
        """
        skewed_features = []

        for col in data.select_dtypes(include=[np.number]).columns:
            modes = data[col].mode()
            if modes.empty:
                # A column with no non-null values has no mode; indexing the
                # empty result would raise IndexError, so skip the column.
                logger.warning(f"Column {col} has no non-null values; skipping skewness check")
                continue

            mean = data[col].mean()
            median = data[col].median()
            mode = modes.iloc[0]

            if mean > median > mode or mean < median < mode:
                skewed_features.append(col)

        logger.info(f"Skewed features: {skewed_features}")

        return skewed_features

    def get_preprocessor(self, data: pd.DataFrame) -> ColumnTransformer:
        """Build an unfitted preprocessor for the columns of ``data``.

        Columns flagged by ``get_skewed_features`` go through a Yeo-Johnson
        power transform (with standardization); every other column goes
        through ``StandardScaler``.

        Args:
            data: Feature frame used to decide which columns are skewed.

        Returns:
            An unfitted ``ColumnTransformer``.
        """
        skewed_features = self.get_skewed_features(data)
        skewed_lookup = set(skewed_features)
        non_skewed_features = [col for col in data.columns if col not in skewed_lookup]
        logger.info(f"Non-skewed features: {non_skewed_features}")

        power_transformation = PowerTransformer(method="yeo-johnson", copy=False, standardize=True)
        standard_scaler = StandardScaler()

        power_pipeline = Pipeline(
            steps=[
                ("power_transformation", power_transformation)
            ]
        )

        numeric_pipeline = Pipeline(
            steps=[
                ("scaler", standard_scaler)
            ]
        )

        # The two column lists together cover every column of `data`, so
        # `remainder` only matters if the preprocessor is later applied to a
        # frame carrying additional columns.
        preprocessor = ColumnTransformer(
            transformers=[
                ("power_pipeline", power_pipeline, skewed_features),
                ("numeric_pipeline", numeric_pipeline, non_skewed_features),
            ],
            remainder='passthrough'
        )
        logger.info("Preprocessor created successfully!")

        return preprocessor

    def perform_data_transformation(self, target_column: str = 'Class'):
        """Run the whole stage: load, split, fit/transform, and save.

        Reads the CSV at ``config.data_path[0]``, splits it with
        ``split_data``, fits the preprocessor on the training features only,
        and writes ``X_train.npy``, ``X_test.npy``, ``y_train.npy``,
        ``y_test.npy`` plus the fitted preprocessor (``config.preprocessor_name``)
        into ``config.preprocessed_data_path[0]``.

        Args:
            target_column: Name of the target column in the CSV.
        """
        logger.info("Starting data transformation...")

        # Only the first configured input path is used.
        data_path = self.config.data_path[0]

        data = pd.read_csv(data_path)
        logger.info(f"Data loaded successfully from {data_path}")

        X_train, X_test, y_train, y_test = self.split_data(data, target_column)
        logger.info("Data split into train and test sets retrieved successfully!")

        # Column selection and fitting both use the training features only,
        # so no test-set statistics leak into the preprocessor.
        preprocessor = self.get_preprocessor(X_train)
        logger.info("Preprocessor retrieved successfully!")

        logger.info(f"Before transformation: \n{X_train.head()}")

        # ColumnTransformer emits columns in transformer order (skewed
        # columns first, then the rest), so the saved arrays do not follow
        # the CSV's original column order.
        X_train = preprocessor.fit_transform(X_train)
        X_test = preprocessor.transform(X_test)

        logger.info(f"After transformation: \n{X_train[:5]}")

        y_train = y_train.to_numpy()
        y_test = y_test.to_numpy()

        # Only the first configured output directory is used.
        preprocessed_data_path = self.config.preprocessed_data_path[0]
        os.makedirs(preprocessed_data_path, exist_ok=True)

        np.save(os.path.join(preprocessed_data_path, "X_train.npy"), X_train)
        np.save(os.path.join(preprocessed_data_path, "X_test.npy"), X_test)
        np.save(os.path.join(preprocessed_data_path, "y_train.npy"), y_train)
        np.save(os.path.join(preprocessed_data_path, "y_test.npy"), y_test)

        logger.info("Data saved successfully!")

        joblib.dump(preprocessor, os.path.join(preprocessed_data_path, self.config.preprocessor_name))
        logger.info("Preprocessor saved successfully!")

        logger.info("Data transformation completed successfully!")

In [10]:
try:
    # Load the YAML configuration, build the stage config, and run the stage.
    config_manager = ConfigurationManager()
    data_transformation_config = config_manager.get_data_transformation_config()
    data_transformer = DataTransformation(config = data_transformation_config)
    data_transformer.perform_data_transformation()

except Exception as e:
    # Record the failure in the project log, then let the error propagate so
    # the cell still fails visibly.
    logger.error(f"Failed to perform data transformation! Error: {e}")
    # A bare `raise` re-raises the active exception with its traceback intact.
    raise

params.yaml
[2024-07-05 16:29:16,949: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-05 16:29:16,952: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-05 16:29:16,956: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-05 16:29:16,957: INFO: common: created directory at: artifacts]
[2024-07-05 16:29:16,958: INFO: common: created directory at: artifacts/data_transformation]
[2024-07-05 16:29:16,959: INFO: 307610398: Starting data transformation...]
[2024-07-05 16:29:18,528: INFO: 307610398: Data loaded successfully from artifacts/data_ingestion/creditcard.csv]
[2024-07-05 16:29:18,675: INFO: 307610398: Data split into train and test sets successfully!]
[2024-07-05 16:29:18,692: INFO: 307610398: Features and target variable retrieved successfully!]
[2024-07-05 16:29:18,694: INFO: 307610398: Data split into train and test sets retrieved successfully!]
[2024-07-05 16:29:19,334: INFO: 307610398: Skewed features: ['V1', 'V6', 'V8',