In [2]:
import os
# Move the kernel's working directory to the parent of the current one
# (presumably the project root, so relative config/artifact paths resolve — confirm).
# NOTE: the path is relative, so re-running this cell moves up one more level each
# time; run it exactly once per kernel session.
os.chdir("../")
# Display the resulting working directory as the cell output.
%pwd

'/Users/tapankheni/Data_Science/Data Science Projects/Credit_Card_Fault_Prediction'

In [3]:
from dataclasses import dataclass
from pathlib import Path
from typing import List

@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage.

    The values are populated by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the loaded configuration.
    """

    # Directory created for this stage's artifacts.
    root_dir: Path
    # Input CSV locations; the transformation step reads the entry at index 1.
    data_path: List[str]
    # Output directories; the transformation step writes to the entry at index 1.
    preprocessed_data_path: List[str]
    # File name under which the fitted preprocessor is dumped with joblib.
    preprocessor_name: str

In [4]:
from CreditCardFraudDetection.constants import (PARAMS_YAML_FILE_PATH, CONFIG_YAML_FILE_PATH, SCHEMA_YAML_FILE_PATH)
from CreditCardFraudDetection.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    def __init__(self,
                 params_yaml_file_path = PARAMS_YAML_FILE_PATH,
                 config_yaml_file_path = CONFIG_YAML_FILE_PATH,
                 schema_yaml_file_path = SCHEMA_YAML_FILE_PATH):
        """Read the params, config and schema YAML files and create the artifacts root.

        Args:
            params_yaml_file_path: Location of the params YAML file.
            config_yaml_file_path: Location of the main config YAML file.
            schema_yaml_file_path: Location of the schema YAML file.
        """
        self.params = read_yaml(params_yaml_file_path)
        self.config = read_yaml(config_yaml_file_path)
        self.schema = read_yaml(schema_yaml_file_path)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            DataTransformationConfig populated from the ``data_transformation``
            section of the loaded config.
        """
        stage_config = self.config.data_transformation

        create_directories([stage_config.root_dir])

        return DataTransformationConfig(
            # The dataclass declares ``root_dir`` as a Path, so convert the raw
            # YAML value instead of passing the string through unchanged.
            root_dir = Path(stage_config.root_dir),
            data_path = stage_config.data_path,
            preprocessed_data_path = stage_config.preprocessed_data_path,
            preprocessor_name = stage_config.preprocessor_name
        )

In [7]:
from CreditCardFraudDetection import logger
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
import joblib

In [12]:
class DataTransformation:
    """Clean, balance, scale and persist the credit-card default dataset."""

    # Name of the target column after renaming 'default.payment.next.month'.
    TARGET_COLUMN = "default_status_next_month"
    # Bill-amount columns dropped before modelling (only BILL_AMT1 is kept).
    REDUNDANT_BILL_COLUMNS = ["BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"]

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (paths and preprocessor file name)."""
        self.config = config

    def split_data(self, data: pd.DataFrame, stratify_column: str,
                   test_size: float = 0.3, random_state: int = 42):
        """Split ``data`` into stratified train/test features and targets.

        Args:
            data: Frame containing the features and ``stratify_column``.
            stratify_column: Target column; used for stratification and then
                separated from the features.
            test_size: Fraction of rows assigned to the test set.
            random_state: Seed passed to ``train_test_split``.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.
        """
        train_data, test_data = train_test_split(
            data,
            stratify=data[stratify_column],
            test_size=test_size,
            random_state=random_state,
        )
        logger.info("Data split into train and test sets successfully!")

        X_train = train_data.drop(columns=[stratify_column])
        y_train = train_data[stratify_column]
        X_test = test_data.drop(columns=[stratify_column])
        y_test = test_data[stratify_column]
        logger.info("Features and target variable retrieved successfully!")

        return X_train, X_test, y_train, y_test

    def get_preprocessor_for_default_payment(self, data: pd.DataFrame) -> ColumnTransformer:
        """Build an unfitted preprocessor that standardizes every column of ``data``.

        Args:
            data: Feature frame whose column names define what gets scaled.

        Returns:
            A ``ColumnTransformer`` applying ``StandardScaler`` to all columns of
            ``data``; any other column seen at fit time is passed through.
        """
        cols_to_preprocess = list(data.columns)

        scaler_pipeline = Pipeline(
            steps=[
                ("standardization", StandardScaler())
            ]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("scaler_pipeline", scaler_pipeline, cols_to_preprocess),
            ],
            remainder='passthrough'
        )
        logger.info("Preprocessor created successfully!")

        return preprocessor

    def get_numerical_features(self, df, cardinality_threshold: int = 13) -> List[str]:
        """Return the columns that have more than ``cardinality_threshold`` distinct values.

        Cardinality is used as a heuristic to tell continuous features apart from
        encoded categorical ones.
        """
        return [col for col in df.columns if df[col].nunique() > cardinality_threshold]

    def remove_outliers(self, df: pd.DataFrame, numerical_features: List[str]) -> pd.DataFrame:
        """Clip each listed column to ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

        Despite the name, no rows are removed: out-of-range values are capped at
        the bounds. The input frame is left untouched; a clipped copy is returned.
        """
        clipped = df.copy()

        for col in numerical_features:
            q1 = clipped[col].quantile(0.25)
            q3 = clipped[col].quantile(0.75)

            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr

            clipped[col] = clipped[col].clip(lower_bound, upper_bound)

        return clipped

    def handle_undocumented_categories(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fold undocumented category codes into the existing catch-all codes.

        EDUCATION values 0, 5 and 6 become 4; MARRIAGE value 0 becomes 3.
        Works on a copy so a filtered slice passed in by the caller is never
        mutated (avoids pandas' SettingWithCopyWarning).
        """
        cleaned = df.copy()
        cleaned["EDUCATION"] = cleaned["EDUCATION"].replace([0, 5, 6], 4)
        cleaned["MARRIAGE"] = cleaned["MARRIAGE"].replace(0, 3)

        return cleaned

    def handle_imbalance(self, df: pd.DataFrame, target_column: str = TARGET_COLUMN) -> pd.DataFrame:
        """Oversample the minority class of ``df`` with SMOTE.

        Args:
            df: Frame containing the features and ``target_column``. To avoid
                leaking synthetic samples into evaluation data, pass only the
                training partition.
            target_column: Name of the class-label column.

        Returns:
            A new frame with the resampled features and the target column last.
        """
        logger.info("Handling imbalance in the data...")
        X = df.drop(columns=[target_column])
        y = df[target_column]

        logger.info(f"Distribution of the target feature before resampling: {y.value_counts()}")

        smote = SMOTE(random_state=42)
        X, y = smote.fit_resample(X, y)

        logger.info(f"Distribution of the target feature after resampling: {y.value_counts()}")

        balanced = X.copy()
        balanced[target_column] = y

        logger.info(f"Shape of the data: {balanced.shape}\n")

        return balanced

    def perform_data_transformation_for_default_payment(self):
        """Run the full transformation pipeline and persist its artifacts.

        Steps: load the CSV, rename columns, clip outliers, drop rows with a
        negative BILL_AMT1, fix undocumented categories, drop redundant bill
        columns, split into train/test, oversample the training set, standardize,
        and save the arrays plus the fitted preprocessor.
        """
        logger.info("Starting data transformation...")
        target = self.TARGET_COLUMN

        # Index 1 of the configured path lists is the entry this method works on
        # (presumably the default-payment dataset — confirm against config.yaml).
        data_path = self.config.data_path[1]

        data = pd.read_csv(data_path)
        logger.info(f"Data loaded successfully from {data_path}")

        data = data.rename(columns={'PAY_0': 'PAY_1',
                                    'default.payment.next.month': target})
        logger.info("Columns renamed successfully")
        logger.info(f"Data columns: {data.columns}")

        numerical_features = self.get_numerical_features(data)
        logger.info(f"Numerical features: {numerical_features}")

        # NOTE(review): clipping bounds are estimated on the full dataset, before
        # the train/test split; consider fitting them on the training set only.
        data = self.remove_outliers(data, numerical_features)
        logger.info("Outliers removed successfully")

        # .copy() detaches the filtered rows from the original frame so later
        # column edits do not operate on a view.
        data = data[data["BILL_AMT1"] >= 0].copy()
        logger.info(f"Data with BILL_AMT1 >= 0, shape: {data.shape}")

        data = self.handle_undocumented_categories(data)
        logger.info(f"EDUCATION: \n{data['EDUCATION'].value_counts()}")
        logger.info(f"MARRIAGE: \n{data['MARRIAGE'].value_counts()}")
        logger.info("Undocumented categories handled successfully")

        data = data.drop(columns=self.REDUNDANT_BILL_COLUMNS)
        logger.info("Columns BILL_AMT2, BILL_AMT3, BILL_AMT4, BILL_AMT5, BILL_AMT6 dropped successfully")
        logger.info(f"Data columns: {data.columns}")

        # Split BEFORE resampling: SMOTE on the full dataset would place synthetic
        # points derived from test rows into training (and vice versa) and would
        # make the test set artificially balanced.
        X_train, X_test, y_train, y_test = self.split_data(data, target)
        logger.info("Data split into train and test sets successfully")

        train_frame = X_train.copy()
        train_frame[target] = y_train
        train_frame = self.handle_imbalance(train_frame, target)
        X_train = train_frame.drop(columns=[target])
        y_train = train_frame[target]
        logger.info("Imbalance handled successfully")

        preprocessor = self.get_preprocessor_for_default_payment(X_train)
        logger.info("Preprocessor retrieved successfully")

        logger.info(f"Before transformation: \n{X_train.head()}")

        # Fit on the training data only; the test set is transformed with the
        # statistics learned from training.
        X_train = preprocessor.fit_transform(X_train)
        X_test = preprocessor.transform(X_test)

        logger.info(f"After transformation: \n{X_train[:5]}")

        y_train = y_train.to_numpy()
        y_test = y_test.to_numpy()

        preprocessed_data_path = self.config.preprocessed_data_path[1]
        os.makedirs(preprocessed_data_path, exist_ok=True)

        arrays = {
            "X_train.npy": X_train,
            "X_test.npy": X_test,
            "y_train.npy": y_train,
            "y_test.npy": y_test,
        }
        for file_name, array in arrays.items():
            np.save(os.path.join(preprocessed_data_path, file_name), array)

        logger.info("Data saved successfully!")

        joblib.dump(preprocessor, os.path.join(preprocessed_data_path, self.config.preprocessor_name))
        logger.info("Preprocessor saved successfully!")

        logger.info("Data transformation completed successfully!")

In [13]:
# Build the stage configuration and run the default-payment transformation.
# Failures are logged and then re-raised so the cell still fails visibly.
try:
    config_manager = ConfigurationManager()
    data_transformation_config = config_manager.get_data_transformation_config()
    data_transformer = DataTransformation(config = data_transformation_config)
    # DataTransformation defines no `perform_data_transformation`; the entry
    # point is `perform_data_transformation_for_default_payment`.
    data_transformer.perform_data_transformation_for_default_payment()

except Exception as e:
    logger.error(f"Failed to perform data transformation! Error: {e}")
    # Bare `raise` re-raises the active exception with its original traceback.
    raise

[2024-07-05 17:17:30,261: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-05 17:17:30,263: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-05 17:17:30,266: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-05 17:17:30,266: INFO: common: created directory at: artifacts]
[2024-07-05 17:17:30,266: INFO: common: created directory at: artifacts/data_transformation]
[2024-07-05 17:17:30,267: INFO: 1458035121: Starting data transformation...]
[2024-07-05 17:17:30,295: INFO: 1458035121: Data loaded successfully from artifacts/data_ingestion/default.csv]
[2024-07-05 17:17:30,296: INFO: 1458035121: Columns renamed successfully]
[2024-07-05 17:17:30,296: INFO: 1458035121: Data columns: Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_A