In [3]:
import os
os.chdir("../")
%pwd

'/Users/tapankheni/Data_Science/Data Science Projects/Wafers_Fault_Prediction'

In [26]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    cols_to_drop: list

In [27]:
from WafersFault.constants import CONFIG_YAML_FILE_PATH, PARAMS_YAML_FILE_PATH, SCHEMA_YAML_FILE_PATH
from WafersFault.utils.common import read_yaml, create_directories

In [28]:
class ConfigurationManager:
    def __init__(self,
                 config_path = CONFIG_YAML_FILE_PATH,
                 params_path = PARAMS_YAML_FILE_PATH,
                 schema_path = SCHEMA_YAML_FILE_PATH):
        
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)
        self.schema = read_yaml(schema_path)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        drop_schema = self.schema.COLS_TO_DROP

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            cols_to_drop=list(drop_schema.keys()),
        )

        return data_transformation_config     

In [29]:
from WafersFault import logger
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import RobustScaler
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split

In [30]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig) -> None:
        self.config = config

    def do_data_transformation(self):

        df = pd.read_csv(self.config.data_path)

        logger.info("dropping the unnecessary columns")
        df.drop(columns=self.config.cols_to_drop, axis=1, inplace=True)

        logger.info("spliting dependent and independent features")
        X = df.iloc[:,:-1]
        y = df["Good/Bad"]

        logger.info(f"Shape of X: {X.shape}, shape of y: {y.shape}")

        logger.info("data preprocessing started....")
        numerical_columns = list(X.columns)

        preprocessor = ColumnTransformer(
            transformers=[
                ("Numeric_1", KNNImputer(n_neighbors=5), numerical_columns),
                ("Numeric_2", Pipeline([
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", (RobustScaler()))
                ]), numerical_columns)
            ]
        )

        preprocessor_pipeline = Pipeline(
            steps=[
                ("preprocessor", preprocessor)
            ]
        )

        X_transformed = preprocessor_pipeline.fit_transform(X)
        logger.info(f"Shape of the transformed X: {X.shape}")
        logger.info("data preprocessing completed....")

        logger.info("class imbalance handling phase started")

        resampler = SMOTETomek(sampling_strategy="auto")
        X_res, y_res = resampler.fit_resample(X_transformed, y)

        logger.info(f"Before resampling, Shape of the training instances: {np.c_[X_transformed, y].shape}\n")
        logger.info(f"After resampling, Shape of the training instances: {np.c_[X_res, y_res].shape}\n")
        logger.info(y.value_counts())
        logger.info(y_res.value_counts())

        logger.info("done with class imbalance")

        logger.info("train_test_split started....")
        X_train, y_train, X_test, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
        y_train = np.where(y_train==-1, 0, 1)
        y_test = np.where(y_test==-1, 0, 1)

        logger.info(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        logger.info(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
        logger.info("train_test_split completed.....")

        np.save(os.path.join(self.config.root_dir, "X_train.npy"), X_train)
        np.save(os.path.join(self.config.root_dir, "X_test.npy"), X_test)
        np.save(os.path.join(self.config.root_dir, "y_train.npy"), y_train)
        np.save(os.path.join(self.config.root_dir, "y_test.npy"), y_test)


In [31]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    obj = DataTransformation(config=data_transformation_config)
    obj.do_data_transformation()

except Exception as e:
    raise e

[2024-03-12 11:27:24,425: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-03-12 11:27:24,427: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-12 11:27:24,467: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-03-12 11:27:24,472: INFO: common: created directory at: artifacts]
[2024-03-12 11:27:24,473: INFO: common: created directory at: artifacts/data_transformation]
[2024-03-12 11:27:24,483: INFO: 3373966542: dropping the unnecessary columns]
[2024-03-12 11:27:24,484: INFO: 3373966542: spliting dependent and independent features]
[2024-03-12 11:27:24,484: INFO: 3373966542: Shape of X: (100, 464), shape of y: (100,)]
[2024-03-12 11:27:24,485: INFO: 3373966542: data preprocessing started....]
[2024-03-12 11:27:24,550: INFO: 3373966542: Shape of the transformed X: (100, 464)]
[2024-03-12 11:27:24,552: INFO: 3373966542: data preprocessing completed....]
[2024-03-12 11:27:24,553: INFO: 3373966542: class imbalance handling phase started