In [1]:
import os
# Step up one directory so the relative paths used by later cells resolve from
# the parent folder (presumably the project root -- confirm against the %pwd
# output). NOTE(review): the path is relative, so re-running this cell moves up
# one more level each time.
os.chdir("../")
%pwd

'/Users/tapankheni/Data_Science/Data Science Projects/Predictive_Maintenance_JetEngine'

In [14]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage."""

    # Directory the stage writes its output CSV files into.
    root_dir: Path
    # Folder scanned for the training ``.txt`` files.
    train_data_path: Path
    # Folder scanned for the test ``.txt`` files.
    test_data_path: Path
    # Column names applied, in order, to the header-less input files.
    all_schema: list
    # Column names removed from the feature frames before scaling.
    cols_to_drop: list
    

In [15]:
from PredictiveMaintenance.constants import CONFIG_YAML_FILE_PATH, PARAMS_YAML_FILE_PATH, SCHEMA_YAML_FILE_PATH
from PredictiveMaintenance.utils.common import read_yaml, create_directories

In [17]:
class ConfigurationManager:
    """Reads the project YAML files and builds per-stage configuration objects."""

    def __init__(self,
                 config_path = CONFIG_YAML_FILE_PATH,
                 params_path = PARAMS_YAML_FILE_PATH,
                 schema_path = SCHEMA_YAML_FILE_PATH):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_path: Location of the main configuration YAML.
            params_path: Location of the parameters YAML.
            schema_path: Location of the schema YAML.
        """
        yaml_sources = (
            ("config", config_path),
            ("params", params_path),
            ("schema", schema_path),
        )
        # Loaded in this fixed order; each result is stored as self.<name>.
        for attribute, yaml_path in yaml_sources:
            setattr(self, attribute, read_yaml(yaml_path))

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Assemble the settings object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataTransformationConfig populated from the ``data_transformation``
            section of the config and from the ``COLUMNS`` /
            ``COLS_TO_DROP_CATBOOST`` sections of the schema (keys only).
        """
        stage_settings = self.config.data_transformation
        column_spec = self.schema.COLUMNS
        droppable_spec = self.schema.COLS_TO_DROP_CATBOOST

        create_directories([stage_settings.root_dir])

        return DataTransformationConfig(
            root_dir=stage_settings.root_dir,
            train_data_path=stage_settings.train_data_path,
            test_data_path=stage_settings.test_data_path,
            all_schema=list(column_spec.keys()),
            cols_to_drop=list(droppable_spec.keys()),
        )

In [18]:
from PredictiveMaintenance import logger
import pandas as pd
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [26]:
class DataTransformation:
    """Turns the raw per-engine cycle files into scaled train/test CSV files.

    The stage reads every ``.txt`` file from the configured train and test
    folders, derives the remaining-useful-life (RUL) target for the training
    sets, keeps only the last recorded cycle of each engine for the test sets,
    drops the configured columns, replaces outliers, scales the features and
    writes ``X_train.csv``, ``X_test.csv`` and ``y_train.csv`` to ``root_dir``.
    """

    # Upper bound applied to the RUL target before it is written out.
    MAX_RUL = 260

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration.

        Args:
            config: Paths and column lists used by the transformation.
        """
        self.config = config

    def add_RUL_feature(self, df: pd.DataFrame) -> pd.DataFrame:
        """Append a ``RUL`` column: cycles left until the engine's last cycle.

        Args:
            df: Frame with ``"Engine Number"`` and ``"Times/ in cycle"`` columns.

        Returns:
            A new frame with ``RUL`` as its last column, computed as the
            engine's maximum ``"Times/ in cycle"`` minus the row's own value.
        """
        df_RUL = (
            df.groupby("Engine Number")
            .agg({"Times/ in cycle": "max"})
            .rename(columns={"Times/ in cycle": "total life"})
        )
        logger.info(df_RUL.head())

        df = df.merge(df_RUL, on=["Engine Number"], how="left")
        logger.info(df.head())

        df["RUL"] = df["total life"] - df["Times/ in cycle"]
        df = df.drop(columns=["total life"])
        logger.info(df.head())

        return df

    def train_test_split(self, df1: pd.DataFrame, df2: pd.DataFrame):
        """Separate features and target of a training frame; pass the test frame through.

        Args:
            df1: Training frame whose LAST column is the target.
            df2: Test frame; all of its columns are treated as features.

        Returns:
            Tuple ``(X_train, y_train, X_test)``.
        """
        X_train = df1.iloc[:, :-1]
        y_train = df1.iloc[:, -1]
        # The test frame carries no target column, so every column is a feature.
        X_test = df2.copy()

        return X_train, y_train, X_test

    def replace_outliers_with_median(
        self,
        df: pd.DataFrame,
        lower_q: float = 0.10,
        upper_q: float = 0.90,
        whisker: float = 1.5,
    ) -> pd.DataFrame:
        """Overwrite out-of-range values with the column median, in place.

        A value is an outlier when it lies more than ``whisker`` times the
        ``upper_q - lower_q`` quantile spread below the lower quantile or above
        the upper quantile. Note the defaults use the 10th/90th percentiles,
        not the quartiles of a classic IQR rule.

        Args:
            df: Numeric frame; it is modified in place.
            lower_q: Lower quantile of the reference range.
            upper_q: Upper quantile of the reference range.
            whisker: Multiplier applied to the quantile spread.

        Returns:
            The same ``df`` object, after replacement.
        """
        lower = df.quantile(lower_q)
        upper = df.quantile(upper_q)
        spread = upper - lower

        outlier_low = lower - whisker * spread
        outlier_high = upper + whisker * spread

        for col in df.columns:
            col_median = df[col].median()
            df.loc[df[col] < outlier_low[col], col] = col_median
            df.loc[df[col] > outlier_high[col], col] = col_median

        return df

    def prepare_test_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Keep only the last recorded cycle of every engine.

        Args:
            df: Frame with ``"Engine Number"`` and ``"Times/ in cycle"`` columns.

        Returns:
            A new frame holding, per engine, the row(s) whose
            ``"Times/ in cycle"`` equals that engine's maximum.
        """
        df_cycle = (
            df.groupby(["Engine Number"])
            .agg({"Times/ in cycle": "max"})
            .rename(columns={"Times/ in cycle": "life"})
        )

        df = df.merge(df_cycle, how="left", on=["Engine Number"])

        # Filter and drop in one expression so no in-place edit is made on a
        # slice of the merged frame (avoids the chained-assignment warning).
        df = df[df["Times/ in cycle"] == df["life"]].drop(columns=["life"])

        logger.info(df.head())
        return df

    def _read_txt_frames(self, folder_path) -> dict:
        """Load every ``.txt`` file in ``folder_path`` into a DataFrame keyed by file stem."""
        frames = {}
        for filename in os.listdir(folder_path):
            if not filename.endswith(".txt"):
                continue

            df_name = os.path.splitext(filename)[0]
            # Raw string: "\s" is not a valid escape in a normal string literal.
            df = pd.read_csv(
                os.path.join(folder_path, filename),
                sep=r"\s+",
                header=None,
                names=self.config.all_schema,
            )
            frames[df_name] = df

            logger.info(f"DataFrame '{df_name}' has been created with shape: {df.shape}")
        return frames

    def _cap_target(self, y: pd.Series) -> pd.Series:
        """Return ``y`` with values above ``MAX_RUL`` lowered to ``MAX_RUL``."""
        return y.clip(upper=self.MAX_RUL)

    def do_data_transformation(self):
        """Run the full transformation and write the resulting CSV files.

        Raises:
            KeyError: If any of ``train_FD001``, ``train_FD003``, ``test_FD001``
                or ``test_FD003`` was not found among the loaded files.
        """
        dfs = {}
        dfs.update(self._read_txt_frames(self.config.train_data_path))
        dfs.update(self._read_txt_frames(self.config.test_data_path))

        train_FD001 = dfs["train_FD001"]
        train_FD003 = dfs["train_FD003"]
        test_FD001 = dfs["test_FD001"]
        test_FD003 = dfs["test_FD003"]

        logger.info("Start adding RUL features for train_FD001 and train_FD003\n")
        train_FD001 = self.add_RUL_feature(train_FD001)
        train_FD003 = self.add_RUL_feature(train_FD003)
        logger.info("Finished adding RUL features for train_FD001 and train_FD003\n")

        logger.info("Start preparing test_FD001 and test_FD003\n")
        test_FD001 = self.prepare_test_data(test_FD001)
        test_FD003 = self.prepare_test_data(test_FD003)
        logger.info("Finished preparing test_FD001 and test_FD003\n")

        logger.info("train_test_split process started.....")
        X_train_FD001, y_train_FD001, X_test_FD001 = self.train_test_split(df1=train_FD001, df2=test_FD001)
        X_train_FD003, y_train_FD003, X_test_FD003 = self.train_test_split(df1=train_FD003, df2=test_FD003)
        logger.info("train_test_split completed.")

        logger.info(f"X_train_FD001: {X_train_FD001.shape}, y_train_FD001: {y_train_FD001.shape}, X_test_FD001: {X_test_FD001.shape}")
        logger.info(f"X_train_FD003: {X_train_FD003.shape}, y_train_FD003: {y_train_FD003.shape}, X_test_FD003: {X_test_FD003.shape}")

        logger.info("Merging train and test datasets.")
        # ignore_index keeps row labels unique; features and target are
        # concatenated in the same order so they stay aligned by position.
        X_train = pd.concat([X_train_FD001, X_train_FD003], ignore_index=True)
        y_train = pd.concat([y_train_FD001, y_train_FD003], ignore_index=True)
        logger.info(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

        X_test = pd.concat([X_test_FD001, X_test_FD003], ignore_index=True)
        logger.info(f"X_test: {X_test.shape}")
        logger.info("Completed train and test datasets.")

        logger.info("Dropping unnecessory columns.")
        logger.info(f"Before dropping: {X_train.columns}")

        X_train = X_train.drop(columns=self.config.cols_to_drop)
        X_test = X_test.drop(columns=self.config.cols_to_drop)

        logger.info(f"After dropping: {X_train.columns}")
        logger.info("Unnecessory columns are droped.")

        # Series.clip returns a new Series, so the result must be re-assigned
        # for the cap to take effect.
        y_train = self._cap_target(y_train)

        logger.info("Outlier dropping start")
        X_train = self.replace_outliers_with_median(X_train)
        # NOTE(review): thresholds for the test set come from the test set
        # itself, not from the training data -- confirm this is intended.
        X_test = self.replace_outliers_with_median(X_test)
        logger.info("Outlier dropping completed.")

        # The first remaining column is passed through unscaled; all other
        # columns are scaled.
        passthrough_column = X_train.columns[0]
        numerical_columns = list(X_train.columns[1:])

        logger.info("Data preprocessing start")
        preprocessor = ColumnTransformer(
            transformers=[
                ("Numeric", RobustScaler(), numerical_columns)
            ],
            remainder='passthrough'
        )
        preprocessor_pipeline = Pipeline(
            steps=[
                ("preprocessor", preprocessor)
            ]
        )

        # ColumnTransformer emits the transformed columns first and the
        # passthrough remainder last, so the labels follow that order.
        cols_names = numerical_columns + [passthrough_column]

        X_train = pd.DataFrame(preprocessor_pipeline.fit_transform(X_train), columns=cols_names)
        X_test = pd.DataFrame(preprocessor_pipeline.transform(X_test), columns=cols_names)

        logger.info(X_train.head())
        logger.info(X_test.head())
        logger.info("Data preprocessing completed.")

        X_train.to_csv(os.path.join(self.config.root_dir, 'X_train.csv'), index=False)
        X_test.to_csv(os.path.join(self.config.root_dir, 'X_test.csv'), index=False)

        y_train.to_csv(os.path.join(self.config.root_dir, 'y_train.csv'), index=False)

In [28]:
# Build the stage configuration and run the data-transformation stage.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    obj = DataTransformation(config=data_transformation_config)
    obj.do_data_transformation()
except Exception:
    # Record the failure (with traceback) in the project log, then re-raise
    # the original exception unchanged.
    logger.exception("Data transformation stage failed")
    raise

[2024-03-05 15:15:59,240: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-03-05 15:15:59,242: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-05 15:15:59,246: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-03-05 15:15:59,247: INFO: common: created directory at: artifacts]
[2024-03-05 15:15:59,248: INFO: common: created directory at: artifacts/data_transformation]
[2024-03-05 15:15:59,276: INFO: 2268282833: DataFrame 'train_FD001' has been created with shape: (20631, 26)]
[2024-03-05 15:15:59,307: INFO: 2268282833: DataFrame 'train_FD003' has been created with shape: (24720, 26)]
[2024-03-05 15:15:59,327: INFO: 2268282833: DataFrame 'test_FD003' has been created with shape: (16596, 26)]
[2024-03-05 15:15:59,343: INFO: 2268282833: DataFrame 'test_FD001' has been created with shape: (13096, 26)]
[2024-03-05 15:15:59,348: INFO: 2268282833: Start adding RUL features for train_FD001 and train_FD003
]
[2024-03-05 15:15:59,353: INFO: 2