In [1]:
import os

In [2]:
%pwd

'c:\\DataScience\\Projects\\Predictive_maintenance\\Predictive_Maintenance_JetEngine\\research'

In [3]:
# The notebook starts in the project's `research/` folder (see the %pwd output
# above); step up one level so relative paths and the `src` package resolve
# from the project root. Guarded so that re-running this cell does not climb
# one directory further on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\DataScience\\Projects\\Predictive_maintenance\\Predictive_Maintenance_JetEngine'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings bundle for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    and consumed by ``DataTransformation``. ``frozen=True`` blocks attribute
    re-assignment; the list fields themselves remain mutable objects.
    """
    # Directory that receives X_train.csv, y_train.csv, X_test.csv and RUL.csv.
    # NOTE(review): values are passed straight from the YAML config, so they may
    # be plain strings rather than Path objects at runtime — confirm.
    root_dir: Path
    # Raw training file, read as whitespace-separated text without a header row.
    train_data_path: Path
    # Raw test file, read the same way as the training file.
    test_data_path: Path
    # File read into a single "RUL" column and re-saved as RUL.csv.
    ground_truth_data_path: Path
    # Column names applied to the header-less train/test files (schema COLUMNS keys).
    columns: list
    # Column names removed from the feature frame (schema COLUMNS_TO_DROP keys).
    columns_to_drop: list

In [6]:
from src.PredictiveMaintenance.constants import *
from src.PredictiveMaintenance.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML files and hands out per-stage config objects.

    On construction the config, params and schema files are read with
    ``read_yaml`` and the artifacts root directory is created.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main config file.
            params_filepath: Location of the params file.
            schema_filepath: Location of the schema file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Assemble the settings for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataTransformationConfig: paths from the ``data_transformation``
            section of the config plus the key lists of the schema's
            ``COLUMNS`` and ``COLUMNS_TO_DROP`` entries.
        """
        stage = self.config.data_transformation
        all_columns = self.schema.COLUMNS
        dropped_columns = self.schema.COLUMNS_TO_DROP

        # Output directory must exist before the stage writes into it.
        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=stage.root_dir,
            train_data_path=stage.train_data_path,
            test_data_path=stage.test_data_path,
            ground_truth_data_path=stage.ground_truth_data_path,
            columns=list(all_columns.keys()),
            columns_to_drop=list(dropped_columns.keys()),
        )

In [8]:
import os
from src.PredictiveMaintenance import logger
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:
class DataTransformation:
    """Prepares the raw engine files for modelling.

    Computes the remaining-useful-life (RUL) target for the training data,
    replaces IQR outliers in the training features with the column median,
    drops the configured columns, scales the features and writes the
    resulting CSV files into ``config.root_dir``.
    """

    # Identifier columns that are not sensor readings and must never be imputed.
    _ID_COLUMNS = ('Engine Number', 'Times/ in cycle')

    # String annotation: the class can be defined without the config class
    # having to exist at definition time.
    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration.

        Args:
            config: Object exposing ``root_dir``, ``train_data_path``,
                ``test_data_path``, ``ground_truth_data_path``, ``columns``
                and ``columns_to_drop``.
        """
        logger.info("data transformation starteD")
        self.config = config

    def Calculate_RUL(self, df):
        """Return a copy of ``df`` with a trailing ``RUL`` column.

        RUL is the engine's largest 'Times/ in cycle' value minus the row's
        own 'Times/ in cycle'. The input frame is not modified; row order and
        index are preserved.

        Args:
            df: Frame containing 'Engine Number' and 'Times/ in cycle'.

        Returns:
            pandas.DataFrame: ``df`` plus the ``RUL`` column (appended last).
        """
        # transform('max') broadcasts each engine's last cycle back onto its
        # rows, which avoids the merge / temporary-column round-trip.
        last_cycle = df.groupby('Engine Number')['Times/ in cycle'].transform('max')
        return df.assign(RUL=last_cycle - df['Times/ in cycle'])

    def impute_outliers(self, data):
        """Replace IQR outliers in the sensor columns with the column median.

        A value is an outlier when it lies below ``Q1 - 1.5*IQR`` or above
        ``Q3 + 1.5*IQR``. The identifier columns 'Engine Number' and
        'Times/ in cycle' are left untouched. The input frame is not
        modified; a cleaned copy is returned.

        Args:
            data: Feature frame (numeric sensor columns).

        Returns:
            pandas.DataFrame: Copy of ``data`` with outliers replaced.
        """
        # Work on a copy: callers pass slices of a larger frame, and writing
        # into those triggers SettingWithCopyWarning / mutates caller state.
        cleaned = data.copy()
        sensor_cols = [col for col in cleaned.columns if col not in self._ID_COLUMNS]

        for col in sensor_cols:
            values = cleaned[col]
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - (1.5 * iqr)
            upper_bound = q3 + (1.5 * iqr)

            # Median and mask are computed once from the untouched column so
            # low and high outliers receive the same replacement value.
            median = values.median()
            is_outlier = (values < lower_bound) | (values > upper_bound)
            if is_outlier.any():
                cleaned.loc[is_outlier, col] = median
        return cleaned

    def train_test_spliting(self):
        """Build the train/test feature files and write them to ``root_dir``.

        Reads the train, test and ground-truth files, derives the RUL target
        for the training data, imputes training outliers, drops
        ``config.columns_to_drop`` from both sets (the test set is first
        reduced to the last row per engine), scales the features and writes
        ``X_train.csv``, ``y_train.csv``, ``X_test.csv`` and ``RUL.csv``.

        Returns:
            None
        """
        cfg = self.config

        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        read_opts = dict(names=cfg.columns, sep=r"\s+", header=None)
        train_data = pd.read_csv(cfg.train_data_path, **read_opts)
        test_data = pd.read_csv(cfg.test_data_path, **read_opts)
        ground_truth_data = pd.read_csv(cfg.ground_truth_data_path, names=["RUL"])
        logger.info("Data loaDeD")
        logger.info(train_data.shape)

        train_data = self.Calculate_RUL(train_data)
        logger.info(train_data["RUL"].head(2))
        logger.info("rul calcualateD")
        logger.info(train_data.shape)

        # Calculate_RUL appends RUL as the last column.
        X_train = train_data.iloc[:, :-1]
        y_train = train_data.iloc[:, -1]
        logger.info(y_train.head(2))

        X_train = self.impute_outliers(X_train)
        logger.info("outliers imputeD")

        # Use the same configured drop list for train and test so the two
        # feature sets cannot drift apart.
        columns_to_drop = list(cfg.columns_to_drop)
        X_train = X_train.drop(columns=columns_to_drop)
        logger.info(X_train.shape)

        logger.info(test_data.shape)
        # Keep only the last recorded row of every engine in the test set.
        test_data = (
            test_data.groupby("Engine Number")
            .last()
            .reset_index()
            .drop(columns=columns_to_drop)
        )

        feature_names = list(X_train.columns)
        # NOTE(review): as in the original code, the first remaining column is
        # passed through unscaled and only the others are standardised —
        # confirm this is intended.
        passthrough_cols = feature_names[:1]
        scaled_cols = feature_names[1:]

        logger.info("Data preprocessing start")
        preprocessor = ColumnTransformer(
            transformers=[
                ("Numeric", StandardScaler(), scaled_cols)
            ],
            remainder='passthrough'
        )
        preprocessor_pipeline = Pipeline(
            steps=[
                ("preprocessor", preprocessor)
            ]
        )

        # ColumnTransformer emits the transformed columns first and appends the
        # passthrough remainder at the end. Label the output in that order,
        # then restore the original column order so every header sits over
        # its own data.
        transformed_order = scaled_cols + passthrough_cols
        X_train = pd.DataFrame(
            preprocessor_pipeline.fit_transform(X_train), columns=transformed_order
        )[feature_names]
        X_test = pd.DataFrame(
            preprocessor_pipeline.transform(test_data), columns=transformed_order
        )[feature_names]

        X_train.to_csv(os.path.join(cfg.root_dir, "X_train.csv"), index=False)
        y_train.to_csv(os.path.join(cfg.root_dir, "y_train.csv"), index=False)
        X_test.to_csv(os.path.join(cfg.root_dir, "X_test.csv"), index=False)
        ground_truth_data.to_csv(os.path.join(cfg.root_dir, "RUL.csv"), index=False)

        logger.info("Splited data into training and test sets")
        logger.info(X_train.shape)
        logger.info(X_test.shape)



In [10]:
# Run the data-transformation stage end to end: load the configuration,
# build the stage config and write the transformed CSV files.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_spliting()
except Exception:
    # Record the failure with its traceback, then re-raise the original
    # exception unchanged (bare `raise` keeps the traceback intact).
    logger.exception("Data transformation stage failed")
    raise

[2024-03-09 15:01:38,925: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-03-09 15:01:38,935: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-09 15:01:38,943: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-03-09 15:01:38,949: INFO: common: created directory at: artifacts]
[2024-03-09 15:01:38,952: INFO: common: created directory at: artifacts/data_transformation]
[2024-03-09 15:01:38,952: INFO: 2736021010: data transformation starteD]
[2024-03-09 15:01:39,782: INFO: 2736021010: Data loaDeD]
(53759, 26)
[2024-03-09 15:01:39,813: INFO: 2736021010: 0    148
1    147
Name: RUL, dtype: int64]
[2024-03-09 15:01:39,819: INFO: 2736021010: rul calcualateD]
(53759, 27)
[2024-03-09 15:01:39,822: INFO: 2736021010: 0    148
1    147
Name: RUL, dtype: int64]
[2024-03-09 15:01:39,953: INFO: 2736021010: outliers imputeD]
(53759, 15)
(33991, 26)
c  (259, 15)
[2024-03-09 15:01:39,963: INFO: 2736021010: Data preprocessing start]
[2024-03-09 15:01