In [1]:
import os 
import pandas as pd

In [2]:
%pwd

'c:\\Users\\admin\\Desktop\\Predictive Maintenance\\Predictive_Maintenance_With_MLops\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\admin\\Desktop\\Predictive Maintenance\\Predictive_Maintenance_With_MLops'

In [5]:
# writing entity : is just like a return type 


from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataCleaningConfig:
    root_dir: Path
    unzip_data_dir_train: Path
    
    

In [6]:
from src.Mlflow_Project.constants import *
from src.Mlflow_Project.utils.utility import FileOperations

In [7]:
# configuration manager class , will read all yaml files 

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = FileOperations.read_yaml(config_filepath)
        self.params = FileOperations.read_yaml(params_filepath)
        self.schema = FileOperations.read_yaml(schema_filepath)

        FileOperations.create_directories([self.config.artifacts_root])


    def data_cleaning_config(self) -> DataCleaningConfig:
        config = self.config.data_cleaning

        FileOperations.create_directories([config.root_dir])

        data_cleaning_config = DataCleaningConfig(
            root_dir=config.root_dir,
            unzip_data_dir_train = config.unzip_data_dir_train

        )

        return data_cleaning_config

In [8]:
import os 
from src.Mlflow_Project.__init__ import logger 
import pandas as pd 
import numpy as np 

In [9]:
class DataCleaning:
    def __init__(self, config: DataCleaningConfig):
        self.config = config


    ## Note: You can add different data transformtion techniques such as Scaler, PCA and all 
    # You can perform all kinds of EDA in ML cycle here before passing this data to the model 

    # defining train test split method 

    def data_cleaning(self):
        df = pd.read_csv(self.config.unzip_data_dir_train, sep = " ")

        columns_to_drop = ['Unnamed: 26', 'Unnamed: 27']
        df = df.drop(columns=columns_to_drop)

        index_names = ['engine', 'cycle']
        setting_names = ['setting_1', 'setting_2', 'setting_3']
        sensor_names= [ "(Fan inlet temperature) (◦R)",
                "(LPC outlet temperature) (◦R)",
                "(HPC outlet temperature) (◦R)",
                "(LPT outlet temperature) (◦R)",
                "(Fan inlet Pressure) (psia)",
                "(bypass-duct pressure) (psia)",
                "(HPC outlet pressure) (psia)",
                "(Physical fan speed) (rpm)",
                "(Physical core speed) (rpm)",
                "(Engine pressure ratio(P50/P2)",
                "(HPC outlet Static pressure) (psia)",
                "(Ratio of fuel flow to Ps30) (pps/psia)",
                "(Corrected fan speed) (rpm)",
                "(Corrected core speed) (rpm)",
                "(Bypass Ratio) ",
                "(Burner fuel-air ratio)",
                "(Bleed Enthalpy)",
                "(Required fan speed)",
                "(Required fan conversion speed)",
                "(High-pressure turbines Cool air flow)",
                "(Low-pressure turbines Cool air flow)" ]

        col_names = index_names + setting_names + sensor_names
        df.columns = col_names


        print("na values available in data \n")
        print(df.isna().sum())
        df = df.dropna()
        print("after droping na values \n")
        print(df.isna().sum())
        print("Uniques Values : ")
        print(df.nunique())
        print("Observing columns with only one uniques values: ")
        unwanted=[]
        for i in df.select_dtypes(include= np.number):
            if df[i].nunique()==1:
                unwanted.append(i)
        print(unwanted)
        print("columns have only one unique value, so we are dropping theses columns")
        df.drop(columns=unwanted, inplace=True)

        drop_columns = ['(Corrected core speed) (rpm)']

        df.drop(columns = drop_columns, axis=1, inplace=True)

         # add target fucntion 
        # define the maximum life of each engine, 
        #as this could be used to obtain the RUL at each point in time of the engine's life 

        #remaining useful life (RUL) of each engine in the test dataset.
        #RUL is equivalent of number of flights remained for the engine after the last datapoint in the test dataset.

        data_rul = df.groupby(['engine']).agg({'cycle':'max'})
        data_rul.rename(columns={'cycle':'life'},inplace=True)

        data_train = df.merge(data_rul,how='left',on=['engine'])

        data_train['RUL']=data_train['life']-data_train['cycle']
        data_train.drop(['life'],axis=1,inplace=True)

        drop_columns = ['(Corrected fan speed) (rpm)','(Physical fan speed) (rpm)','(HPC outlet temperature) (◦R)']
        data_train.drop(columns = drop_columns, axis=1, inplace=True)
        
        data_train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index = False)
        #test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index = False)

    


In [10]:
# defining Pipeline 

try:
    config = ConfigurationManager()
    data_cleaning_config = config.data_cleaning_config()
    data_cleaning = DataCleaning(config = data_cleaning_config)  
    data_cleaning.data_cleaning()
except Exception as e:
    raise e 

[2023-10-31 09:48:56,162: INFO: yaml file: config\config.yaml loaded successfully]
[2023-10-31 09:48:56,166: INFO: yaml file: params.yaml loaded successfully]
[2023-10-31 09:48:56,174: INFO: yaml file: schema.yaml loaded successfully]
[2023-10-31 09:48:56,177: INFO: created directory at: artifacts]
[2023-10-31 09:48:56,180: INFO: created directory at: artifacts/data_cleaning]
na values available in data 

engine                                     0
cycle                                      0
setting_1                                  0
setting_2                                  0
setting_3                                  0
(Fan inlet temperature) (◦R)               0
(LPC outlet temperature) (◦R)              0
(HPC outlet temperature) (◦R)              0
(LPT outlet temperature) (◦R)              0
(Fan inlet Pressure) (psia)                0
(bypass-duct pressure) (psia)              0
(HPC outlet pressure) (psia)               0
(Physical fan speed) (rpm)                 0
(Physi

In [39]:
df.shape

NameError: name 'df' is not defined

In [11]:
df = pd.read_csv("artifacts/data_ingestion/CMaps/train_FD001.txt", sep = " ")    

columns_to_drop = ['Unnamed: 26', 'Unnamed: 27']
df = df.drop(columns=columns_to_drop)

index_names = ['engine', 'cycle']
setting_names = ['setting_1', 'setting_2', 'setting_3']
sensor_names=[ "(Fan inlet temperature) (◦R)",
                "(LPC outlet temperature) (◦R)",
                "(HPC outlet temperature) (◦R)",
                "(LPT outlet temperature) (◦R)",
                "(Fan inlet Pressure) (psia)",
                "(bypass-duct pressure) (psia)",
                "(HPC outlet pressure) (psia)",
                "(Physical fan speed) (rpm)",
                "(Physical core speed) (rpm)",
                "(Engine pressure ratio(P50/P2)",
                "(HPC outlet Static pressure) (psia)",
                "(Ratio of fuel flow to Ps30) (pps/psia)",
                "(Corrected fan speed) (rpm)",
                "(Corrected core speed) (rpm)",
                "(Bypass Ratio) ",
                "(Burner fuel-air ratio)",
                "(Bleed Enthalpy)",
                "(Required fan speed)",
                "(Required fan conversion speed)",
                "(High-pressure turbines Cool air flow)",
                "(Low-pressure turbines Cool air flow)" ]

col_names = index_names + setting_names + sensor_names
df.columns = col_names

df.head()
#df.columns

Unnamed: 0,engine,cycle,setting_1,setting_2,setting_3,(Fan inlet temperature) (◦R),(LPC outlet temperature) (◦R),(HPC outlet temperature) (◦R),(LPT outlet temperature) (◦R),(Fan inlet Pressure) (psia),...,(Ratio of fuel flow to Ps30) (pps/psia),(Corrected fan speed) (rpm),(Corrected core speed) (rpm),(Bypass Ratio),(Burner fuel-air ratio),(Bleed Enthalpy),(Required fan speed),(Required fan conversion speed),(High-pressure turbines Cool air flow),(Low-pressure turbines Cool air flow)
0,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
1,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
2,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
3,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044
4,1,6,-0.0043,-0.0001,100.0,518.67,642.1,1584.47,1398.37,14.62,...,521.68,2388.03,8132.85,8.4108,0.03,391,2388,100.0,38.98,23.3669
