In [1]:
import os 

In [2]:
%pwd

'c:\\Users\\admin\\Desktop\\Predictive Maintenance\\Predictive_Maintenance_With_MLops\\research'

In [3]:
# Move from research/ up to the project root so the relative paths used in the
# cells below (e.g. "artifacts/...") resolve. The guard makes the cell safe to
# re-run: without it every execution would climb one directory higher.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\admin\\Desktop\\Predictive Maintenance\\Predictive_Maintenance_With_MLops'

In [5]:
import pandas as pd 

In [6]:
# Load the cleaned training set and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="artifacts/data_cleaning/train.csv")
data.head(n=5)

Unnamed: 0,engine,cycle,setting_1,setting_2,(LPC outlet temperature) (◦R),(LPT outlet temperature) (◦R),(bypass-duct pressure) (psia),(HPC outlet pressure) (psia),(Physical core speed) (rpm),(HPC outlet Static pressure) (psia),(Ratio of fuel flow to Ps30) (pps/psia),(Bypass Ratio),(Bleed Enthalpy),(High-pressure turbines Cool air flow),(Low-pressure turbines Cool air flow),RUL
0,1,2,0.0019,-0.0003,642.15,1403.14,21.61,553.75,9044.07,47.49,522.28,8.4318,392,39.0,23.4236,190
1,1,3,-0.0043,0.0003,642.35,1404.2,21.61,554.26,9052.94,47.27,522.42,8.4178,390,38.95,23.3442,189
2,1,4,0.0007,0.0,642.35,1401.87,21.61,554.45,9049.48,47.13,522.86,8.3682,392,38.88,23.3739,188
3,1,5,-0.0019,-0.0002,642.37,1406.22,21.61,554.0,9055.15,47.28,522.19,8.4294,393,38.9,23.4044,187
4,1,6,-0.0043,-0.0001,642.1,1398.37,21.61,554.67,9049.68,47.16,521.68,8.4108,391,38.98,23.3669,186


In [18]:
# Entity: configuration object for the data-transformation stage

from dataclasses import dataclass 
from pathlib import Path 


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Directory the stage writes its output files into.
    root_dir: Path
    # Location of the CSV file the stage reads.
    data_path: Path
    # File name used when the fitted scaler is saved inside root_dir.
    scaler_name: str

In [8]:
from src.Mlflow_Project.constants import *
from src.Mlflow_Project.utils.utility import FileOperations

In [19]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """Load the config, params and schema YAML files.

        Each file is parsed with FileOperations.read_yaml and kept on the
        instance; afterwards FileOperations.create_directories is called for
        the `artifacts_root` entry of the main config.
        """
        self.config = FileOperations.read_yaml(config_filepath)
        self.params = FileOperations.read_yaml(params_filepath)
        self.schema = FileOperations.read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        FileOperations.create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the DataTransformationConfig from the `data_transformation` section.

        FileOperations.create_directories is called for the section's
        `root_dir` before the config object is returned.
        """
        section = self.config.data_transformation

        FileOperations.create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            scaler_name=section.scaler_name,
        )
    

In [15]:
import joblib 
from src.Mlflow_Project.__init__ import logger 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler

In [22]:
class DataTransformation:
    """Splits the input CSV into train/test sets and standard-scales both."""

    def __init__(self, config: DataTransformationConfig):
        """Keep the stage configuration (data_path, root_dir, scaler_name)."""
        self.config = config

    # Note: further transformation techniques (another scaler, PCA, ...) and any
    # EDA-driven preprocessing can be added to this class before the data is
    # passed on to the model.

    def train_test_split(self, test_size=0.25, random_state=42):
        """Split the data, scale both parts and persist the results.

        Reads the CSV at ``config.data_path``, splits it into train and test
        sets, fits a StandardScaler on the train set only and applies it to
        both sets. Writes ``train_scaled.csv``, ``test_scaled.csv`` and the
        fitted scaler (under ``config.scaler_name``) into ``config.root_dir``.

        Args:
            test_size: Forwarded to sklearn's ``train_test_split``. The default
                of 0.25 equals the value sklearn uses when none is given, so
                existing callers get the same 75/25 proportions as before.
            random_state: Seed forwarded to sklearn's ``train_test_split``.
                Fixed by default so that re-running the stage reproduces the
                same split and the same fitted scaler; pass ``None`` for an
                unseeded split.

        Returns:
            None. The results are written to disk.
        """
        data = pd.read_csv(self.config.data_path)

        # Inside this body `train_test_split` resolves to the module-level
        # sklearn function, not to this method (class attributes are not in a
        # method's name-lookup scope).
        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        # Fit the scaler on the training data only, then reuse it for the test
        # data so no test-set statistics leak into the scaling.
        # NOTE(review): the scaler is fit on every column of the file. If
        # data_path is the cleaned train.csv previewed earlier in this
        # notebook, that includes the `engine` id and the `RUL` target --
        # confirm this is intended.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(train)
        logger.info("train scaling done..!!")

        X_test = scaler.transform(test)
        logger.info("test data transform done...!!")

        # The scaler returns plain arrays; restore the column names before saving.
        train_scaled_df = pd.DataFrame(X_train, columns=train.columns)
        test_scaled_df = pd.DataFrame(X_test, columns=test.columns)

        train_scaled_df.to_csv(os.path.join(self.config.root_dir, "train_scaled.csv"), index=False)
        test_scaled_df.to_csv(os.path.join(self.config.root_dir, "test_scaled.csv"), index=False)

        # Persist the fitted scaler next to the scaled data.
        joblib.dump(scaler, os.path.join(self.config.root_dir, self.config.scaler_name))

        logger.info("Splitted data into train and test")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)

        

In [23]:
# Run the data-transformation stage end to end: load the configuration, build
# the stage object, then split/scale the data and write the outputs.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_split()
except Exception as e:
    print("An error occurred:", str(e))
    # Re-raise so a failed stage is not mistaken for a successful cell and the
    # full traceback stays available for debugging.
    raise


[2023-10-13 09:31:14,504: INFO: yaml file: config\config.yaml loaded successfully]
[2023-10-13 09:31:14,507: INFO: yaml file: params.yaml loaded successfully]
[2023-10-13 09:31:14,512: INFO: yaml file: schema.yaml loaded successfully]
[2023-10-13 09:31:14,515: INFO: created directory at: artifacts]
[2023-10-13 09:31:14,517: INFO: created directory at: artifacts/data_transformation]
[2023-10-13 09:31:14,628: INFO: train scaling done..!!]
[2023-10-13 09:31:14,633: INFO: test data transform done...!!]
[2023-10-13 09:31:16,113: INFO: Splitted data into train and test]
[2023-10-13 09:31:16,115: INFO: (15472, 16)]
[2023-10-13 09:31:16,117: INFO: (5158, 16)]
(15472, 16)
(5158, 16)
