In [1]:
# Package name; interpolated as {project_name} into the %%writefile target paths used later in this notebook.
project_name="mice"

In [2]:
import os

In [3]:
%pwd

'f:\\ML_Ops_learning\\Mice-Protein-Expression\\research'

In [4]:
# Step up to the project root, but only while the kernel is still inside
# research/. Without the guard, re-running this cell climbs one more level
# each time and every relative path used afterwards breaks.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [5]:
%pwd

'f:\\ML_Ops_learning\\Mice-Protein-Expression'

## config/config.yaml

In [None]:
%%writefile -a config/config.yaml


# Settings read by ConfigurationManager.get_data_transformation_config.
data_transformation:
  # Directory created for this stage; train.csv and test.csv are written here.
  root_dir: artifacts/data_transformation
  # Excel workbook loaded as the input of the transformation.
  data_path: artifacts/data_ingestion/Data_Cortex_Nuclear.xls

# Entity folder
1. Go to the entity package: `src/mice/entity/__init__.py`.
2. Paste the code below into that `__init__.py` file.

In [36]:
project_name

'mice'

In [37]:
from dataclasses import dataclass
from pathlib import Path

In [39]:
#%%writefile -a src/{project_name}/entity/__init__.py


@dataclass(frozen=True)
class DataTransformationConfig:
    """Read-only settings handed to the data-transformation component."""

    # Directory the transformation stage writes its output files into.
    root_dir: Path
    # Location of the input data file the stage reads.
    data_path: Path

# Config folder

1. Copy the `get_data_transformation_config` method from the ConfigurationManager class below.
2. Go to the config package `src/mice/config/__init__.py` and paste the method into the ConfigurationManager class there.
3. Import DataTransformationConfig from mice.entity using this line  
from mice.entity import DataTransformationConfig

In [40]:
project_name

'mice'

In [7]:
from mice.constants import *
from mice.utils import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # read_yaml is a project helper; its result is used with attribute
        # access below (e.g. ``self.config.artifacts_root``).
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the loaded configuration.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataTransformationConfig with ``root_dir`` and ``data_path`` as
            ``Path`` objects, matching the dataclass's declared field types.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        # The YAML values are not Path objects; convert them so the config
        # honours its own ``Path`` annotations.
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
        )

Into file

In [41]:
# %%writefile -a src/{project_name}/config/__init__.py    
    
#     def get_data_transformation_config(self) -> DataTransformationConfig:
#         config = self.config.data_transformation

#         create_directories([config.root_dir])

#         data_transformation_config = DataTransformationConfig(
#             root_dir=config.root_dir,
#             data_path=config.data_path,
#         )

#         return data_transformation_config

Appending to src/mice/config/__init__.py


# Component folder

1. Copy the entire DataTransformation class.
2. Go to the components folder: `src/mice/components/data_transformation.py`.
3. Paste the DataTransformation class into `data_transformation.py`.
4. Import DataTransformationConfig from the entity package using this line:

from mice.entity import DataTransformationConfig

In [42]:
project_name 

'mice'

In [43]:
# %%writefile -a src/{project_name}/components/data_transformation.py
# from mice.entity import DataTransformationConfig

Appending to src/mice/components/data_transformation.py


In [45]:
#%%writefile -a src/{project_name}/components/data_transformation.py
import os
import pandas as pd
from mice import logger
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [46]:
#%%writefile -a src/{project_name}/components/data_transformation.py
class DataTransformation:
    """Turns the raw Excel workbook into scaled train/test CSV files.

    The steps, in the order ``transformation`` runs them: label-encode the
    target, drop the raw label columns, impute numeric NaNs with the median,
    drop highly correlated features, min-max scale, then split and save.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration (``root_dir`` and ``data_path`` are read later)."""
        self.config = config

    def replace_nan_num(self, dataset):
        """Fill missing values in numeric columns with the column median.

        Args:
            dataset: DataFrame to clean. It is modified in place.

        Returns:
            The same DataFrame object, with numeric NaNs replaced.
        """
        numerical_with_nan = [
            feature
            for feature in dataset.columns
            if pd.api.types.is_numeric_dtype(dataset[feature])
            and dataset[feature].isnull().any()
        ]

        for feature in numerical_with_nan:
            # Median rather than mean so that outliers do not skew the fill value.
            median_value = dataset[feature].median()
            # Assign the result back: ``dataset[feature].fillna(..., inplace=True)``
            # works on a temporary selection and is a no-op under pandas
            # Copy-on-Write (and warns on recent pandas 2.x).
            dataset[feature] = dataset[feature].fillna(median_value)

        logger.info("Replaceed missing dataset with median")
        return dataset

    def correlation(self, dataset, threshold):
        """Find columns that are highly correlated with an earlier column.

        Args:
            dataset: DataFrame of candidate features; non-numeric columns are ignored.
            threshold: A column is flagged when the absolute correlation with
                any column to its left is strictly greater than this value.

        Returns:
            Set of flagged column names. For each correlated pair only the
            later column is flagged, so dropping the set keeps the first one.
        """
        # Restrict to numeric columns explicitly: ``DataFrame.corr`` raises on
        # non-numeric data in pandas >= 2.0.
        corr_matrix = dataset.select_dtypes(include="number").corr()

        col_corr = set()
        for i, colname in enumerate(corr_matrix.columns):
            # Only the lower triangle (columns before i) is inspected.
            if (corr_matrix.iloc[i, :i].abs() > threshold).any():
                col_corr.add(colname)
        return col_corr

    def scaling(self, dataset):
        """Min-max scale every column except ``MouseID`` and ``cls``.

        Args:
            dataset: DataFrame containing ``MouseID``, ``cls`` and the features.

        Returns:
            New DataFrame with ``MouseID`` and ``cls`` first (index reset),
            followed by the scaled feature columns.
        """
        passthrough = ['MouseID', 'cls']
        scaling_feature = [feature for feature in dataset.columns if feature not in passthrough]

        # NOTE(review): the scaler is fitted on the full dataset before the
        # train/test split, so test rows influence the scaling range — confirm
        # whether this leakage is acceptable.
        scaler = MinMaxScaler()
        scaled_values = scaler.fit_transform(dataset[scaling_feature])

        data = pd.concat(
            [
                dataset[passthrough].reset_index(drop=True),
                pd.DataFrame(scaled_values, columns=scaling_feature),
            ],
            axis=1,
        )
        logger.info("Completed scaling dataset")
        return data

    def train_test_spliting(self, data, test_size=0.25, random_state=42):
        """Split ``data`` and write ``train.csv`` / ``test.csv`` into ``root_dir``.

        Args:
            data: DataFrame to split.
            test_size: Fraction of rows placed in the test set (0.25 matches
                the split the original call produced by default).
            random_state: Seed for the shuffle so repeated runs produce the
                same files; pass ``None`` for a different split on every run.

        Returns:
            Tuple ``(train, test)`` of the two DataFrames that were saved.
        """
        train, test = train_test_split(data, test_size=test_size, random_state=random_state)

        # Make sure the output directory exists even if the caller did not
        # go through ConfigurationManager.
        os.makedirs(self.config.root_dir, exist_ok=True)
        train.to_csv(os.path.join(self.config.root_dir, "train.csv"),index = False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"),index = False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)

        return train, test

    def transformation(self):
        """Run the whole transformation on the file at ``config.data_path``.

        Returns:
            Tuple ``(train, test)`` as returned by ``train_test_spliting``.
        """
        data = pd.read_excel(self.config.data_path)
        logger.info("Converted Excel data to DataFrame")

        # Encode the target: string labels in 'class' become integers in 'cls'.
        le = LabelEncoder()
        data['cls'] = le.fit_transform(data['class'])
        logger.info("Encoded the dependent variable ")

        # Drop the raw label column and the three categorical columns
        # (presumably the class label is derived from them — TODO confirm).
        data = data.drop(['Genotype', 'Treatment', 'Behavior', 'class'],axis=1)

        # Impute missing values in the numeric features.
        data = self.replace_nan_num(data)

        # Remove features that are highly correlated with another feature;
        # the identifier and the target are excluded from the check.
        corr_features = self.correlation(data.drop(['MouseID','cls'],axis=1), 0.9)
        data = data.drop(corr_features,axis=1)
        logger.info("Droped highly correlated features")

        # Scale the remaining features, then split and persist.
        data = self.scaling(data)
        return self.train_test_spliting(data)
        
        
        

Appending to src/mice/components/data_transformation.py


In [35]:
# Run the transformation end to end. On failure, record the traceback with the
# project logger and re-raise unchanged so the notebook still shows the error.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.transformation()
except Exception as e:
    logger.exception(e)
    raise

[2023-11-30 19:56:57,460] - mice - INFO - __init__ file - LineNum:32   - yaml file: config\config.yaml loaded successfully
[2023-11-30 19:56:57,465] - mice - INFO - __init__ file - LineNum:32   - yaml file: params.yaml loaded successfully
[2023-11-30 19:56:57,489] - mice - INFO - __init__ file - LineNum:32   - yaml file: schema.yaml loaded successfully
[2023-11-30 19:56:57,492] - mice - INFO - __init__ file - LineNum:52   - created directory at: artifacts
[2023-11-30 19:56:57,494] - mice - INFO - __init__ file - LineNum:52   - created directory at: artifacts/data_transformation
[2023-11-30 19:56:57,907] - mice - INFO - 1351739426 file - LineNum:63   - Converted Excel data to DataFrame
[2023-11-30 19:56:57,909] - mice - INFO - 1351739426 file - LineNum:67   - Encoded the dependent variable 
[2023-11-30 19:56:57,967] - mice - INFO - 1351739426 file - LineNum:20   - Replaceed missing dataset with median
[2023-11-30 19:56:58,120] - mice - INFO - 1351739426 file - LineNum:79   - Droped high

## into the file
pipeline/stage_03_data_transformation.py

In [47]:
%%writefile src/{project_name}/pipeline/stage_03_data_transformation.py
from mice.config import ConfigurationManager
from mice.components.data_transformation import DataTransformation


class DataTransformationTrainingPipeline:
    """Pipeline stage that runs the data-transformation component."""

    def __init__(self):
        # Nothing to prepare here; the configuration is resolved inside main().
        pass

    def main(self):
        """Resolve the stage configuration and execute the transformation."""
        stage_config = ConfigurationManager().get_data_transformation_config()
        DataTransformation(config=stage_config).transformation()

Overwriting src/mice/pipeline/stage_03_data_transformation.py


In [49]:
#%%writefile -a main.py
from mice.pipeline.stage_03_data_transformation import DataTransformationTrainingPipeline

STAGE_NAME = "Data Transformation stage"

# Run the stage between start/completed log markers. The pipeline object gets
# its own name so it does not rebind `data_transformation`, which holds the
# DataTransformation component earlier in this notebook.
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    data_transformation_pipeline = DataTransformationTrainingPipeline()
    data_transformation_pipeline.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    # Log the traceback, then re-raise the original exception unchanged.
    logger.exception(e)
    raise

Appending to main.py


In [50]:
# Run main.py with the same interpreter as this kernel (a bare "python" may
# resolve to a different environment) and raise CalledProcessError on a
# non-zero exit instead of silently returning the status code.
import subprocess
import sys

subprocess.run([sys.executable, "main.py"], check=True).returncode

0