In [1]:
# Step 19 continued: 
# We need to first go back one folder so we will use the same code as is present in the data_ingestion.ipynb file. 

In [2]:
%pwd

'c:\\Users\\RadhikaMaheshwari\\Desktop\\Test\\DeepLearning\\ETE\\research'

In [3]:
import os 
os.chdir('../')

In [4]:
%pwd

'c:\\Users\\RadhikaMaheshwari\\Desktop\\Test\\DeepLearning\\ETE'

In [5]:
# step 19 completed.

In [6]:
# Step 20: We first need to update config.yaml file. 
# Here we will need to mention data validation related configuration
# In config.yaml we add a key called data_validation. Under it we set the root directory where the outputs of the
# data validation stage are stored, i.e. a data_validation folder inside the artifacts folder.
# Next we define the path to the unzipped data file that was produced during the data ingestion phase.
# Then we define the path to the status.txt file, which records the validation result.
# When the data passes validation we write True to status.txt, otherwise False. Only if it is True do we continue
# with the next stage of the pipeline.

In [7]:
# Once the above is done then Step 20 is completed. 

In [8]:
# Step 21: We need to define entity
from dataclasses import dataclass 
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings consumed by the data validation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """
    # Directory in which the data validation stage keeps its outputs.
    root_dir: Path
    # Path of the text file the validation checks write their status line to.
    STATUS_FILE: str
    # Path handed to pd.read_csv by the validation checks; despite the "dir"
    # in the name it is used as the path of a CSV file.
    unzip_data_dir: Path 
    # Mapping of column name -> expected dtype, taken from the schema file.
    all_schema: dict 

# all_schema holds the column definitions read from the schema file, so we pass it in as a field too. Because the
# schema file is a .yaml file, its contents are loaded as key/value pairs, hence the field is typed as a dictionary.


In [9]:
# Step 21 Completed

In [11]:
# Step 22: Defining Configuration manager
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories
class ConfigurationManager:
    """Loads the project's YAML files and hands out per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML file.
            schema_filepath: Location of the schema YAML file.
            params_filepath: Location of the parameters YAML file.
        """
        # The files are read in the order config -> params -> schema.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Make sure the top-level artifacts directory exists before any stage runs.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data validation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataValidationConfig filled from the ``data_validation`` section
            of the config file and the ``COLUMNS`` section of the schema file.
            Values are passed through exactly as read from the YAML files.
        """
        validation_section = self.config.data_validation
        column_schema = self.schema.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            unzip_data_dir=validation_section.unzip_data_dir,
            all_schema=column_schema,
        )



In [12]:
# Step 22 (configuration manager) completed. In the above step we have only grabbed the details that will be required during the data validation phase.

In [86]:
# Step 22: Now we need to create data validation component
import os 
from mlProject import logger 
import pandas as pd

class DataValidation:
    """Checks the ingested CSV file against the expected column schema.

    Each check reads the CSV at ``config.unzip_data_dir``, computes a single
    True/False verdict over *all* columns, writes one status line to
    ``config.STATUS_FILE`` and returns the verdict. The status file is opened
    in write mode, so each check replaces whatever the file held before.
    """

    def __init__(self, config: "DataValidationConfig"):
        """Store the configuration object.

        Args:
            config: Object exposing ``unzip_data_dir`` (CSV path),
                ``STATUS_FILE`` (status file path) and ``all_schema``
                (mapping of column name -> expected dtype).
        """
        self.config = config

    def _write_status(self, message: str) -> None:
        """Overwrite the status file with a single status line."""
        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(message)

    def validate_all_columns(self) -> bool:
        """Check that every column of the CSV is declared in the schema.

        Returns:
            True if all columns found in the data are keys of the schema,
            False as soon as at least one column is unknown.

        Raises:
            Any exception raised while reading the CSV or writing the status
            file is propagated unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_dir)
        known_columns = self.config.all_schema.keys()

        # Collect every offending column instead of letting the last column
        # examined decide the overall result.
        unexpected = [col for col in data.columns if col not in known_columns]
        validation_status = not unexpected

        # Write once, after the verdict is final.
        self._write_status(f"Validation status: {validation_status} \n")
        return validation_status

    def validate_all_data_type(self) -> bool:
        """Check that every column of the CSV has the dtype the schema expects.

        A column that is missing from the schema has no expected dtype and
        therefore fails the check.

        Returns:
            True only if every column in the data is declared in the schema
            and its pandas dtype equals the declared one; False otherwise.

        Raises:
            Any exception raised while reading the CSV or writing the status
            file is propagated unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_dir)
        all_schema = self.config.all_schema
        dtypes = data.dtypes

        # Every column must match; a single matching column is not enough,
        # and a mismatch must produce an explicit False rather than None.
        validation_status = all(
            col in all_schema and bool(dtypes[col] == all_schema[col])
            for col in data.columns
        )

        self._write_status(f'Validation status from dtypes function: {validation_status} \n')
        return validation_status

In [87]:
# Step 22: component preparation completed.

In [88]:
# Step 23: Now we need to write the code for pipeline
# Run the data validation stage end to end: load the configuration, build the
# validation component from it, then run the column check followed by the
# dtype check. Any exception raised along the way propagates to the notebook
# unchanged, so no try/except wrapper is needed.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(data_validation_config)
data_validation.validate_all_columns()
data_validation.validate_all_data_type()

[2024-05-22 22:52:10,557: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-05-22 22:52:10,568: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-22 22:52:10,583: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-05-22 22:52:10,585: INFO: common: Created Directory at: artifacts]
[2024-05-22 22:52:10,586: INFO: common: Created Directory at: artifacts/data_validation]


In [90]:
# Step 23 Completed
#Now we just need to convert this notebook into modular components

In [2]:
# Step 24: Data Transformation
# Here we are just going to perform train test split. 
# Here we can also do PCA, EDA, Feature Engineering. 
# Our dataset is already clean so we are only performing the train test split. 
# In research folder we will create a file called 03_data_transformation.ipynb
#Step 24: completed