In [1]:
import os

# Change directory to the project root (the parent of the notebook's folder).
# The starting directory is remembered the first time this cell runs, so
# re-executing the cell resolves to the same parent folder instead of climbing
# one level higher on every run.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))
print(os.getcwd())  # Confirm you are now in the main project folder

c:\Users\user\Desktop\Customer_Satisfaction_Prediction_to_Production


In [2]:
# Data Validation Config
# ----------------------------------
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of paths used by the data-validation stage.

    The dataclass is frozen, so attributes cannot be reassigned once an
    instance has been constructed.
    """

    # Directory the validation stage writes its output files into.
    root_dir: Path
    # Directory that is scanned for the raw input CSV files.
    raw_data_dir: Path
    # Path taken from the ``validated_data_file`` configuration entry.
    validated_data_file: Path


In [3]:
from customerSatisfaction.constants import *
from customerSatisfaction.utils.common import read_yaml, create_directories

In [4]:
#5. Update the configuration manager in src config


class ConfigurationManager:
    """Loads the project YAML files and builds per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the YAML file loaded into ``self.config``.
            params_filepath: Path of the YAML file loaded into ``self.params``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Assemble the settings object for the data-validation stage.

        The stage's output directory (``data_validation.root_dir``) is created
        as a side effect. The raw-data directory is taken from the
        ``unzip_dir`` entry of the ``data_ingestion`` section.

        Returns:
            A DataValidationConfig whose ``root_dir``, ``raw_data_dir`` and
            ``validated_data_file`` come from the loaded configuration.
        """
        validation_section = self.config['data_validation']
        output_dir = validation_section['root_dir']

        # Make sure the stage's output folder exists before resolving the rest.
        create_directories([output_dir])

        raw_dir = Path(self.config['data_ingestion']['unzip_dir'])
        return DataValidationConfig(
            root_dir=Path(output_dir),
            raw_data_dir=raw_dir,
            validated_data_file=Path(validation_section['validated_data_file']),
        )

In [5]:
import os
import zipfile
import gdown
from customerSatisfaction import logger
from customerSatisfaction.utils.common import get_size

In [6]:
import os
import pandas as pd
from customerSatisfaction import logger
from customerSatisfaction.entity.config_entity import DataValidationConfig


class DataValidation:
    """Checks every raw CSV file and writes a copy to the validation folder."""

    def __init__(self, config: DataValidationConfig):
        """Store the stage configuration.

        Args:
            config: Supplies ``raw_data_dir`` (folder that is scanned for CSV
                files) and ``root_dir`` (folder the copies are written to).
        """
        self.config = config

    def validate_data(self):
        """
        Validate all CSV files and save the validated copies.

        Every file in ``config.raw_data_dir`` whose name ends with ``.csv`` is
        loaded with pandas, rejected if it contains no rows, and otherwise
        re-written under the same name in ``config.root_dir`` without the
        DataFrame index. Files are processed in sorted name order.

        Raises:
            ValueError: If a CSV file is completely empty (zero bytes / no
                columns) or has a header but no data rows.
        """
        os.makedirs(self.config.root_dir, exist_ok=True)

        # os.listdir returns entries in arbitrary order; sorting makes the
        # processing order (and the first reported failure) deterministic.
        csv_names = sorted(
            name
            for name in os.listdir(self.config.raw_data_dir)
            if name.endswith(".csv")
        )
        if not csv_names:
            # Nothing to validate is suspicious for this stage; make it visible.
            logger.warning(f"No CSV files found in {self.config.raw_data_dir}")

        for file_name in csv_names:
            file_path = os.path.join(self.config.raw_data_dir, file_name)

            logger.info(f"Validating file: {file_name}")
            try:
                df = pd.read_csv(file_path)
            except pd.errors.EmptyDataError as exc:
                # A zero-byte file makes read_csv raise instead of returning an
                # empty frame; report it with the same message as below.
                raise ValueError(f"{file_name} is empty") from exc

            if df.empty:
                raise ValueError(f"{file_name} is empty")

            validated_path = os.path.join(self.config.root_dir, file_name)
            df.to_csv(validated_path, index=False)

        logger.info("Data validation completed successfully")


In [7]:
#7. Update the pipeline 

try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validate_data()
except Exception:
    # Record the failure (with traceback) in the project log, then re-raise
    # the original exception; a bare `raise` keeps its traceback untouched.
    logger.exception("Data validation stage failed")
    raise

[2026-01-14 18:16:52,922: INFO: common: YAML file config\config.yaml loaded successfully]
[2026-01-14 18:16:52,934: INFO: common: YAML file params.yaml loaded successfully]
[2026-01-14 18:16:52,939: INFO: common: Created directory at: artifacts]
[2026-01-14 18:16:52,942: INFO: common: Created directory at: artifacts/data_validation]
[2026-01-14 18:16:52,947: INFO: 2094464771: Validating file: olist_customers_dataset.csv]
[2026-01-14 18:16:56,249: INFO: 2094464771: Validating file: olist_geolocation_dataset.csv]
[2026-01-14 18:17:18,801: INFO: 2094464771: Validating file: olist_orders_dataset.csv]
[2026-01-14 18:17:25,695: INFO: 2094464771: Validating file: olist_order_items_dataset.csv]
[2026-01-14 18:17:29,963: INFO: 2094464771: Validating file: olist_order_payments_dataset.csv]
[2026-01-14 18:17:32,499: INFO: 2094464771: Validating file: olist_order_reviews_dataset.csv]
[2026-01-14 18:17:37,515: INFO: 2094464771: Validating file: olist_products_dataset.csv]
[2026-01-14 18:17:38,939: 

In [9]:
import pandas as pd

# Spot-check the payments file written by the validation stage. The frame is
# named after the dataset it holds so it is not confused with the orders data.
df_payments = pd.read_csv(
    "artifacts/data_validation/olist_order_payments_dataset.csv"
)

df_payments.head()


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


In [10]:
import pandas as pd

# Preview the first rows of the orders file written by the validation stage.
df_orders = pd.read_csv("artifacts/data_validation/olist_orders_dataset.csv")
df_orders.head()


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00
