In [1]:
import os

In [2]:
%pwd

'd:\\end to end mental_health_prediction\\research'

In [3]:
# Step up from the notebook folder ("research") to the project root so the
# relative paths used by later cells (e.g. artifacts/...) resolve.
# Guarded on the folder name so that re-running this cell does not keep
# climbing one directory higher on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'd:\\end to end mental_health_prediction'

In [5]:
import pandas as pd

# Quick look at the ingested survey data. The path is relative to the project
# root and uses forward slashes, which work on Windows as well as POSIX
# systems (a backslash-separated path only resolves on Windows).
df = pd.read_csv("artifacts/data_ingestion/unzipped_data/survey.csv")

df.head()

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  1259 non-null   object
 1   Age                        1259 non-null   int64 
 2   Gender                     1259 non-null   object
 3   Country                    1259 non-null   object
 4   state                      744 non-null    object
 5   self_employed              1241 non-null   object
 6   family_history             1259 non-null   object
 7   treatment                  1259 non-null   object
 8   work_interfere             995 non-null    object
 9   no_employees               1259 non-null   object
 10  remote_work                1259 non-null   object
 11  tech_company               1259 non-null   object
 12  benefits                   1259 non-null   object
 13  care_options               1259 non-null   object
 14  wellness

In [7]:
from pathlib import Path
from dataclasses import dataclass

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the data-validation stage.

    Instances are built by ``ConfigurationManager.get_data_validation_config``
    and consumed by ``DataValidation``. ``frozen=True`` makes the fields
    read-only after construction.
    """
    # Directory for this stage; the configuration manager creates it before
    # building this object.
    root_dir: Path
    # Path of the text file that validate_all_columns() overwrites with the
    # validation verdict.
    STATUS_FILE: str
    # Folder in which validate_all_columns() looks for "survey.csv".
    unzip_data_dir: Path
    # Mapping whose keys are the expected column names (taken from the
    # schema's COLUMNS section). Only the keys are used by the validator.
    all_schema: dict

In [8]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml,create_directories

In [9]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects.

    On construction the config, params and schema YAML files are read and the
    top-level artifacts directory named in the config is created.

    Args:
        config_filepath: Location of the main config YAML (str or Path).
        params_filepath: Location of the params YAML (str or Path).
        schema_filepath: Location of the schema YAML (str or Path).
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH
    ):
        # Coerce to Path so read_yaml always receives the same type, even when
        # a caller (or the constants module) supplies plain strings.
        self.config = read_yaml(Path(config_filepath))
        self.params = read_yaml(Path(params_filepath))
        self.schema = read_yaml(Path(schema_filepath))

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            DataValidationConfig populated from the ``data_validation`` section
            of the config file and the ``COLUMNS`` section of the schema file.
        """
        stage_cfg = self.config.data_validation
        expected_columns = self.schema.COLUMNS

        create_directories([stage_cfg.root_dir])

        # DataValidationConfig declares root_dir and unzip_data_dir as Path, so
        # convert the raw config values instead of passing them through as-is.
        return DataValidationConfig(
            root_dir=Path(stage_cfg.root_dir),
            STATUS_FILE=stage_cfg.STATUS_FILE,
            unzip_data_dir=Path(stage_cfg.unzip_data_dir),
            all_schema=expected_columns,
        )

In [10]:
import os
from mlProject import logger

In [11]:
import os
import pandas as pd
from mlProject import logger

class DataValidation:
    """Checks that the ingested survey CSV has exactly the columns the schema lists.

    The verdict is returned to the caller and also written to
    ``config.STATUS_FILE``.
    """

    # The annotation is quoted so this class can be defined even if the cell
    # that declares DataValidationConfig has not been executed yet.
    def __init__(self, config: "DataValidationConfig"):
        self.config = config

    def _write_status(self, validation_status: bool, details=()) -> None:
        """Overwrite the status file with the verdict followed by detail lines."""
        with open(self.config.STATUS_FILE, "w", encoding="utf-8") as f:
            f.write(f"Validation status: {validation_status}\n")
            for line in details:
                f.write(f"{line}\n")

    def validate_all_columns(self) -> bool:
        """Compare the CSV's column names with the schema's keys.

        Only column names are checked; dtypes are not inspected.

        Returns:
            True when the CSV has no missing and no extra columns relative to
            ``config.all_schema``; False otherwise.

        Raises:
            Exception: Any error raised while reading the CSV or writing the
                status file is logged and re-raised. Before re-raising, the
                status file is set to a False verdict (best effort) so that a
                verdict left over from an earlier run cannot be mistaken for
                the result of this one.
        """
        try:
            data_path = os.path.join(self.config.unzip_data_dir, "survey.csv")
            logger.info(f" Reading data from: {data_path}")

            # Only the header is needed, so skip parsing the data rows.
            header = pd.read_csv(data_path, nrows=0)

            actual_columns = set(header.columns)
            expected_columns = set(self.config.all_schema.keys())

            # Sorted lists (not sets) keep the status file identical from run
            # to run; key=str guards against non-string column labels.
            missing_in_data = sorted(expected_columns - actual_columns, key=str)
            extra_in_data = sorted(actual_columns - expected_columns, key=str)

            validation_status = not (missing_in_data or extra_in_data)

            details = []
            if missing_in_data:
                details.append(f"Missing columns in data: {missing_in_data}")
            if extra_in_data:
                details.append(f"Extra columns in data: {extra_in_data}")
            self._write_status(validation_status, details)

            logger.info(f"Validation completed. Status: {validation_status}")
            return validation_status

        except Exception as e:
            logger.exception("Exception occurred during validation")
            # Best effort: never leave a stale "True" verdict behind.
            try:
                self._write_status(False, [f"Error: {e!r}"])
            except OSError:
                logger.warning("Could not update the status file after the failure")
            raise


In [12]:
# Run the data-validation stage end to end: load the YAML configuration,
# build the stage settings, then check the CSV's columns against the schema.
# Exceptions propagate on their own; wrapping these calls in a try/except
# that only re-raises added nothing but an extra traceback frame.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
# Last expression of the cell, so the boolean verdict is displayed as output.
data_validation.validate_all_columns()

[2025-07-20 21:05:41,488: INFO: common: yaml file: D:\end to end mental_health_prediction\config.yaml loaded successfully]
[2025-07-20 21:05:41,488: INFO: common: yaml file: D:\end to end mental_health_prediction\params.yaml loaded successfully]
[2025-07-20 21:05:41,502: INFO: common: yaml file: D:\end to end mental_health_prediction\schema.yaml loaded successfully]
[2025-07-20 21:05:41,504: INFO: common: created directory at: artifacts]
[2025-07-20 21:05:41,505: INFO: common: created directory at: artifacts/data_validation]
[2025-07-20 21:05:41,505: INFO: 425775790:  Reading data from: artifacts/data_ingestion/unzipped_data\survey.csv]
[2025-07-20 21:05:41,511: INFO: 425775790: Validation completed. Status: True]
