In [1]:
# Show where the kernel starts; the recorded output shows it is the research/ folder.
import os
%pwd

'c:\\Users\\EI13136\\Documents\\mlops\\research'

In [2]:
os.chdir("../")
%pwd

'c:\\Users\\EI13136\\Documents\\mlops'

In [3]:
import pandas as pd

# Load the ingested CSV (path is relative to the current working directory)
# and print each column's dtype and non-null count.
df = pd.read_csv("artifacts/data_ingestion/airline_passenger_satisfaction.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 24 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   ID                                      129880 non-null  int64  
 1   Gender                                  129880 non-null  object 
 2   Age                                     129880 non-null  int64  
 3   Customer Type                           129880 non-null  object 
 4   Type of Travel                          129880 non-null  object 
 5   Class                                   129880 non-null  object 
 6   Flight Distance                         129880 non-null  int64  
 7   Departure Delay                         129880 non-null  int64  
 8   Arrival Delay                           129487 non-null  float64
 9   Departure and Arrival Time Convenience  129880 non-null  int64  
 10  Ease of Online Booking                  1298

In [4]:
# Keep a handle on the column index of the loaded frame.
col = df.columns

In [5]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,1,Male,48,First-time,Business,Business,821,2,5.0,3,...,3,5,2,5,5,5,3,5,5,Neutral or Dissatisfied
1,2,Female,35,Returning,Business,Business,821,26,39.0,2,...,5,4,5,5,3,5,2,5,5,Satisfied


In [6]:
# Build a {column name: dtype} mapping, then spot-check that a dtype can be
# compared directly against a type name given as a plain string.
ls = df.dtypes.to_dict()
ls['Gender']== 'float64'

False

In [7]:
# Count the missing values in each column.
df.isna().sum()

ID                                          0
Gender                                      0
Age                                         0
Customer Type                               0
Type of Travel                              0
Class                                       0
Flight Distance                             0
Departure Delay                             0
Arrival Delay                             393
Departure and Arrival Time Convenience      0
Ease of Online Booking                      0
Check-in Service                            0
Online Boarding                             0
Gate Location                               0
On-board Service                            0
Seat Comfort                                0
Leg Room Service                            0
Cleanliness                                 0
Food and Drink                              0
In-flight Service                           0
In-flight Wifi Service                      0
In-flight Entertainment           

In [8]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation stage."""

    # Directory that is created to hold this stage's outputs.
    root_dir: Path
    # File that receives the validation verdict.
    status_file: str
    # CSV file that is read and checked against the schema.
    unzip_data_dir: Path
    # Expected columns, taken from the COLUMNS section of the schema file.
    all_schema: dict

In [9]:
from airline_passenger_satisfaction.constants import *
from airline_passenger_satisfaction.utils.common import read_yaml, create_directories

In [10]:
class ConfigurationManager:
    """Load the project's YAML settings and hand out per-stage config objects."""

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every stage writes beneath this directory, so make sure it exists first.
        create_directories([self.config.artifacts_root])

    def get_data_validation(self) -> DataValidationConfig:
        """Build the settings object for the data-validation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataValidationConfig filled from the ``data_validation`` section
            of the config file and the ``COLUMNS`` section of the schema file.
        """
        stage_settings = self.config.data_validation
        expected_columns = self.schema.COLUMNS

        create_directories([stage_settings.root_dir])

        return DataValidationConfig(
            root_dir=stage_settings.root_dir,
            status_file=stage_settings.status_file,
            unzip_data_dir=stage_settings.unzip_data_dir,
            all_schema=expected_columns,
        )

In [11]:
import os
from airline_passenger_satisfaction.logger import logger

In [12]:
import sys
import pandas as pd
from airline_passenger_satisfaction.exception import CustomException

class DataValidation:
    """Check an ingested CSV against the expected column schema.

    The verdict is written to ``config.status_file`` and also returned.
    """

    def __init__(self, config: "DataValidationConfig") -> None:
        """Store the validation settings.

        Args:
            config: Supplies ``unzip_data_dir`` (the CSV to check),
                ``status_file`` (where the verdict is written) and
                ``all_schema`` (mapping of column name to expected dtype).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Verify that every schema column exists in the CSV with the expected dtype.

        Each problem is logged as a warning. Columns present in the CSV but
        absent from the schema are not reported.

        Returns:
            True when every schema column is present with a matching dtype,
            otherwise False. The same verdict is written to the status file.

        Raises:
            CustomException: Wraps any error raised while reading the CSV,
                comparing dtypes or writing the status file.
        """
        try:
            data = pd.read_csv(self.config.unzip_data_dir)
            actual_dtypes = data.dtypes.to_dict()
            expected_schema = self.config.all_schema

            validation_status = True

            with open(self.config.status_file, "w") as f:
                for col, expected_dtype in expected_schema.items():
                    if col not in actual_dtypes:
                        logger.warning(f"Column '{col}' was not found in the dataset!")
                        validation_status = False
                        continue

                    actual_dtype = actual_dtypes[col]
                    if actual_dtype != expected_dtype:
                        # "Expected" is what the schema declares; "Found" is
                        # what pandas inferred from the file.
                        logger.warning(
                            f"Column '{col}': Expected type {expected_dtype}, Found type {actual_dtype}"
                        )
                        validation_status = False

                f.write(f"validation status: {validation_status}")

            return validation_status

        except Exception as e:
            raise CustomException(e, sys)


In [13]:
# Run the validation stage end to end: load settings, build the validator,
# then check the ingested CSV against the schema.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validate_all_columns()
except CustomException:
    # validate_all_columns already wraps its own failures; re-raise untouched
    # instead of nesting one CustomException inside another.
    raise
except Exception as e:
    raise CustomException(e, sys)

[2024-02-07 15:38:44,775] [INFO] [Airline Passenger Reviews Logger] [common] : yaml file config\config.yaml loaded successfully
[2024-02-07 15:38:44,777] [INFO] [Airline Passenger Reviews Logger] [common] : yaml file params.yaml loaded successfully
[2024-02-07 15:38:44,780] [INFO] [Airline Passenger Reviews Logger] [common] : yaml file schema.yaml loaded successfully
[2024-02-07 15:38:44,781] [INFO] [Airline Passenger Reviews Logger] [common] : Created directory at : artifacts
[2024-02-07 15:38:44,782] [INFO] [Airline Passenger Reviews Logger] [common] : Created directory at : artifacts/data_validation
