In [1]:
import os
# Step one directory up so that relative paths used by later cells resolve
# from the parent folder of wherever the notebook was started.
# NOTE(review): this is not idempotent — re-running the cell without a kernel
# restart moves up one more level each time; confirm the intended start dir.
os.chdir("../")
# IPython magic: display the working directory after the change.
%pwd

'/Users/tapankheni/Data_Science/Data Science Projects/Wafers_Fault_Prediction'

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings consumed by the data-ingestion stage.

    Attributes:
        root_dir: Directory created for the stage before ingestion runs.
        local_data_file: Location of the CSV file the exported data is
            written to.
    """

    root_dir: Path
    local_data_file: Path

In [3]:
from WafersFault.constants import CONFIG_YAML_FILE_PATH, PARAMS_YAML_FILE_PATH, SCHEMA_YAML_FILE_PATH, MONGO_DB_URL, MONGO_DB_COLLECTION_NAME, MONGO_DB_DATABASE_NAME
from WafersFault.utils.common import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(self,
                 config_filepath=CONFIG_YAML_FILE_PATH,
                 params_filepath=PARAMS_YAML_FILE_PATH,
                 schema_filepath=SCHEMA_YAML_FILE_PATH):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Creates the stage's root directory as a side effect.

        Returns:
            A ``DataIngestionConfig`` whose fields are ``Path`` objects.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        # DataIngestionConfig declares both fields as Path, while values read
        # from the YAML config are presumably plain strings; convert so the
        # object matches its declared types (Path(Path) is a no-op).
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            local_data_file=Path(section.local_data_file),
        )

In [5]:
import pandas as pd
import numpy as np
import sys
from pymongo import MongoClient
from WafersFault import logger
from WafersFault.utils.common import get_size

In [7]:
class DataIngestion:
    """Exports a MongoDB collection to a local CSV file."""

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings.

        Args:
            config: Supplies ``root_dir`` and ``local_data_file``.
        """
        self.config = config

    def export_collection_as_dataframe(self) -> pd.DataFrame:
        """Fetch every document of the configured collection as a DataFrame.

        The ``_id`` column is dropped when present and the literal string
        ``"na"`` is replaced by ``NaN``.

        Returns:
            The collection contents; an empty frame if the collection is empty.
        """
        logger.info("retrieving data from mongoDB as dataframe")

        # The context manager closes the client (and its connections) even if
        # the query fails; the cursor is fully materialised before leaving it.
        with MongoClient(MONGO_DB_URL) as mongo_client:
            collection = mongo_client[MONGO_DB_DATABASE_NAME][MONGO_DB_COLLECTION_NAME]
            records = list(collection.find())

        df = pd.DataFrame(records)

        logger.info(df.columns)

        if "_id" in df.columns:
            df = df.drop(columns=["_id"])

        logger.info(df.columns)

        df = df.replace({"na": np.nan})

        logger.info("data retrieval completed")

        return df

    def export_data_into_feature_store_file_path(self) -> None:
        """Write the collection to ``local_data_file`` unless it already exists.

        The existence check happens before MongoDB is contacted, so an
        already-exported file costs no database round trip.
        """
        logger.info("Exporting data from mongoDB to store it into the desired artifacts")

        if os.path.exists(self.config.local_data_file):
            logger.info(f"file already exists of size: {get_size(Path(self.config.local_data_file))}")
            return

        sensor_data = self.export_collection_as_dataframe()
        logger.info(f"saving exported data into feature store file path: {self.config.root_dir}")

        sensor_data.to_csv(self.config.local_data_file, index=False)

        logger.info(f"exported data from mongoDB stored at {self.config.local_data_file}")

    def initiate_data_ingestion(self) -> None:
        """Run the ingestion stage and log its start and completion."""
        logger.info("data ingestion component initiated")

        self.export_data_into_feature_store_file_path()

        logger.info("data ingestion component completed")



In [9]:
# Run the data-ingestion stage end to end: load configuration, build the
# stage settings, then export the data.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    obj = DataIngestion(config=data_ingestion_config)
    obj.initiate_data_ingestion()
except Exception as e:
    # Record the failure (with traceback) through the project logger, then
    # re-raise the same exception unchanged so the cell still fails visibly.
    logger.exception(e)
    raise

[2024-03-12 05:45:07,270: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-03-12 05:45:07,272: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-12 05:45:07,308: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-03-12 05:45:07,313: INFO: common: created directory at: artifacts]
[2024-03-12 05:45:07,313: INFO: common: created directory at: artifacts/data_ingestion]
[2024-03-12 05:45:07,314: INFO: 2183419248: data ingestion component initiated]
[2024-03-12 05:45:07,314: INFO: 2183419248: Exporting data from mongoDB to store it into the desired artifacts]
[2024-03-12 05:45:07,314: INFO: 2183419248: retrieving data from mongoDB as dataframe]
[2024-03-12 05:45:09,437: INFO: 2183419248: Index(['_id', 'Sensor-1', 'Sensor-2', 'Sensor-3', 'Sensor-4', 'Sensor-5',
       'Sensor-6', 'Sensor-7', 'Sensor-8', 'Sensor-9',
       ...
       'Sensor-582', 'Sensor-583', 'Sensor-584', 'Sensor-585', 'Sensor-586',
       'Sensor-587', 'Sensor-588', 'Sens