In [1]:
import os 


In [2]:
%pwd

'c:\\Users\\DELL\\OneDrive\\Desktop\\Projects\\Projects\\Machine learning\\Red_wine_project\\research'

In [3]:
# The notebook starts in <project>/research (see the %pwd output above); move up
# to the project root so relative paths used later resolve from there.
# Guarded so that re-running this cell does not climb one more directory each
# time. NOTE(review): assumes the notebook folder is named "research" — confirm.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [4]:
%pwd

'c:\\Users\\DELL\\OneDrive\\Desktop\\Projects\\Projects\\Machine learning\\Red_wine_project'

1️⃣ What Is a dataclass?

A dataclass is a class decorated with `@dataclass` (from Python's standard-library `dataclasses` module). It is used to store structured data, such as configuration values, without writing boilerplate code.

Instead of writing these by hand:

- the `__init__` method
- the attribute assignments inside `__init__`
- the `__repr__` method

Python generates them for you automatically.

2️⃣ Why Do We Use dataclass in ML Projects?
Without dataclass ❌ (messy)
class DataIngestionConfig:
    def __init__(self):
        self.root_dir = "artifact/data_ingestion"
        self.source_URL = "https://..."
        self.local_data_file = "artifact/data_ingestion/data.zip"

In [5]:
from dataclasses import dataclass
from pathlib import Path

# The @dataclass decorator generates __init__, a readable __repr__ and __eq__
# from the annotated fields below.
#
# frozen=True makes every instance immutable: assigning to a field after
# construction raises an error, so the pipeline configuration stays
# consistent for the whole run.
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    Attributes:
        root_dir: Directory where all data-ingestion artifacts are stored,
            e.g. ``artifact/data_ingestion/``.
        source_URL: Public URL the dataset is downloaded from. Should be a
            direct downloadable link (GitHub raw, S3, etc.).
        local_data_file: Local path where the downloaded ZIP file is saved,
            e.g. ``artifact/data_ingestion/data.zip``.
        unzip_dir: Directory the ZIP file is extracted into; the final CSV
            files are available here.
    """

    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path


In [6]:
# Import all constant variables defined in the constants module
# These usually include fixed paths like:
# CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
# Keeping them in one place avoids hardcoding paths across the project
from Red_Wine_Prediction.constants import *

# Import reusable utility functions
# read_yaml       → reads YAML files (config.yaml, params.yaml, schema.yaml)
# create_directories → safely creates required directories for artifacts
# These utilities are shared across multiple pipeline stages
from Red_Wine_Prediction.utils.common import read_yaml, create_directories


In [7]:
class ConfigurationManager:
    """
    Single access point for the project's YAML settings.

    On construction it reads the config, params and schema YAML files and
    makes sure the artifacts root directory exists. Afterwards it hands out
    stage-specific configuration dataclasses.
    """

    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH,
        schema_filepath: Path = SCHEMA_FILE_PATH,
    ) -> None:
        """
        Load the three YAML files and prepare the artifacts root.

        Args:
            config_filepath (Path): Location of config.yaml (pipeline paths & stages)
            params_filepath (Path): Location of params.yaml (model hyperparameters)
            schema_filepath (Path): Location of schema.yaml (data validation rules)
        """

        # Each file is parsed exactly once, in the order config -> params ->
        # schema, and kept on the instance so later stages can reuse it.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )

        # Every pipeline output lives below this folder, so create it up front.
        # (The loaded config is accessed by attribute; presumably read_yaml
        # returns an attribute-style mapping — confirm in utils.common.)
        artifacts_root = Path(self.config.artifacts_root)
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the configuration object for the data-ingestion stage.

        Reads the ``data_ingestion`` section of the loaded config, creates
        that stage's root directory, and wraps the path-like values in
        ``Path`` objects.

        Returns:
            DataIngestionConfig: Frozen config for the data ingestion stage
        """

        ingestion = self.config.data_ingestion

        # The stage's root directory must exist before anything is written to it.
        ingestion_root = Path(ingestion.root_dir)
        create_directories([ingestion_root])

        # source_URL stays a plain string; everything else becomes a Path.
        return DataIngestionConfig(
            root_dir=ingestion_root,
            source_URL=ingestion.source_URL,
            local_data_file=Path(ingestion.local_data_file),
            unzip_dir=Path(ingestion.unzip_dir),
        )


In [8]:
import sys
# Provides access to system-specific parameters and functions
# Used mainly for error handling or exiting the program safely

import urllib.request as request
# Used to download files from a URL
# In this project, it downloads the dataset ZIP file during data ingestion

import zipfile
# Used to extract ZIP files
# After downloading the dataset, this module unzips the contents

from Red_Wine_Prediction import logger
# Project-wide logger instance
# Used for consistent logging across all pipeline stages

from Red_Wine_Prediction.utils.common import get_size
# Utility function to get file size in KB
# Used to log dataset size after downloading


In [9]:
class DataIngestion:
    """
    DataIngestion handles:
    - Downloading the raw dataset from a remote source
    - Extracting the downloaded ZIP file into the artifact directory
    """

    def __init__(self, config: DataIngestionConfig) -> None:
        """
        Initializes the DataIngestion class.

        Args:
            config (DataIngestionConfig): Configuration object containing
                                          paths and source URL for ingestion
        """
        # Store the ingestion configuration
        self.config = config

    def download_file(self) -> str:
        """
        Downloads the dataset ZIP file from the source URL.

        - If the file already exists locally, the download is skipped and the
          existing file's size is logged.
        - Otherwise the archive is first written to a sibling ``<name>.part``
          file and only renamed to ``config.local_data_file`` once the
          transfer has finished. An interrupted download therefore never
          leaves a truncated file that a later run would mistake for a
          complete one.

        Returns:
            str: Path of the local ZIP file (freshly downloaded or already present).

        Raises:
            Any exception raised by ``urllib.request.urlretrieve`` (for example
            ``urllib.error.URLError``) or by the final rename; the partial
            file is removed before the exception propagates.
        """
        target = Path(self.config.local_data_file)

        # Nothing to do when the archive is already on disk.
        if target.exists():
            logger.info(
                f"File already exists of size: {get_size(target)}"
            )
            return str(target)

        # Make sure the destination folder exists even if this method is used
        # without ConfigurationManager having created it.
        target.parent.mkdir(parents=True, exist_ok=True)

        partial = target.with_name(target.name + ".part")
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=partial,
            )
            # Publish the file under its final name only after a full download.
            partial.replace(target)
        except BaseException:
            # Also covers KeyboardInterrupt (kernel interrupt): drop the
            # incomplete file, then let the original error propagate.
            if partial.exists():
                partial.unlink()
            raise

        # Log download success and HTTP response headers
        logger.info(f"{target} downloaded! with following info: \n{headers}")
        return str(target)

    def extract_zip_file(self) -> None:
        """
        Extracts the downloaded ZIP file into the specified directory.

        - Creates the unzip directory if it does not exist
        - Extracts all files from the ZIP

        Raises:
            FileNotFoundError: If ``config.local_data_file`` does not exist.
            zipfile.BadZipFile: If the file is not a valid ZIP archive.
        """

        # Path where the ZIP contents will be extracted
        unzip_path = self.config.unzip_dir

        # Ensure the extraction directory exists
        os.makedirs(unzip_path, exist_ok=True)

        # Open and extract the ZIP file
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

        # Log successful extraction
        logger.info(f"File extracted to {unzip_path}")


In [10]:
try:
    # Initialize ConfigurationManager
    # This loads all YAML files (config.yaml, params.yaml, schema.yaml)
    # and prepares the root artifact directory
    config = ConfigurationManager()

    # Get the configuration specific to the Data Ingestion stage
    # This returns a DataIngestionConfig dataclass
    data_ingestion_config = config.get_data_ingestion_config()

    # Initialize the DataIngestion component with its configuration
    data_ingestion = DataIngestion(config=data_ingestion_config)

    # Download the dataset ZIP file (if not already present)
    data_ingestion.download_file()

    # Extract the downloaded ZIP file into the artifact directory
    data_ingestion.extract_zip_file()

except Exception:
    # Write the failure, including its traceback, to the project log, then
    # re-raise. A bare `raise` propagates the original exception without
    # adding an extra re-raise frame to the traceback.
    logger.exception("Data ingestion stage failed")
    raise


[2026-01-02 21:34:18,039: INFO: common: YAML file loaded successfully: config\config.yaml]
[2026-01-02 21:34:18,040: INFO: common: YAML file loaded successfully: params.yaml]
[2026-01-02 21:34:18,042: INFO: common: YAML file loaded successfully: schema.yaml]
[2026-01-02 21:34:18,043: INFO: common: Directory created at: artifacts]
[2026-01-02 21:34:18,044: INFO: common: Directory created at: artifacts\data_ingestion]
[2026-01-02 21:34:19,947: INFO: 1496518022: artifacts\data_ingestion\data.zip downloaded! with following info: 
Connection: close
Content-Length: 23329
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "c69888a4ae59bc5a893392785a938ccd4937981c06ba8a9d6a21aa52b4ab5b6e"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: CEFA:2B10AF:1505625:26E50AF:6957EC84
Accept-Ranges: bytes
Date: Fri

In [11]:
# Sanity check: print the path of the Python interpreter running this kernel,
# which shows which environment the notebook is executing in.
import sys
print(sys.executable)


c:\Users\DELL\Miniconda3\envs\Red_Wine_Predication\python.exe
