In [1]:
import os


In [2]:
%pwd

'f:\\end to end ml project\\END-TO-END-MACHINE-LEARNING-PROJECT\\research'

In [3]:
# Step from the notebook folder (research/) up to the project root.
# Guarded on the folder name so that re-running this cell does not climb
# one more level each time it is executed.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd # one folder back

'f:\\end to end ml project\\END-TO-END-MACHINE-LEARNING-PROJECT'

In [5]:
# Entity mirroring the `data_ingestion` section of config.yaml

"""Data ingestion works inside the artifacts directory."""

from dataclasses import dataclass  # the dataclass decorator for simplifying class creation
from pathlib import Path  # Path for handling file system paths 

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of the settings used by the data ingestion step.

    ``frozen=True`` makes instances read-only: assigning to a field after
    construction raises ``dataclasses.FrozenInstanceError``.

    Attributes:
        root_dir: Directory holding the data ingestion files and outputs.
        source_URL: URL the data archive is downloaded from.
        local_data_file: Local path the downloaded archive is saved to.
        unzip_dir: Directory the archive is extracted into.
    """

    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path


In [6]:
# From wine_quality.utils.common: read_yaml and create_directories
from wine_quality.constants import *
from wine_quality.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects.

    Args:
        config_filepath: Path to the main config file.
        params_filepath: Path to the parameters file.
        schema_filepath: Path to the schema file.

    Side effects:
        Constructing the manager calls ``create_directories`` on the
        ``artifacts_root`` entry of the main config.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
    ):
        # The loaded objects are read with attribute access below, so
        # read_yaml presumably returns an attribute-style mapping -- confirm
        # against wine_quality.utils.common.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every stage writes below the artifacts root, so create it up front.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data ingestion settings from the ``data_ingestion`` section.

        Also calls ``create_directories`` on that section's ``root_dir``.

        Returns:
            A ``DataIngestionConfig`` whose path fields are ``pathlib.Path``
            objects, matching the types the dataclass declares.
        """
        ingestion = self.config.data_ingestion

        create_directories([ingestion.root_dir])

        # The YAML values arrive as plain strings; convert them so the fields
        # really hold the Path type that DataIngestionConfig annotates.
        return DataIngestionConfig(
            root_dir=Path(ingestion.root_dir),
            source_URL=ingestion.source_URL,
            local_data_file=Path(ingestion.local_data_file),
            unzip_dir=Path(ingestion.unzip_dir),
        )


In [8]:
import os
import urllib.request as request
import zipfile
from wine_quality import logger
from wine_quality.utils.common import get_size

In [9]:
class DataIngestion:
    """Downloads the configured archive and extracts it locally.

    Args:
        config: Settings object providing ``source_URL``, ``local_data_file``
            and ``unzip_dir``.
    """

    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def download_file(self):
        """Download the archive unless it is already present locally.

        The data is first written to ``<local_data_file>.part`` and only moved
        to its final name once the transfer has completed. An interrupted
        download therefore never leaves a truncated file that a later run
        would mistake for a finished one.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises on a failed
            transfer, or ``OSError`` if the file cannot be written or moved.
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            # Nothing to fetch; report the size of the file already present.
            logger.info(f"File already exists of size: {get_size(Path(target))}")
            return

        # urlretrieve does not create missing directories.
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        partial = f"{target}.part"
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=partial
            )
            # Publish the finished download under its final name in one step.
            os.replace(partial, target)
        finally:
            # After a successful move the partial file is gone; after a
            # failure or interrupt this removes the incomplete download.
            if os.path.exists(partial):
                os.remove(partial)

        logger.info(f"{target} downloaded with the following info: \n{headers}")

    def extract_zip_file(self):
        """Extract the downloaded zip archive into ``unzip_dir``.

        The extraction directory is created if it does not exist.

        Raises:
            FileNotFoundError: If the archive has not been downloaded.
            zipfile.BadZipFile: If the file is not a valid zip archive.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

  

In [10]:
try:
    # Load the YAML configuration files
    config = ConfigurationManager()

    # Build the settings for the data ingestion stage
    data_ingestion_config = config.get_data_ingestion_config()

    # Set up the ingestion component with those settings
    data_ingestion = DataIngestion(config=data_ingestion_config)

    # Fetch the archive from the source URL (skipped if already present)
    data_ingestion.download_file()

    # Unpack the archive into the configured directory
    data_ingestion.extract_zip_file()

except Exception:
    # Record the failure in the project log, then re-raise with a bare
    # `raise` so the original traceback is kept unchanged.
    logger.exception("Data ingestion failed")
    raise


[2024-11-26 05:17:21,331: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-11-26 05:17:21,334: INFO: common: yaml file: Params.yaml loaded successfully]
[2024-11-26 05:17:21,340: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-11-26 05:17:21,340: INFO: common: created directory at: artifacts]
[2024-11-26 05:17:21,346: INFO: common: created directory at: artifacts/data_ingestion]
[2024-11-26 05:17:23,757: INFO: 1415412580: artifacts/data_ingestion/data.zip downloaded with the following info: 
Connection: close
Content-Length: 21984
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "85a38be3d4c70e3b6dc97fc47873623da88c9b850e1526aeb57ff236bdd2815c"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: A798:34E87F:53F7:B2E9:67450A2A
Accept-Ranges: bytes
Date: Mon, 2