In [4]:
import os

In [5]:
# IPython magic: show the kernel's current working directory before it is changed.
%pwd

'/Users/sanket/Documents/ML Project/Full-Stack Machine Learning Deployment with MLOps and AWS EC2/research'

In [6]:
# Print the current working directory via the standard library
# (the same information the %pwd magic reports).
print(os.getcwd())

/Users/sanket/Documents/ML Project/Full-Stack Machine Learning Deployment with MLOps and AWS EC2/research


In [7]:
# Move from the notebook's folder to its parent so that relative paths used
# later (e.g. config/config.yaml) resolve.
# The starting directory is remembered the first time this cell runs, so
# re-running the cell always lands in the same place instead of climbing one
# directory further on every execution.
NOTEBOOK_DIR = globals().get("NOTEBOOK_DIR", os.getcwd())
os.chdir(os.path.dirname(NOTEBOOK_DIR))

In [8]:
# IPython magic: confirm the working directory after the change above.
%pwd

'/Users/sanket/Documents/ML Project/Full-Stack Machine Learning Deployment with MLOps and AWS EC2'

In [9]:
from dataclasses import dataclass, field
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings for the data-ingestion stage.

    Instances are built by ConfigurationManager.get_data_ingestion_config and
    consumed by DataIngestion. ``frozen=True`` makes attribute assignment
    after construction raise an error.
    """
    # Directory for this stage; it is created when the config object is built.
    root_dir: Path
    # URL the data archive is downloaded from.
    source_URL: str
    # Local path the downloaded archive is saved to (and later opened as a zip).
    local_data_file: Path
    # Directory the archive's contents are extracted into.
    unzip_dir: Path

In [10]:
from MlOpsProject.constants import *
from MlOpsProject.utils.common import read_yaml, create_directories

In [11]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
            schema_filepath: Path of the schema YAML file.
        """
        # read_yaml is expected to return an object with attribute access
        # (the code below reads self.config.artifacts_root).
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every stage writes below this directory, so create it up front.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the settings object for the data-ingestion stage.

        Creates the stage's root directory as a side effect.

        Returns:
            DataIngestionConfig populated from the ``data_ingestion`` section
            of the main configuration file.
        """
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        # DataIngestionConfig declares its filesystem fields as Path, so
        # convert the raw YAML values (presumably plain strings) here instead
        # of leaving each consumer to wrap them itself.
        return DataIngestionConfig(
            root_dir = Path(config.root_dir),
            source_URL = config.source_URL,
            local_data_file = Path(config.local_data_file),
            unzip_dir = Path(config.unzip_dir),
        )

In [12]:
import os 
import urllib.request as request
import zipfile
from MlOpsProject import logger
from MlOpsProject.utils.common import get_size


In [13]:
class DataIngestion:
    """Download the source archive and unpack it, driven by a DataIngestionConfig."""

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings.

        Args:
            config: Supplies ``source_URL``, ``local_data_file`` and
                ``unzip_dir`` used by the methods below.
        """
        self.config = config

    def download_file(self):
        """Download ``source_URL`` to ``local_data_file`` unless the file already exists.

        The data is first written to a temporary ``.part`` file next to the
        destination and renamed into place only after the transfer finishes.
        A failed or interrupted download therefore never leaves a truncated
        file at ``local_data_file`` that a later run would mistake for a
        completed download.

        Returns:
            None. The outcome is reported through the logger.

        Raises:
            Any exception raised by ``urllib.request.urlretrieve`` or
            ``os.replace`` propagates to the caller.
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            logger.info(f"File already exists of size : {get_size(Path(self.config.local_data_file))}")
            return

        partial = f"{os.fspath(target)}.part"
        try:
            # urlretrieve returns (local filename, response headers); only the
            # headers are needed because the final path is already known.
            _, headers = request.urlretrieve(
                url = self.config.source_URL,
                filename = partial
            )
            os.replace(partial, target)
        finally:
            # After a successful rename the temporary file is gone and this is
            # a no-op; after a failure it removes the incomplete download.
            if os.path.exists(partial):
                os.remove(partial)

        logger.info(f"{target} download! with following info: \n{headers}")

    def extract_zip_file(self):
        """Extract the downloaded archive into ``unzip_dir``.

        Opens ``local_data_file`` as a zip archive and extracts all of its
        members into ``unzip_dir``, creating that directory first if needed.

        Returns:
            None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

In [18]:
# Run the data-ingestion workflow end to end.
try:
    # Load the YAML settings; constructing the manager also creates the artifacts root directory.
    config = ConfigurationManager()
    # Build the data-ingestion settings (this creates the stage's root directory).
    data_ingestion_config = config.get_data_ingestion_config()
    # Hand the settings to the component that does the actual work.
    data_ingestion = DataIngestion(config= data_ingestion_config)
    # Download the archive; skipped when the local file already exists.
    data_ingestion.download_file()
    # Unpack the downloaded zip archive into the configured directory.
    data_ingestion.extract_zip_file()
except Exception:
    # A bare `raise` re-raises the active exception unchanged, keeping its
    # original traceback, so the failure surfaces in the cell output.
    raise

## Note: this cell initially failed with a "YAML file is empty" error.
# The error means that the YAML file being read contains no data.
# As a temporary workaround, add some placeholder values to schema.yaml and params.yaml.

[2024-10-04 16:14:08,378: INFO: common: YAML file: config/config.yaml loaded successfully.]
[2024-10-04 16:14:08,380: INFO: common: YAML file: params.yaml loaded successfully.]
[2024-10-04 16:14:08,383: INFO: common: YAML file: schema.yaml loaded successfully.]
[2024-10-04 16:14:08,385: INFO: common: created directory at: artifacts]
[2024-10-04 16:14:08,386: INFO: common: created directory at: artifacts/data_ingestion]
[2024-10-04 16:14:08,387: INFO: 1418873313: File already exists of size : ~ 25 KB]
