In [1]:
import os
import urllib.request as request
from zipfile import ZipFile

from collections import namedtuple

# Step up one level from the current working directory so that the relative
# paths used in the rest of the notebook resolve from the parent directory.
# NOTE: this is relative to the *current* working directory, so re-running
# this cell without restarting the kernel moves up one more level each time.
os.chdir(os.pardir)

In [2]:
# Immutable record holding every setting the data-ingestion stage needs.
DataIngestionConfig = namedtuple(
    typename="DataIngestionConfig",
    field_names=(
        # Directory where the artifacts of data ingestion will be saved
        "root_dir",
        # URL of the data
        "source_URL",
        # Path of the downloaded zipped data file
        "zipped_data_file_path",
        # Directory of the unzipped data file
        "unzipped_file_dir",
    ),
)

In [3]:
from DeepClassifier.constants import *
from DeepClassifier.utils import read_yaml, create_directories

## Configuration Manager

In [4]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_file_path: Path = CONFIG_FILE_PATH,
        params_file_path: Path = PARAMS_FILE_PATH
    ) -> None:
        """Reads both YAML files and prepares the artifacts root directory.

        Args:
            config_file_path (Path, optional): Location of the config.yaml
                file. Defaults to the constant CONFIG_FILE_PATH.
            params_file_path (Path, optional): Location of the params.yaml
                file. Defaults to the constant PARAMS_FILE_PATH.
        """
        # Load the contents of config.yaml first, then params.yaml
        self.config = read_yaml(yaml_file_path=config_file_path)
        self.params = read_yaml(yaml_file_path=params_file_path)

        # Make sure the top-level artifacts directory named in config.yaml
        # is created before any stage tries to write into it
        artifacts_root = self.config.artifacts_root
        create_directories(paths_of_directories=[artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Builds the configuration for the data-ingestion stage.

        The stage's root directory is created as a side effect.

        Returns:
            DataIngestionConfig: Settings taken from the `data_ingestion`
                key of the loaded config.
        """
        # The `data_ingestion` key of config.yaml holds this stage's settings
        ingestion_section = self.config.data_ingestion

        # Create the stage's own directory (its `root_dir`)
        stage_root = ingestion_section.root_dir
        create_directories(paths_of_directories=[stage_root])

        # Copy the settings over into the immutable config record
        return DataIngestionConfig(
            root_dir=stage_root,
            source_URL=ingestion_section.source_URL,
            zipped_data_file_path=ingestion_section.zipped_data_file_path,
            unzipped_file_dir=ingestion_section.unzipped_file_dir,
        )

## Components

In [5]:
class DataIngestion:
    """Downloads the zipped data file and extracts the training images."""

    def __init__(self, config: DataIngestionConfig) -> None:
        """Inits DataIngestion.

        Args:
            config (DataIngestionConfig): The DataIngestionConfig.
        """
        self.config = config

    def download_data_file(self) -> None:
        """Downloads the data file unless it is already present.

        The download is first written to a temporary '<target>.part' file and
        only moved to `zipped_data_file_path` once it has completed. This way
        an interrupted download is never mistaken for a finished one by the
        "already downloaded" check on the next run.

        Any error raised while retrieving the URL is propagated to the caller
        after the partial file has been cleaned up.
        """
        target_path = self.config.zipped_data_file_path

        # Download only when the file is not already downloaded
        if os.path.exists(target_path):
            return

        partial_path = f"{target_path}.part"
        try:
            request.urlretrieve(
                url=self.config.source_URL,
                filename=partial_path
            )
            # Publish the file under its final name only after a full download
            os.replace(partial_path, target_path)
        finally:
            # The partial file still exists only when the download failed
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def _get_updated_list_of_files(self, list_of_files: list) -> list:
        """Returns an updated list of files that include only those files which
        are needed for training.

        A file is kept when its name ends with '.jpg' and one of its parent
        directories is named exactly 'Cat' or 'Dog'. Matching on whole path
        components (instead of substrings) keeps out entries such as
        'Category/x.jpg' or 'Dogs/x.jpg'.

        Args:
            list_of_files (list): The list of files to be updated.

        Returns:
            list: The updated list of files, in their original order.
        """
        wanted_directories = {"Cat", "Dog"}

        updated_list_of_files = []
        for file in list_of_files:
            if not file.endswith(".jpg"):
                continue

            # Zip member names normally use '/', but tolerate '\\' as well.
            # The last component is the file name itself, so it is excluded.
            parent_directories = file.replace("\\", "/").split("/")[:-1]
            if wanted_directories.intersection(parent_directories):
                updated_list_of_files.append(file)

        return updated_list_of_files

    def _preprocess(self, zf: ZipFile, file: str, working_dir: str) -> None:
        """Extracts a file from the zipped data file and drops it if empty.

        Args:
            zf (ZipFile): The zip file of the data.
            file (str): The file (path) in the zip data to be extracted.
            working_dir (str): The directory in which the file is to be
                extracted.
        """
        # Creating the path of the file that is to be extracted
        target_file_path = os.path.join(working_dir, file)

        # We extract the file only if it does not already exist
        if not os.path.exists(target_file_path):
            # `extract` sanitises the member name (e.g. strips a leading '/'
            # and '..' components), so the file may be written to a path
            # other than the plain join above. Use the path it reports.
            target_file_path = zf.extract(file, working_dir)

        # If the size of the extracted file is 0 KB, we delete it
        if os.path.getsize(target_file_path) == 0:
            os.remove(target_file_path)

    def unzip_and_clean_data_file(self) -> None:
        """Unzips and cleans the data file.

        Only the files selected by `_get_updated_list_of_files` are extracted
        into `unzipped_file_dir`; empty files are removed after extraction.
        """
        with ZipFile(file=self.config.zipped_data_file_path, mode="r") as zf:
            # Getting the list of files in the downloaded zip file
            list_of_files = zf.namelist()

            # Updating the list of files to only include files that we want
            # for training
            updated_list_of_files = self._get_updated_list_of_files(
                list_of_files=list_of_files
            )

            # Extracting the files
            for file in updated_list_of_files:
                self._preprocess(
                    zf=zf,
                    file=file,
                    working_dir=self.config.unzipped_file_dir
                )

In [6]:
# Run the data-ingestion stage end to end: load the configuration, build the
# stage config, then download and unpack the data. Any exception is left to
# propagate as-is so the notebook shows the original traceback.
config = ConfigurationManager()

data_ingestion_config = config.get_data_ingestion_config()

data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_data_file()
data_ingestion.unzip_and_clean_data_file()

[2023-06-30 12:24:32,960: INFO: common]: YAML file 'configs\config.yaml' loaded successfully
[2023-06-30 12:24:32,975: INFO: common]: YAML file 'params.yaml' loaded successfully
[2023-06-30 12:24:32,978: INFO: common]: Created the directory at: artifacts
[2023-06-30 12:24:32,981: INFO: common]: Created the directory at: artifacts/data_ingestion
