# Data Ingestion Stage

## Entity

In [7]:
from collections import namedtuple
from dataclasses import dataclass
from pathlib import Path

# DataIngestionConfig = namedtuple("DataIngestionConfig",
# [
#     "root_dir",
#     "source_URL",
#     "local_data_file",
#     "unzip_dir"
# ])


# Alternative way
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings for the data-ingestion stage."""

    # Directory created before the stage runs.
    root_dir: Path
    # Location the dataset archive is downloaded from.
    source_URL: str
    # Local path the downloaded archive is stored at.
    local_data_file: Path
    # Directory the archive members are extracted into.
    unzip_dir: Path




## Configuration Manager

In [8]:
from CNN_CLASSIFIER.constants import *
from CNN_CLASSIFIER.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Load the project's YAML configuration and build per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and prepare the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        The stage's root directory is created as a side effect.

        Returns:
            A ``DataIngestionConfig`` populated from the loaded configuration.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        # The entity declares these three fields as Path, so convert the raw
        # configuration values instead of passing them through unchanged.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )

## Components

In [10]:
import os
import urllib.request as request
from zipfile import ZipFile
from CNN_CLASSIFIER.logging import logger


class DataIngestionConfig:
    """Download the dataset archive and extract the usable Cats/Dogs images.

    NOTE(review): this class reuses the name of the ``DataIngestionConfig``
    entity dataclass defined earlier in this file. Once this definition runs,
    the name refers to this class, so
    ``ConfigurationManager.get_data_ingestion_config`` (which looks the name up
    at call time) would try to build this class from the entity's keyword
    arguments. Renaming this class (e.g. ``DataIngestion``) together with its
    call site would remove the clash; the name is kept here so existing
    callers keep working.
    """

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings and echo them to stdout.

        Args:
            config: Object exposing ``source_URL``, ``local_data_file`` and
                ``unzip_dir``.
        """
        self.config = config
        print(config)

    def download_data(self):
        """Fetch ``source_URL`` into ``local_data_file`` unless it already exists.

        The download goes to a sibling ``.part`` file and is only moved into
        place once complete, so an interrupted transfer cannot leave a
        truncated archive that a later run would mistake for a finished one.
        """
        target = self.config.local_data_file
        if os.path.exists(target):
            return

        partial = f"{os.fspath(target)}.part"
        try:
            request.urlretrieve(url=self.config.source_URL, filename=partial)
            os.replace(partial, target)
        finally:
            # After a successful replace the partial file is gone; this only
            # cleans up what a failed download left behind.
            if os.path.exists(partial):
                os.remove(partial)

    def _get_update_list_of_files(self, list_of_files):
        """Return only the ``.jpg`` names whose path contains "Cats" or "Dogs"."""
        return [
            f for f in list_of_files
            if f.endswith('.jpg') and ("Cats" in f or "Dogs" in f)
        ]

    def _preprocess(self, zf: ZipFile, f: str, working_dir: str):
        """Extract member ``f`` into ``working_dir`` and drop it if it is empty.

        Args:
            zf: Open archive to extract from.
            f: Member name inside the archive.
            working_dir: Directory to extract into.
        """
        target_filepath = os.path.join(working_dir, f)
        # Extract only when the file is not already on disk.
        if not os.path.exists(target_filepath):
            # extract() returns the path it actually wrote, which can differ
            # from the plain join when the member name gets sanitised.
            target_filepath = zf.extract(f, working_dir)

        # Remove zero-byte files.
        if os.path.getsize(target_filepath) == 0:
            os.remove(target_filepath)

    def unzip_and_clean(self):
        """Extract every selected image from the archive, discarding empty ones."""
        with ZipFile(file=self.config.local_data_file, mode='r') as zf:
            wanted = self._get_update_list_of_files(zf.namelist())
            for f in wanted:
                self._preprocess(zf, f, self.config.unzip_dir)

# Example
'''
x/y/Cats/1.jpg
x/y/Dogs/1.jpg
x/y/1.jpg ----> will be ignored
0.kb -----> will be ignored
Note : _function_ == Hidden methods 
'''

'\nx/y/Cats/1.jpg\nx/y/Dogs/1.jpg\nx/y/1.jpg ----> will be ignored\n0.kb -----> will be ignored\nNote : _function_ == Hidden methods \n'

## Pipeline

In [None]:
%pwd
# NOTE(review): hard-coded, machine-specific absolute path. Presumably this
# switches to the project root so that relative paths such as
# 'config/config.yaml' resolve — confirm, and adjust per machine.
os.chdir("/Users/pavankumar/Projects/CNN-Classifier-End-to-end/")

In [None]:
%pwd

'/Users/pavankumar/Projects/CNN-Classifier-End-to-end'

In [11]:
# Run the ingestion stage end to end: load the configuration, download the
# archive, then extract and clean its contents.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestionConfig(config = data_ingestion_config)
    data_ingestion.download_data()
    data_ingestion.unzip_and_clean()

except Exception as e:
    # Log at ERROR level with the traceback attached; logging only the
    # message hides which step failed. The failure is still not re-raised.
    logger.exception(e)

[2024-05-01 10:56:04,590: ERROR: 3098241455: [Errno 2] No such file or directory: 'config/config.yaml']
