# Data Ingestion

In [1]:
import os
# Move the working directory one level up (presumably from the notebook folder
# to the project root, so the relative paths in the config resolve -- confirm).
# NOTE: the target is relative to the current directory, so re-running this
# cell moves up one more level each time; run it once per kernel session.
os.chdir("../")

## Entity


In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only bundle of the settings consumed by the data-ingestion step.

    Instances are frozen: fields are set once at construction and cannot be
    reassigned afterwards.
    """

    # Directory for the ingestion step.
    root_dir: Path
    # URL the archive is downloaded from.
    source_url: str
    # Local path the downloaded archive is saved to.
    local_file_path: Path
    # Directory the archive members are extracted into.
    unzip_dir: Path

In [3]:
pwd

'/Users/sachinsen/Documents/DL Project/CNN_Classifier'

In [None]:
pwd

## Utils


In [14]:
from CNNClassifier.constants import CONFIG_FILE_PATH,PARAM_FILE_PATH
from CNNClassifier.utils.utils import read_yaml,create_dir


## Utility functions (notebook copies of `read_yaml` / `create_dir`)

In [7]:
import os
import sys
import yaml
from src.CNNClassifier import logger
import json
import joblib
from pathlib import Path
from typing import Any
from ensure import ensure_annotations   # to fix the output in what format to receive 
from box.exceptions import BoxValueError
from box import ConfigBox



@ensure_annotations
def read_yaml(path_to_yaml : Path) -> ConfigBox:
    """Parse a YAML file and wrap the result in a ``ConfigBox``.

    Args:
        path_to_yaml: Location of the YAML file to read.

    Returns:
        The parsed document wrapped in a ``ConfigBox``.
    """
    # safe_load accepts a binary stream, so the file is opened in "rb" mode.
    with open(path_to_yaml, "rb") as stream:
        parsed = yaml.safe_load(stream)
    return ConfigBox(parsed)
    
@ensure_annotations
def save_json():
    """Placeholder: not implemented; takes no arguments and returns None."""
    pass

@ensure_annotations
def load_json():
    """Placeholder: not implemented; takes no arguments and returns None."""
    pass

@ensure_annotations
def save_mode():
    """Placeholder: not implemented; takes no arguments and returns None.

    NOTE(review): the name is presumably a typo for ``save_model`` (it sits
    next to ``load_model``) -- confirm before renaming.
    """
    pass

@ensure_annotations
def load_model():
    """Placeholder: not implemented; takes no arguments and returns None."""
    pass

@ensure_annotations
def get_size():
    """Placeholder: not implemented; takes no arguments and returns None."""
    pass

@ensure_annotations
def create_dir(path_to_directory:list, verbose=True):
    """Create every directory in the given list, including missing parents.

    Args:
        path_to_directory: List of directory paths to create. Directories that
            already exist are left untouched.
        verbose: When truthy, log one INFO line per directory processed.
    """
    for directory in path_to_directory:
        os.makedirs(directory, exist_ok=True)
        if not verbose:
            continue
        logger.info(f"Create directory ay : {directory}")
    





## Read config YAML


In [37]:
from CNNClassifier.constants import CONFIG_FILE_PATH,PARAM_FILE_PATH


ARTIFACTS_DIR : artifacts

DATA_INGESTION:
  ROOT_DIR : artifacts/data_ingestion
  SOURCE_URL: https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_5340.zip
  LOCAL_DATA_FILE: artifacts/data_ingestion/data.zip
  UNZIP_DIR : artifacts/data_ingestion

## Configuration manager

In [23]:
class ConfigurationManager:
    """Reads the project YAML files and builds typed config objects from them."""

    def __init__(self,config_filepath = CONFIG_FILE_PATH,params_filepath = PARAM_FILE_PATH):
        """Load both YAML files and create the top-level artifacts directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_dir([self.config.ARTIFACTS_DIR])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``DATA_INGESTION`` section.

        The ingestion root directory is created as a side effect.

        Returns:
            A ``DataIngestionConfig`` whose path fields are ``Path`` objects.
        """
        section = self.config.DATA_INGESTION

        create_dir([section.ROOT_DIR])

        # The YAML values are plain strings; DataIngestionConfig declares these
        # three fields as Path, so convert them to honour that contract.
        return DataIngestionConfig(
            root_dir=Path(section.ROOT_DIR),
            source_url=section.SOURCE_URL,
            local_file_path=Path(section.LOCAL_DATA_FILE),
            unzip_dir=Path(section.UNZIP_DIR),
        )


## Data Ingestion


In [24]:
import os
import urllib.request as request
from zipfile import ZipFile

In [33]:
class DataIngestion:
    """Downloads the dataset archive and extracts its ``.jpg`` members."""

    def __init__(self, config:DataIngestionConfig) -> None:
        """Store the ingestion settings.

        Args:
            config: Provides ``source_url``, ``local_file_path`` and ``unzip_dir``.
        """
        self.config = config

    def download_file(self):
        """Download the archive to ``local_file_path`` unless it already exists.

        The data is first written to a sibling ``<local_file_path>.part`` file
        and only moved into place once the transfer has finished. An
        interrupted download therefore never leaves a truncated archive that a
        later run would mistake for a complete one.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises on failure; the
            partial file is removed before the exception propagates.
        """
        destination = self.config.local_file_path
        if os.path.exists(destination):
            logger.info(f"file already exist {self.config.local_file_path}")
            return

        partial = f"{destination}.part"
        try:
            request.urlretrieve(
                url=self.config.source_url,
                filename=partial
            )
            os.replace(partial, destination)
        finally:
            # After a successful replace the partial file is gone; anything
            # still here is the residue of a failed or interrupted transfer.
            if os.path.exists(partial):
                os.remove(partial)

    def get_updated_list_file(self,list_of_files):
        """Return only the names that end with ``.jpg`` (case-sensitive)."""
        return [f for f in list_of_files if f.endswith(".jpg")]

    def preprocess(self,zf,f,working_dir):
        """Extract member ``f`` from ``zf`` into ``working_dir`` if not already there.

        Args:
            zf: An open ``ZipFile``.
            f: Name of the archive member to extract.
            working_dir: Directory to extract into.
        """
        target_file_path = os.path.join(working_dir,f)
        if not os.path.exists(target_file_path):
            zf.extract(f,working_dir)

    def unzip_clean(self):
        """Extract every ``.jpg`` member of the downloaded archive into ``unzip_dir``.

        Members that already exist on disk are skipped, so the method can be
        re-run without re-extracting.
        """
        with ZipFile(file = self.config.local_file_path,mode='r') as zf:
            jpg_members = self.get_updated_list_file(list_of_files=zf.namelist())
            for member in jpg_members:
                self.preprocess(zf,member,self.config.unzip_dir)


    



In [34]:
pwd

'/Users/sachinsen/Documents/DL Project/CNN_Classifier'

In [35]:
from CNNClassifier.logger import logger
from CNNClassifier.config.configuration import ConfigurationManager


# Driver: run the ingestion step end to end.
# NOTE(review): "Date" in the message below is presumably a typo for "Data".
logger.info(f"Date Ingestion started")
# ConfigurationManager here is the one imported from CNNClassifier.config.configuration
# just above, which rebinds the name over the class defined earlier in this notebook.
config = ConfigurationManager()

data_ingestion_config = config.get_data_ingestion_config()

data_ingestion= DataIngestion(config=data_ingestion_config)

# Download is skipped when the archive already exists; then extract the .jpg members.
data_ingestion.download_file()
data_ingestion.unzip_clean()

logger.info(f"Data ingestion complete")


[2023-11-07 22:31:11,453: INFO: 3146933557]: Date Ingestion started
[2023-11-07 22:31:11,458: INFO: utils]: Create directory ay : artifacts
[2023-11-07 22:31:11,458: INFO: utils]: Create directory ay : artifacts/data_ingestion
[2023-11-07 22:31:11,459: INFO: 2236342710]: file already exist artifacts/data_ingestion/data.zip
[2023-11-07 22:31:16,499: INFO: 3146933557]: Data ingestion complete
