In [4]:
from collections import namedtuple

# Lightweight immutable record holding the four data-ingestion settings.
DataIngestionConfig = namedtuple(
    "DataIngestionConfig",
    "artifact_dir source_download_url downloaded_data_file_path unzipped_data_dir",
)

In [1]:
from typing import List
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only bundle of settings consumed by the data-ingestion stage."""

    # Directory created for this stage's artifacts.
    artifact_dir: Path
    # URL the dataset archive is fetched from.
    source_download_url: str
    # Local file the downloaded archive is written to and later opened as a zip.
    downloaded_data_file_path: Path
    # Directory the selected archive members are extracted into.
    unzipped_data_dir: Path

In [2]:
from pathlib import Path
from DeepClassifier.constants import *
from DeepClassifier.utils import read_yaml_file, make_directories

class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage configuration objects.

    Attributes:
        config: Result of ``read_yaml_file`` for the main config file. It is
            accessed with attribute syntax (``root_artifact_dir``,
            ``data_ingestion``), so it is presumably an attribute-style
            mapping -- confirm against ``read_yaml_file``.
        params: Result of ``read_yaml_file`` for the params file.
    """

    def __init__(self,
                 config_filepath: Path = CONFIG_FILEPATH,
                 params_filepath: Path = PARAMS_FILEPATH
                 ) -> None:
        """Read both YAML files and hand the root artifact directory to ``make_directories``.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml_file(filepath=config_filepath)
        self.params = read_yaml_file(filepath=params_filepath)

        make_directories([self.config.root_artifact_dir])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` config section.

        The stage's artifact directory is passed to ``make_directories`` before
        the settings object is returned.

        Returns:
            A ``DataIngestionConfig`` whose path fields are ``pathlib.Path``
            objects, matching the field types the config class declares.
        """
        section = self.config.data_ingestion

        make_directories([section.artifact_dir])

        # The YAML values arrive as plain strings; coerce the path fields so the
        # runtime types agree with the annotations on DataIngestionConfig.
        return DataIngestionConfig(
            artifact_dir=Path(section.artifact_dir),
            source_download_url=section.source_download_url,
            downloaded_data_file_path=Path(section.downloaded_data_file_path),
            unzipped_data_dir=Path(section.unzipped_data_dir),
        )

In [3]:
import os
import urllib.request as request
from zipfile import ZipFile
from typing import List
from DeepClassifier import logger

class DataIngestion:
    """Downloads the dataset archive and extracts the usable images from it.

    Attributes:
        config: Settings object exposing ``source_download_url``,
            ``downloaded_data_file_path`` and ``unzipped_data_dir``.
    """

    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def download_data(self):
        """Download the archive to ``downloaded_data_file_path`` unless it already exists.

        The archive is first written to a sibling ``.part`` file and only moved
        to its final name once the transfer has finished. An interrupted
        download therefore never leaves a truncated archive that a later run
        would mistake for a complete one.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises on a failed transfer
            (for example ``urllib.error.URLError``).
        """
        destination = os.fspath(self.config.downloaded_data_file_path)
        if os.path.exists(destination):
            return

        # urlretrieve opens the target file directly, so its folder must exist.
        parent_dir = os.path.dirname(destination)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        partial_path = destination + '.part'
        try:
            request.urlretrieve(
                url=self.config.source_download_url,
                filename=partial_path
            )
            os.replace(partial_path, destination)
        finally:
            # Still present only if the transfer or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def _get_updated_list_of_files(self, list_of_files: List[str]) -> List[str]:
        """Keep only the ``.jpg`` members whose path contains ``Cat`` or ``Dog``."""
        return [
            file for file in list_of_files
            if file.endswith('.jpg') and ('Cat' in file or 'Dog' in file)
        ]

    def _preprocess(self, zf: ZipFile, f: str, dir: str):
        """Extract member ``f`` into ``dir`` if missing, then delete it if it is empty.

        Args:
            zf: Open archive to extract from.
            f: Name of the archive member.
            dir: Directory the member is extracted into.
        """
        target_file_path = os.path.join(dir, f)

        if not os.path.exists(target_file_path):
            zf.extract(f, dir)

        # Zero-byte files are removed straight after extraction.
        if os.path.getsize(target_file_path) == 0:
            os.remove(target_file_path)

    def unzip_and_clean_data(self):
        """Extract the wanted images from the downloaded archive and drop empty ones.

        1. Only members ending in ``.jpg`` whose path contains ``Cat`` or
           ``Dog`` are extracted.
        2. Extracted files with a size of 0 bytes are removed.

        Completion is logged here, once the work has actually finished, rather
        than from a finalizer: a finalizer runs whenever the object is garbage
        collected, including after a failed run.
        """
        with ZipFile(file=self.config.downloaded_data_file_path, mode='r') as file_obj:
            wanted_files = self._get_updated_list_of_files(list_of_files=file_obj.namelist())

            for file in wanted_files:
                self._preprocess(file_obj, file, self.config.unzipped_data_dir)

        logger.info('Data Ingestion Stage Completed')






In [4]:
# Step up one directory from the notebook's location (presumably to the project
# root, so relative config and artifact paths resolve -- confirm).
# NOTE: not idempotent -- every re-run of this cell climbs one more level.
os.chdir(os.pardir)
os.getcwd()

'd:\\Full Stack Data Science\\Python Project\\DL\\Current Batch\\Deep_CNN_Classifier'

In [5]:
# Run the data-ingestion stage end to end: load the configuration, download the
# archive, then extract and clean the images.
# There is deliberately no try/except here: a handler that only re-raises the
# same exception adds nothing except an extra traceback frame.
config = ConfigurationManager()
d = config.get_data_ingestion_config()
d_ing = DataIngestion(d)
d_ing.download_data()
d_ing.unzip_and_clean_data()

[2022-10-18 13:12:41,217: INFO: common]: Loaded the content from configs\config.yaml successfully
[2022-10-18 13:12:41,220: INFO: common]: Loaded the content from params.yaml successfully
[2022-10-18 13:12:41,222: INFO: common]: artifacts created successfully.
[2022-10-18 13:12:41,225: INFO: common]: artifacts/data_ingestion created successfully.


In [6]:
# Quick check that the project logger emits INFO records from the notebook.
from DeepClassifier import logger
logger.info('hello')

[2022-10-18 13:12:52,108: INFO: 2843173021]: hello


In [None]:
# Data cleaning:
## 1. Unwanted PDF files in the root directory.
## 2. Files in a different format inside the images folder.
## 3. Image files with a size of 0 KB.

## Check the output of namelist() to confirm.

In [None]:
from zipfile import ZipFile

# Exploratory look at the archive: list every member, then print the .jpg ones.
with ZipFile('kagglecatsanddogs_5340.zip', 'r') as f:
    x = f.namelist()

# A plain loop is used instead of a list comprehension: the comprehension was
# evaluated only for its print side effect and made the cell echo a list of
# None values as its output.
for i in x:
    if i.endswith('.jpg'):
        print(i)


In [1]:
# str.endswith matches a bare suffix, so 'txt' (without the leading dot) also
# matches; use '.txt' when an actual file extension is meant.
s = 'txt/txt.txt'
s.endswith('txt')

True