In [42]:
from collections import namedtuple
import os 
# Move the working directory up one level so that relative paths used later in
# the notebook resolve from there (presumably the project root — confirm for
# your checkout layout).
# The starting directory is remembered the first time this cell runs; without
# that, every re-execution of the cell would climb one directory higher.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [43]:
# Immutable bundle of settings for the data-ingestion stage.
#   root_dir        - directory for this stage; created before ingestion runs
#   source_URL      - URL the dataset archive is downloaded from
#   local_data_file - path the downloaded archive is saved to and opened from
#   unzip_dir       - directory the archive members are extracted into
DataIngestionConfig = namedtuple(
    'DataIngestionConfig',
    'root_dir source_URL local_data_file unzip_dir',
)

In [44]:
from deepClassifier.constants import *
from deepClassifier.utils import read_yaml, create_directories

In [67]:
class ConfigurationManager:
    """Reads the project's YAML settings and hands out per-stage config objects.

    Both YAML files are loaded once, at construction time, and the directory
    named by ``artifacts_root`` in the main config is created as a side effect.
    """

    def __init__(self,
                 config_file_path = CONFIG_FILE_PATH,
                 params_file_path = PARAMS_FILE_PATH):
        """Load the config and params files and create the artifacts root.

        Args:
            config_file_path: Path of the main YAML config file.
            params_file_path: Path of the YAML params file.

        Raises:
            Whatever ``read_yaml`` or ``create_directories`` raise; errors
            propagate unchanged.
        """
        # The former `except Exception as e: raise e` wrappers were no-ops that
        # only added a frame to the traceback, so they have been removed.
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Side effect: the section's ``root_dir`` is created via
        ``create_directories`` before the config object is assembled.

        Returns:
            A DataIngestionConfig holding ``root_dir``, ``source_URL``,
            ``local_data_file`` and ``unzip_dir`` exactly as they appear in the
            loaded config.
        """
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        return DataIngestionConfig(
            root_dir=config.root_dir,
            source_URL=config.source_URL,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir,
        )

In [69]:
import os
import urllib.request as request 
from zipfile import ZipFile

class DataIngestionComponent:
    """Downloads the dataset archive and extracts the usable images from it.

    The component reads three fields from the config object it is given:
    ``source_URL``, ``local_data_file`` and ``unzip_dir``.
    """

    def __init__(self, config: "DataIngestionConfig"):
        """Store the ingestion settings.

        Args:
            config: Object exposing ``source_URL``, ``local_data_file`` and
                ``unzip_dir`` attributes (a DataIngestionConfig).
        """
        self.config = config

    def download_file(self):
        """Download ``source_URL`` to ``local_data_file`` unless it already exists.

        The data is first written to ``<local_data_file>.part`` and renamed
        into place only after the transfer finishes. An interrupted download
        therefore never leaves a truncated file that a later run would treat
        as a complete archive.

        Raises:
            Any error raised by ``urllib.request.urlretrieve`` or by the rename.
        """
        target = self.config.local_data_file
        if os.path.exists(target):
            return

        partial = str(target) + ".part"
        try:
            request.urlretrieve(url=self.config.source_URL, filename=partial)
            os.replace(partial, target)
        finally:
            # Only present here if the download or the rename failed.
            if os.path.exists(partial):
                os.remove(partial)

    def _get_updated_list_of_files(self, list_of_files):
        """Return the member names that end in ``.jpg`` and contain 'Cat' or 'Dog'.

        Args:
            list_of_files: Iterable of archive member names (strings).

        Returns:
            A list of the matching names, in their original order.
        """
        return [
            file
            for file in list_of_files
            if file.endswith(".jpg") and ('Cat' in file or 'Dog' in file)
        ]

    def _preprocess(self, zf: ZipFile, file: str, working_dir):
        """Extract one member into ``working_dir`` and delete it if it is empty.

        Args:
            zf: Open archive to extract from.
            file: Name of the member inside the archive.
            working_dir: Directory to extract into.
        """
        target_file_path = os.path.join(working_dir, file)

        if not os.path.exists(target_file_path):
            # extract() returns the path it actually wrote. That can differ
            # from the plain join above when the member name is sanitised
            # (e.g. '..' components are dropped), so use it for the size check.
            target_file_path = zf.extract(file, working_dir)

        # remove file with size 0
        if os.path.getsize(target_file_path) == 0:
            os.remove(target_file_path)

    def unzip_and_clean(self):
        """Extract the Cat/Dog ``.jpg`` members of the archive into ``unzip_dir``.

        Members are selected with ``_get_updated_list_of_files``; zero-byte
        files are removed right after extraction.
        """
        with ZipFile(file=self.config.local_data_file, mode='r') as zf:
            for file in self._get_updated_list_of_files(zf.namelist()):
                self._preprocess(zf, file, self.config.unzip_dir)

In [60]:
# Scratch check: list the member names of a locally stored copy of the archive.
# NOTE(review): absolute, machine-specific path — this cell only runs where
# that file exists.
with ZipFile(r'D:\Music\Music\kagglecatsanddogs_5340.zip') as zf:
    list_of_files = zf.namelist()

In [61]:
# Preview the first seven member names.
list_of_files[:7]

['PetImages/Cat/',
 'PetImages/Cat/0.jpg',
 'PetImages/Cat/1.jpg',
 'PetImages/Cat/10.jpg',
 'PetImages/Cat/100.jpg',
 'PetImages/Cat/1000.jpg',
 'PetImages/Cat/10000.jpg']

In [62]:
# Keep only the .jpg members whose name mentions 'Cat' or 'Dog', then take the
# first seven for inspection.
updated_list_of_files = [
    name
    for name in list_of_files
    if name.endswith(".jpg") and any(tag in name for tag in ('Cat', 'Dog'))
][:7]
updated_list_of_files

['PetImages/Cat/0.jpg',
 'PetImages/Cat/1.jpg',
 'PetImages/Cat/10.jpg',
 'PetImages/Cat/100.jpg',
 'PetImages/Cat/1000.jpg',
 'PetImages/Cat/10000.jpg',
 'PetImages/Cat/10001.jpg']

In [63]:
# Print each selected member name on its own line.
for f in updated_list_of_files:
    print(f)

PetImages/Cat/0.jpg
PetImages/Cat/1.jpg
PetImages/Cat/10.jpg
PetImages/Cat/100.jpg
PetImages/Cat/1000.jpg
PetImages/Cat/10000.jpg
PetImages/Cat/10001.jpg


In [70]:
# End-to-end run of the data-ingestion stage: load the configuration, build the
# component, download the archive, then extract and clean the images.
# Errors propagate as-is; the former `except Exception as e: raise e` wrapper
# re-raised the same exception and only added noise to the traceback.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestionComponent(config=data_ingestion_config)
data_ingestion.download_file()
data_ingestion.unzip_and_clean()

[2023-02-12 11:00:16,421: INFO: common]: yaml file: configs\config.yaml loaded successfully
[2023-02-12 11:00:16,426: INFO: common]: yaml file: params.yaml loaded successfully
[2023-02-12 11:00:16,431: INFO: common]: created directory at: artifacts
[2023-02-12 11:00:16,433: INFO: common]: created directory at: artifacts/data_ingestion
