In [1]:
from dataclasses import dataclass
from secondClassifier.constants import *
from secondClassifier.utils.common import read_yaml, create_directory
from pathlib import Path
import os
import tarfile 
from urllib import request
from secondClassifier.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH

In [2]:
# Move the working directory to the parent of the folder the kernel started in
# (presumably the project root, so relative config/artifact paths resolve --
# confirm against the repo layout).
# The starting directory is remembered the first time this cell runs, which
# makes the cell idempotent: a plain os.chdir('../') climbs one more level on
# every re-run and silently breaks all relative paths used afterwards.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = Path.cwd()
os.chdir(NOTEBOOK_DIR.parent)

In [3]:
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings consumed by the data-ingestion stage.

    The dataclass is frozen, so fields cannot be reassigned once an instance
    has been constructed.
    """

    # Working directory of the ingestion stage.
    root_dir: Path
    # URL the dataset archive is downloaded from.
    source_URL: str
    # Local path the downloaded archive is written to.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path
    # Directory that is scanned for stray files after extraction.
    clean_dir: Path

In [4]:
class ConfigurationManager:
    """Load the project configuration and build per-stage config objects.

    Constructing a manager reads the config and params YAML files and makes
    sure the top-level artifacts directory exists.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        # Ensure the artifacts root exists (via the project's create_directory
        # helper) before any stage tries to write below it.
        create_directory(self.config.artifacts_root)

    def get_dataingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        The stage's root directory is created as a side effect.

        Returns:
            DataIngestionConfig: Settings for the ingestion stage. Path-like
            entries are converted to ``pathlib.Path`` so the values match the
            field types the dataclass declares (the raw config values are
            handed over as-is otherwise, presumably plain strings).
        """
        section = self.config.data_ingestion
        # The helper receives the raw config value, exactly as before.
        create_directory(section.root_dir)
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
            clean_dir=Path(section.clean_dir),
        )

In [5]:
class dataIngestion:
    """Run the data-ingestion steps for one dataset archive.

    1. Download the archive from the configured URL.
    2. Extract the archive.
    3. Remove stray files that sit next to the extracted class folders.
    """

    def __init__(self, config: "DataIngestionConfig"):
        """Store the ingestion settings.

        Args:
            config: Settings object exposing ``source_URL``,
                ``local_data_file``, ``unzip_dir`` and ``clean_dir``.
        """
        self.config = config

    def download_data(self):
        """Download the archive to ``local_data_file`` unless it already exists.

        The transfer goes to a sibling ``<local_data_file>.part`` file that is
        renamed into place only after it completed. An interrupted download
        therefore never leaves a truncated archive behind that later runs
        would mistake for a finished one.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises; the partial file
            is removed before the error propagates.
        """
        target = self.config.local_data_file
        if os.path.exists(target):
            return

        partial = f"{target}.part"
        try:
            request.urlretrieve(self.config.source_URL, filename=partial)
            os.replace(partial, target)
        except BaseException:
            # BaseException on purpose: interrupting the kernel mid-download
            # raises KeyboardInterrupt, and that must clean up as well.
            if os.path.exists(partial):
                os.remove(partial)
            raise

    def unzip_dataset(self):
        """Extract the downloaded archive into ``unzip_dir``.

        Nothing happens when the archive is missing or when ``clean_dir``
        already exists (the dataset was extracted by an earlier run).
        """
        archive_path = self.config.local_data_file
        if not os.path.exists(archive_path) or os.path.exists(self.config.clean_dir):
            return

        # The archive comes from the network, so refuse members that would
        # escape the target directory. The "data" extraction filter only
        # exists on interpreters that ship tarfile.data_filter; older ones
        # fall back to the unfiltered extraction used before.
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        with tarfile.open(archive_path) as archive:
            archive.extractall(self.config.unzip_dir, **extract_kwargs)

    def cleanup_dataset(self):
        """Delete every regular file directly inside ``clean_dir``.

        Sub-directories are left untouched. Entries are classified by asking
        the filesystem rather than by looking for a suffix in the name, so a
        folder called ``roses.v2`` is kept and an extension-less file such as
        ``README`` is removed.

        Raises:
            FileNotFoundError: If ``clean_dir`` does not exist.
        """
        clean_dir = self.config.clean_dir
        for entry in os.listdir(clean_dir):
            entry_path = os.path.join(clean_dir, entry)
            if os.path.isfile(entry_path):
                os.remove(entry_path)
        
    

In [6]:
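## Scratch note: tarfile.open() expects a local path or a file object, not a URL,
## so the archive has to be downloaded before it can be opened:
##tarfile.open("http://download.tensorflow.org/example_images/flower_photos.tgz")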

In [7]:
# Instantiate the configuration manager with its default file paths, then ask
# it for the data-ingestion settings.
ingestion_config = ConfigurationManager()
data_ingestion_config = ingestion_config.get_dataingestion_config()

In [8]:
# Ingestion runner bound to the settings built in the previous cell.
dataingestion = dataIngestion(data_ingestion_config)

In [9]:
# Step 1: fetch the archive (skipped when the local file already exists).
dataingestion.download_data()

In [10]:
# Step 2: extract the archive (skipped when the clean directory already exists).
dataingestion.unzip_dataset()

In [11]:
# Step 3: remove stray entries from the clean directory.
dataingestion.cleanup_dataset()

In [12]:
# Scratch check: Path.suffix yields the extension including the leading dot.
Path('file.txt').suffix

'.txt'

In [13]:
# Scratch check: report every entry of the ingestion folder whose name has an
# extension; entries without one are skipped.
for file_folder in os.listdir('artifacts/data_ingestion'):
    if not Path(file_folder).suffix:
        continue
    print(f'Hello {file_folder}')

Hello flower_photos.tgz
