In [1]:
import os


# Move the working directory one level up so that the relative paths used by
# later cells resolve against the parent directory (presumably the project
# root -- confirm against where this notebook is stored).
# NOTE: this is a relative chdir, so running the cell again without restarting
# the kernel moves up one more level each time.
os.chdir("../")

In [2]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings consumed by the data-ingestion step.

    NOTE(review): the path fields are annotated as ``Path``, but
    ``ConfigurationManager.get_data_ingestion_config`` passes the values read
    from the YAML configuration through without conversion, so at runtime they
    may be plain strings -- confirm before relying on ``Path`` methods.
    """

    # Directory that is created before the download starts.
    root_dir: Path
    # Share link of the file to download; the file id is parsed out of it.
    source_url: str
    # Location the downloaded archive is written to and later opened as a zip.
    local_data_file: Path
    # Directory the archive contents are extracted into.
    unzip_dir: Path
    # String prepended to the file id to build the URL handed to gdown.
    prefix: str

In [3]:
from cnn_classifier.constants import *
from cnn_classifier.utils.common import read_yaml, create_directories


class ConfigurationManager:
    """Load the project's YAML settings and hand out per-stage config objects."""

    def __init__(
        self,
        config_file_path: Path = CONFIG_FILE_PATH,
        params_file_path: Path = PARAMS_FILE_PATH,
    ):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_file_path: Location of the main configuration YAML file.
            params_file_path: Location of the parameters YAML file.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        # Every stage writes below this directory, so create it up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the settings object for the data-ingestion stage.

        The stage's root directory is created as a side effect.

        Returns:
            A ``DataIngestionConfig`` populated from the ``data_ingestion``
            section of the loaded configuration.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_url=section.source_url,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
            prefix=section.prefix,
        )

In [4]:
import zipfile

import gdown

from cnn_classifier import logger


class DataIngestion:
    """Download the dataset archive with gdown and unpack it locally.

    The source URL and every filesystem location are read from the ``config``
    object supplied at construction time.
    """

    def __init__(self, config: "DataIngestionConfig"):
        """Store the settings used by the download and extract steps."""
        self.config = config

    @staticmethod
    def _extract_file_id(source_url: str) -> str:
        """Return the file id embedded in a share link.

        Recognised shapes, tried in this order:

        * a ``/d/<id>`` path segment, with or without a trailing ``/view``;
        * an ``id=<id>`` query parameter;
        * otherwise the second-to-last ``/``-separated piece of the URL,
          which is the rule this class originally applied to every URL.

        Raises:
            IndexError: If no id is found and the URL has fewer than two
                ``/``-separated pieces.
        """
        # Function-scope import keeps the helper usable without touching the
        # notebook's import cells.
        from urllib.parse import parse_qs, urlparse

        parsed = urlparse(source_url)

        # Only the path is inspected, so a "/" inside the query string cannot
        # shift the position of the id.
        segments = parsed.path.split("/")
        if "d" in segments:
            marker = segments.index("d")
            following = segments[marker + 1 : marker + 2]
            if following and following[0]:
                return following[0]

        ids = parse_qs(parsed.query).get("id")
        if ids:
            return ids[0]

        return source_url.split("/")[-2]

    def download_data(self):
        """Download the archive referenced by the configuration.

        Creates ``root_dir`` if needed, builds the download URL from
        ``prefix`` plus the file id parsed out of ``source_url``, and asks
        gdown to write the result to ``local_data_file``.

        Raises:
            RuntimeError: If gdown returns ``None`` instead of the output
                path, which is treated as a failed download.
            Exception: Any other error is logged and re-raised unchanged.
        """
        try:
            root_dir = self.config.root_dir
            source_url = self.config.source_url
            local_data_file = self.config.local_data_file
            os.makedirs(root_dir, exist_ok=True)
            logger.info(f"Downloading data from {source_url} into file {local_data_file}")

            file_id = self._extract_file_id(source_url)
            prefix = self.config.prefix
            downloaded = gdown.download(f"{prefix}{file_id}", local_data_file)
            # A None result is treated as failure so that an unsuccessful
            # download is never logged as a success below.
            if downloaded is None:
                raise RuntimeError(
                    f"gdown could not download {source_url} into file {local_data_file}"
                )
            logger.info(f"Downloaded data from {source_url} into file {local_data_file}")

        except Exception as e:
            logger.error(f"Failed to download data: {e}")
            # Bare raise re-raises the active exception as-is.
            raise

    def extract_zip_file(self):
        """Extract the downloaded archive into ``unzip_dir``.

        The target directory is created when it does not exist yet.
        """
        unzip_dir = self.config.unzip_dir
        os.makedirs(unzip_dir, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, "r") as f:
            f.extractall(unzip_dir)
            logger.info(f"Extracted data from {self.config.local_data_file} into {unzip_dir}")
        

In [5]:
# Run the ingestion stage end to end: load the configuration, download the
# archive, then extract it. Any failure is logged here and re-raised so the
# cell still ends with the original error.
try:
    configuration_manager = ConfigurationManager()
    data_ingestion_config = configuration_manager.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_data()
    data_ingestion.extract_zip_file()
    logger.info("Data ingestion completed")
except Exception as e:
    logger.error(f"Data ingestion failed: {e}")
    # Bare raise re-raises the active exception as-is.
    raise

[ 2024-02-20 23:51:07,734 ] 34 common cnn_classifier -  INFO - Loaded YAML file successfully from: config/config.yaml
[ 2024-02-20 23:51:07,739 ] 34 common cnn_classifier -  INFO - Loaded YAML file successfully from: params.yaml
[ 2024-02-20 23:51:07,742 ] 55 common cnn_classifier -  INFO - Created directory at: artifacts
[ 2024-02-20 23:51:07,744 ] 55 common cnn_classifier -  INFO - Created directory at: artifacts/data_ingestion
[ 2024-02-20 23:51:07,745 ] 18 3371331221 cnn_classifier -  INFO - Downloading data from https://drive.google.com/file/d/1z0mreUtRmR-P-magILsDR3T7M6IkGXtY/view?usp=sharing into file artifacts/data_ingestion/data.zip


Downloading...
From (original): https://drive.google.com/uc?/export=download&id=1z0mreUtRmR-P-magILsDR3T7M6IkGXtY
From (redirected): https://drive.google.com/uc?%2Fexport=download&id=1z0mreUtRmR-P-magILsDR3T7M6IkGXtY&confirm=t&uuid=8e57a254-595a-4c0a-81cc-98ac7b9ebb18
To: /Users/wilsvenleong/Downloads/learning-materials/mlops-with-mlflow-dvc/artifacts/data_ingestion/data.zip
100%|██████████| 49.0M/49.0M [00:07<00:00, 6.19MB/s]

[ 2024-02-20 23:51:17,120 ] 23 3371331221 cnn_classifier -  INFO - Downloaded data from https://drive.google.com/file/d/1z0mreUtRmR-P-magILsDR3T7M6IkGXtY/view?usp=sharing into file artifacts/data_ingestion/data.zip





[ 2024-02-20 23:51:17,872 ] 34 3371331221 cnn_classifier -  INFO - Extracted data from artifacts/data_ingestion/data.zip into artifacts/data_ingestion
[ 2024-02-20 23:51:17,874 ] 7 86133812 cnn_classifier -  INFO - Data ingestion completed
