In [1]:
import os

In [2]:
%pwd

'/home/vishal/Vishal/Chicken_Disease_Detection/research'

In [3]:
# The notebook is started from the project's research/ folder; move up to the
# project root so relative config and artifact paths resolve. The guard makes
# the cell idempotent: re-running it does not climb a further directory level.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [19]:
%pwd

'/home/vishal/Vishal/Chicken_Disease_Detection'

In [35]:
# Entity: the typed config object returned by ConfigurationManager.get_data_ingestion_config()
from dataclasses import dataclass

@dataclass
class DataIngestionConfig:
    """Settings for the data-ingestion stage.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``
    from the ``data_ingestion`` section of the config file and are consumed by
    ``DataIngestion``.
    """

    # Directory for this stage's artifacts (created by the configuration manager).
    root_dir: str
    # Dataset identifier appended to the Kaggle download URL.
    kaggle_dataset: str
    # Path the downloaded archive is written to.
    local_data_file: str
    # Directory the archive is extracted into.
    unzip_dir: str

In [36]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories

In [37]:
import yaml
from pathlib import Path

class ConfigurationManager:
    """Load the project YAML config and hand out per-stage config objects.

    The parsed file is kept on ``self.config`` as the plain mapping returned by
    ``yaml.safe_load``, so every lookup uses key subscripting.
    """

    def __init__(self, config_path=CONFIG_FILE_PATH):
        """Read the config file and create the top-level artifacts directory.

        Args:
            config_path: Location of the YAML config file.

        Raises:
            KeyError: If the config has no ``artifacts_root`` key.
        """
        self.config = self.read_yaml(config_path)
        # read_yaml() returns a dict, which has no attribute access;
        # subscript it like the rest of this class does.
        create_directories([self.config["artifacts_root"]])

    def read_yaml(self, file_path: Path) -> dict:
        """Parse ``file_path`` with ``yaml.safe_load`` and return the result."""
        with open(file_path, "r") as file:
            return yaml.safe_load(file)

    def get_data_ingestion_config(self) -> "DataIngestionConfig":
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataIngestionConfig`` populated from the config file.

        Raises:
            KeyError: If the section or one of its expected keys is missing.
        """
        section = self.config["data_ingestion"]

        # The download target lives under root_dir, so it must exist first.
        create_directories([section["root_dir"]])

        return DataIngestionConfig(
            root_dir=section["root_dir"],
            kaggle_dataset=section["kaggle_dataset"],
            local_data_file=section["local_data_file"],
            unzip_dir=section["unzip_dir"],
        )




In [38]:
import os
import urllib.request as request
import zipfile
from cnnClassifier import logger
from cnnClassifier.utils.common import get_size

In [41]:
import kaggle
import os
import requests
from tqdm import tqdm
import zipfile
import logging
from pathlib import Path
from kaggle.api.kaggle_api_extended import KaggleApi

logger = logging.getLogger(__name__)


class DataIngestion:
    """Download a Kaggle dataset archive and unpack it locally.

    Paths and the dataset identifier come from the ``DataIngestionConfig``
    passed to the constructor.
    """

    def __init__(self, config: "DataIngestionConfig"):
        """Store the config and authenticate a Kaggle API client.

        Args:
            config: Data-ingestion settings (paths and dataset identifier).
        """
        self.config = config
        # NOTE(review): this authenticated client is created but download_file()
        # streams over plain `requests` without it - confirm whether datasets
        # that require credentials need to be supported.
        self.api = KaggleApi()
        self.api.authenticate()

    def download_file(self, chunk_size: int = 1024 * 1024, timeout: float = 60):
        """Download the dataset archive to ``config.local_data_file``.

        Does nothing except log the size when the target file already exists.
        Otherwise the archive is streamed into ``<target>.part`` and moved into
        place only after the whole body has been written, so an interrupted or
        failed download never leaves a truncated file that a later run would
        mistake for a finished one.

        Args:
            chunk_size: Number of bytes requested per streamed chunk.
            timeout: Seconds passed to ``requests.get`` as its timeout.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        target = self.config.local_data_file
        if os.path.exists(target):
            logger.info(f"File already exists of size: {self.get_size(target)}")
            return

        url = f"https://www.kaggle.com/api/v1/datasets/download/{self.config.kaggle_dataset}"
        partial = f"{target}.part"
        try:
            with requests.get(url, stream=True, timeout=timeout) as response:
                # Fail loudly instead of saving an error page as the archive.
                response.raise_for_status()

                # A missing content-length yields None, i.e. a bar without a total.
                total_size_in_bytes = int(response.headers.get('content-length', 0)) or None

                with open(partial, 'wb') as file, tqdm(
                    desc="Downloading",
                    total=total_size_in_bytes,
                    unit='iB',
                    unit_scale=True,
                    unit_divisor=1024,
                ) as bar:
                    for data in response.iter_content(chunk_size):
                        file.write(data)
                        bar.update(len(data))

            os.replace(partial, target)
        finally:
            # Only present here if the download did not complete.
            if os.path.exists(partial):
                os.remove(partial)

        logger.info(f"File downloaded to: {self.config.local_data_file}")

    def get_size(self, file_path):
        """Return the size of ``file_path`` as a string like ``"12.34 MB"``."""
        size = os.path.getsize(file_path)
        return f"{size / (1024 * 1024):.2f} MB"

    def extract_zip_file(self):
        """
        Extracts the zip file into the data directory with a progress bar.

        The target directory ``config.unzip_dir`` is created if needed.
        Problems (wrong extension, missing file, invalid archive, any other
        extraction error) are logged and the method returns without raising.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)

        # str() so the suffix check also works when the path is a Path object.
        archive = str(self.config.local_data_file)

        if not archive.endswith('.zip'):
            logger.error("The downloaded file is not a ZIP file. Cannot extract.")
            return

        if not os.path.exists(archive):
            logger.error("ZIP file not found. Cannot extract.")
            return

        try:
            with zipfile.ZipFile(archive, 'r') as zip_ref:
                members = zip_ref.infolist()

                # Extract member by member so tqdm can show progress.
                for member in tqdm(members, desc="Extracting", total=len(members)):
                    zip_ref.extract(member, unzip_path)

            logger.info(f"ZIP file extracted successfully to: {unzip_path}")

        except zipfile.BadZipFile:
            logger.error("The file is not a valid ZIP file.")
        except Exception as e:
            logger.error(f"An error occurred while extracting the ZIP file: {str(e)}")

In [42]:
def main():
    """Run the data-ingestion stage: load config, download, then extract.

    Raises:
        Exception: Any error from configuration loading or ingestion is logged
            with its traceback and then re-raised unchanged.
    """
    try:
        # Initialize Configuration Manager
        config_manager = ConfigurationManager(config_path=CONFIG_FILE_PATH)

        # Get Data Ingestion Configuration
        data_ingestion_config = config_manager.get_data_ingestion_config()

        # Create DataIngestion object and download/extract data
        data_ingestion = DataIngestion(config=data_ingestion_config)
        data_ingestion.download_file()
        data_ingestion.extract_zip_file()

    except Exception:
        # Record the failure, then re-raise with a bare `raise` so the
        # original traceback is kept as-is.
        logger.exception("Data ingestion stage failed")
        raise


if __name__ == "__main__":
    main()


Downloading: 100%|██████████| 7.90G/7.90G [29:47<00:00, 4.74MiB/s]  

[2024-10-25 16:17:47,128: INFO: 980294484: File downloaded to: artifacts/data_ingestion/data.zip]



Extracting: 100%|██████████| 6812/6812 [06:56<00:00, 16.34it/s]
