In [1]:
import os

In [2]:
%pwd

'd:\\Artificial Intelligence and Machine Learning\\End2End Projects\\Paddy Doctor - Paddy Disease Classification\\paddy-doctor\\research'

In [3]:
os.chdir("../")
%pwd

'd:\\Artificial Intelligence and Machine Learning\\End2End Projects\\Paddy Doctor - Paddy Disease Classification\\paddy-doctor'

In [4]:
## Update Entity for Data Ingestion
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings used by the data-ingestion step.

    Instances are frozen, so attributes cannot be reassigned after creation.
    """

    # Working directory of the ingestion step; created by the configuration manager.
    root_dir: Path
    # Kaggle competition identifier; also the base name of the downloaded zip archive.
    dataset_name: str
    # Location the archive is downloaded to and extracted into (file names are joined onto it).
    local_data_file: Path
    # Destination of the class-weight JSON written after ingestion.
    class_weight: Path


In [5]:
## Update configuration manager
from src.paddydoctor.constants import *
from src.paddydoctor.utils.common import *


class ConfigurationManager:
    """Load the project's YAML configuration and build per-step config entities.

    The constructor reads both the config and params files and hands the
    configured ``artifacts_root`` to ``create_directories_files``.
    """

    def __init__(self, config_filepath=CONFIG_FILEPATH, params_filepath=PARAMS_FILEPATH):
        """Read the YAML files and prepare the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories_files([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        The section's ``root_dir`` is passed to ``create_directories_files``
        before the entity is constructed.

        NOTE(review): values are forwarded exactly as read from the YAML,
        although the entity annotates three of them as ``Path`` — confirm
        whether a conversion is wanted.

        Returns:
            A ``DataIngestionConfig`` populated from the configuration file.
        """
        section = self.config.data_ingestion
        create_directories_files([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            dataset_name=section.dataset_name,
            local_data_file=section.local_data_file,
            class_weight=section.class_weight,
        )
        

In [6]:
## Update Components
import os
from kaggle.api.kaggle_api_extended import KaggleApi
from paddydoctor.logging import logger
import zipfile
from src.paddydoctor.utils.common import *

In [7]:
class DataIngestion:
    """Download a Kaggle competition archive and unpack it locally.

    A ``KaggleApi`` client is created and authenticated as soon as the object
    is constructed, even if no download turns out to be necessary.
    """

    def __init__(self, config: DataIngestionConfig):
        """Keep ``config`` and authenticate a Kaggle API client.

        Args:
            config: Ingestion settings; this class reads ``dataset_name`` and
                ``local_data_file`` from it.
        """
        self.config = config
        self.api = KaggleApi()
        self.api.authenticate()

    def download_files(self):
        """Download and extract the competition data unless it is already present.

        A download happens only when ``train.csv`` is missing from the target
        location *and* ``get_size`` of its ``train_images`` entry is below 100
        (NOTE(review): the unit returned by ``get_size`` is not visible here).
        After a successful extraction the zip archive is deleted.

        NOTE(review): if the downloaded file is not a valid zip, an error is
        logged and the method returns normally; nothing is raised.
        """
        target_dir = self.config.local_data_file
        competition = self.config.dataset_name

        # ``and`` short-circuits, so get_size is only consulted when train.csv is absent.
        download_needed = (
            not os.path.exists(os.path.join(target_dir, "train.csv"))
            and get_size(os.path.join(target_dir, "train_images")) < 100
        )
        if not download_needed:
            logger.info("File already Downloaded")
            return

        # Download the competition archive into the target location.
        logger.info(f"Downloading Dataset: {competition} to directory: {target_dir}")
        self.api.competition_download_files(competition=competition, path=target_dir)
        logger.info("Files Downloaded Successfully")

        # The archive path is built from the competition name.
        archive_path = os.path.join(target_dir, f"{competition}.zip")
        if not zipfile.is_zipfile(archive_path):
            logger.error(f"{archive_path} is not a valid zip file")
            return

        with zipfile.ZipFile(archive_path, "r") as archive:
            archive.extractall(path=target_dir)
            logger.info(f"Files Successfully unzipped to {target_dir}")
        # Remove the archive once its contents are on disk.
        os.remove(archive_path)

In [8]:
# Run the ingestion step, then derive one weight per class from the extracted images.
# There is deliberately no try/except wrapper: catching an exception only to re-raise
# it unchanged adds nothing but extra traceback frames.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config = data_ingestion_config)
data_ingestion.download_files()

# Every sub-directory of train_images is treated as one class.
to_check = os.path.join(data_ingestion_config.local_data_file, "train_images")

# os.listdir returns entries in arbitrary order, so sort for a reproducible JSON file.
# Plain files (e.g. OS metadata) are skipped: listing them as directories would fail.
classes = sorted(c for c in os.listdir(to_check) if os.path.isdir(os.path.join(to_check, c)))
paths = [os.path.join(to_check, c) for c in classes]
lengths = [(c, len(os.listdir(p))) for p, c in zip(paths, classes)]

# A class with no images has no finite weight; fail with a clear message instead of
# a bare ZeroDivisionError.
empty_classes = [c for c, count in lengths if count == 0]
if empty_classes:
    raise ValueError(f"Cannot compute class weights; no images found for classes: {empty_classes}")

# weight = total number of images / number of images in the class
total = sum(count for _, count in lengths)
class_weights = {c: total / count for c, count in lengths}
save_json(Path(data_ingestion_config.class_weight), class_weights)

[2024-09-16 10:09:11,955: INFO: common: config\config.yaml loaded successfully]
[2024-09-16 10:09:11,957: INFO: common: params\params.yaml loaded successfully]
[2024-09-16 10:09:11,958: INFO: common: Directories and Files successfully created]
[2024-09-16 10:09:11,959: INFO: common: Directories and Files successfully created]
[2024-09-16 10:09:11,964: INFO: 3364030265: File already Downloaded]
[2024-09-16 10:09:11,976: INFO: common: Data Saved Successfully to artifacts\class_weight.json]
