In [1]:
import os

In [2]:
%pwd

'c:\\Users\\satya\\Documents\\data_science_roadmap\\Deep_Learning\\Project\\Kidney-Disease-Classification-MLFlow-DVC\\research'

In [3]:
# Move from the notebook folder up to the project root so that the `src.*`
# imports and the relative config/artifact paths used below resolve.
# Guarded on the presence of `src` so that re-running this cell does not
# climb one directory further each time.
if not os.path.isdir("src"):
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\satya\\Documents\\data_science_roadmap\\Deep_Learning\\Project\\Kidney-Disease-Classification-MLFlow-DVC'

In [None]:
# An entity is the typed object returned by a configuration getter (here, the data-ingestion config).
from dataclasses import dataclass
from pathlib import Path

# Plain value object for the data-ingestion stage.
# NOTE: a dataclass does not validate the annotated types at runtime; the
# annotations below document intent only.
@dataclass(frozen=True)  # frozen=True: assigning to an attribute after construction raises an error
class DataIngestionConfig:
    """Immutable bundle of locations used by the data-ingestion stage.

    Attributes:
        root_dir: Directory that holds the data-ingestion artifacts.
        source_URL: URL the dataset archive is downloaded from.
        local_data_file: Path the downloaded archive is written to.
        unzip_dir: Directory the archive is extracted into.
    """

    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

In [27]:
from src.cnnClassifier.constants import *  #IMPORTING THE TWO CONSTANTS PATH OF CONFIG AND PARAMS FROM CONSTANTS

from src.cnnClassifier.utils.common import read_yaml, create_directories #Importing from utils the common functions like read_yaml and creat_directories.


In [None]:
# Configuration manager: reads the YAML config/params files and builds the typed entity objects (here, the data-ingestion paths).
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config entities.

    Attributes:
        config: Parsed contents of the config file.
        params: Parsed contents of the params file.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the config YAML file.
            params_filepath: Location of the params YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Attribute-style access works only because the object returned by
        # read_yaml supports it (presumably a ConfigBox - confirm in utils).
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion entity from the `data_ingestion` section.

        The stage's root directory is created as a side effect. Values are
        passed through exactly as read from the config file; no type
        conversion or validation happens here.

        Returns:
            A DataIngestionConfig populated from the config file.
        """
        ingestion_section = self.config.data_ingestion

        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_URL=ingestion_section.source_URL,
            local_data_file=ingestion_section.local_data_file,
            unzip_dir=ingestion_section.unzip_dir,
        )


In [None]:
# Component: performs the actual data ingestion (download and unzip) into the locations configured under artifacts. The main logic of the stage lives here.
import os
import zipfile
import gdown
from src.cnnClassifier.logging.logs import logger
from src.cnnClassifier.utils.common import get_size

In [30]:
class DataIngestion:
    """Downloads the dataset archive from Google Drive and unpacks it.

    All locations are taken from the config object passed to the constructor.
    """

    def __init__(self, config: "DataIngestionConfig"):
        """Store the config produced by the configuration manager.

        Args:
            config: Object exposing `source_URL`, `local_data_file` and
                `unzip_dir` (plus `root_dir`, which is not read here).
        """
        self.config = config

    def download_file(self) -> str:
        """Download the archive at `config.source_URL` to `config.local_data_file`.

        The Drive file id is taken as the second-to-last "/"-separated
        segment of the share URL (the `.../file/d/<id>/view...` form).

        Returns:
            The path of the downloaded archive, as a string.

        Raises:
            RuntimeError: If gdown reports failure by returning None.
        """
        dataset_url = self.config.source_URL
        # gdown documents `output` as a path string; coerce so a Path-typed
        # config value is handled the same way as a plain string.
        zip_download_dir = str(self.config.local_data_file)

        # Create the directory the archive is actually written to, rather
        # than a hard-coded location that may not match the config.
        parent_dir = os.path.dirname(zip_download_dir)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        logger.info(f"Downloading data from {dataset_url} into file {zip_download_dir}")

        file_id = dataset_url.split("/")[-2]
        prefix = 'https://drive.google.com/uc?export=download&id='
        downloaded = gdown.download(prefix + file_id, zip_download_dir)  # source, destination
        # NOTE(review): some gdown releases signal a failed download by
        # returning None instead of raising - confirm for the pinned version.
        if downloaded is None:
            raise RuntimeError(
                f"Failed to download data from {dataset_url} into file {zip_download_dir}"
            )
        logger.info(f"Downloaded data from {dataset_url} into file {zip_download_dir}")
        return zip_download_dir

    def extract_zip_file(self) -> None:
        """Extract `config.local_data_file` into `config.unzip_dir`.

        The destination directory is created if it does not exist.

        Returns:
            None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)
       
       
           
        

In [31]:
# Pipeline- To run the main functions of the components in order
# Run the ingestion steps in order. Any exception propagates unchanged, so no
# try/except wrapper that merely re-raises is needed.
config = ConfigurationManager()  # loads the YAML settings and creates the artifacts root
data_ingestion_config = config.get_data_ingestion_config()  # paths/URL for this stage
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
data_ingestion.extract_zip_file()

[2025-03-25 11:20:01,759: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-25 11:20:01,761: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-25 11:20:01,762: INFO: common: created directory at: artifacts]
[2025-03-25 11:20:01,764: INFO: common: created directory at: artifacts/data_ingestion]
[2025-03-25 11:20:01,765: INFO: 1189576724: Downloading data from https://drive.google.com/file/d/1_K7AfIg4ynhyhrggex0QMBM3M6yjJS3C/view?usp=sharing into file artifacts/data_ingestion/data.zip]


Downloading...
From (original): https://drive.google.com/uc?export=download&id=1_K7AfIg4ynhyhrggex0QMBM3M6yjJS3C
From (redirected): https://drive.google.com/uc?export=download&id=1_K7AfIg4ynhyhrggex0QMBM3M6yjJS3C&confirm=t&uuid=0947f6e2-c93f-4359-bb9e-668c2b87553e
To: c:\Users\satya\Documents\data_science_roadmap\Deep_Learning\Project\Kidney-Disease-Classification-MLFlow-DVC\artifacts\data_ingestion\data.zip
100%|██████████| 687M/687M [00:25<00:00, 27.2MB/s] 

[2025-03-25 11:20:30,858: INFO: 1189576724: Downloaded data from https://drive.google.com/file/d/1_K7AfIg4ynhyhrggex0QMBM3M6yjJS3C/view?usp=sharing into file artifacts/data_ingestion/data.zip]



