The main purpose of this notebook is to develop and test individual pieces of the code, such as DataLoaders, Models, etc. Once they are ready, I can move them into the appropriate files and make them work in the pipeline.

We will read the data from the directory and make it available for the next step in the process.

In [5]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset

In [None]:
class CustomImageDataset(Dataset):
    '''
    Map-style dataset over an image folder laid out as
    ``root_dir/<class_name>/<image file>``.

    The two essential elements of this class are:
        1. __init__ : defines how the directory is scanned.
        2. __getitem__ : defines how one (image, label) sample is loaded.

    Attributes:
        root_dir: Directory that contains one sub-directory per class.
        transform: Optional callable applied to the loaded PIL image.
        classes: Sorted list of class (sub-directory) names.
        class_to_idx: Mapping from class name to its integer label.
        image_paths: Path of every image found, in label order.
        labels: Integer label of each entry in ``image_paths``.
    '''
    def __init__(self, root_dir, transform=None):
        '''
        Scan ``root_dir`` and index every image file by class.

        Args:
            root_dir: Directory whose immediate sub-directories are the classes.
            transform: Optional callable applied to each image in __getitem__.
        '''
        self.root_dir = root_dir
        self.transform = transform

        # os.listdir returns entries in arbitrary order, so sort to keep the
        # class -> label mapping reproducible. Hidden entries and plain files
        # (e.g. .DS_Store, .ipynb_checkpoints) are not classes.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if not entry.startswith('.')
            and os.path.isdir(os.path.join(root_dir, entry))
        )
        self.class_to_idx = {name: idx for idx, name in enumerate(self.classes)}
        self.image_paths = []
        self.labels = []

        # Walk each class directory and record its files with the class label.
        for class_name, label in self.class_to_idx.items():
            class_path = os.path.join(root_dir, class_name)
            for img_name in sorted(os.listdir(class_path)):
                img_path = os.path.join(class_path, img_name)
                # Skip hidden files and nested directories; only regular
                # files are treated as samples.
                if img_name.startswith('.') or not os.path.isfile(img_path):
                    continue
                self.image_paths.append(img_path)
                self.labels.append(label)

    def __len__(self):
        '''Return the number of indexed images.'''
        return len(self.image_paths)

    def __getitem__(self, idx):
        '''
        Load one sample.

        Args:
            idx: Position of the sample in ``image_paths`` / ``labels``.

        Returns:
            Tuple ``(image, label)``; ``image`` is an RGB PIL image, or the
            result of ``transform(image)`` when a transform was given.
        '''
        img_path = self.image_paths[idx]
        # "RGB" is a pixel mode; "JPEG" is a file format and is rejected by
        # Image.convert. The context manager closes the file once the pixel
        # data has been copied into the converted image.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label

In [20]:
import kagglehub
import kaggle

# Download the dataset and store it in the required place.
# The target is given relative to the working directory so the cell is not
# tied to one machine's home directory.
# NOTE(review): assumes the notebook runs from the project root - confirm.
dataset_handle = 'alxmamaev/flowers-recognition'
download_path = os.path.join('artifacts', 'data_ingestion')
os.makedirs(download_path, exist_ok=True)

kaggle.api.dataset_download_files(dataset_handle, path=download_path, unzip=True)
print(f"Dataset '{dataset_handle}' downloaded and unzipped to '{download_path}'")

Dataset URL: https://www.kaggle.com/datasets/alxmamaev/flowers-recognition


KeyboardInterrupt: 

This is the dataclass we will be using to download the data into the correct folders.

In [21]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings for the data-ingestion step."""

    # Directory the dataset is downloaded into.
    root_dir: Path
    # Kaggle dataset handle that is passed to the download call.
    root_dataset_name: str
    # NOTE(review): not read by the code in this notebook; presumably the
    # path of the downloaded archive - confirm against config.yaml.
    local_data_file: Path

Now, we will write a class that reads config.yaml and params.yaml and creates the required directories.

In [22]:
from src.cnn_classifier.constants import *
from src.cnn_classifier.utils.common import read_yaml, create_directories

In [23]:
class ConfigurationManager:
    """Reads the YAML config and params files and builds per-stage config objects."""

    def __init__(self, config_path=CONFIG_FILE_PATH, params_path=PARAMS_FILE_PATH):
        """
        Load both YAML files and create the artifacts root directory.

        Args:
            config_path: Location of the config YAML (defaults to CONFIG_FILE_PATH).
            params_path: Location of the params YAML (defaults to PARAMS_FILE_PATH).
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)

        # Make sure the top-level artifacts folder exists before any stage runs.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the data-ingestion settings from the ``data_ingestion`` section
        of the loaded config, creating its root directory along the way.

        Returns:
            A DataIngestionConfig populated from that section.
        """
        ingestion_section = self.config.data_ingestion
        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            root_dataset_name=ingestion_section.root_dataset_name,
            local_data_file=ingestion_section.local_data_file,
        )


In [24]:
from src.cnn_classifier.utils.logger import logger
import kagglehub
import kaggle


In [25]:
class DataIngestion:
    """Downloads the configured Kaggle dataset into the configured directory."""

    def __init__(self, config: "DataIngestionConfig"):
        """
        Args:
            config: Settings object providing ``root_dataset_name`` and ``root_dir``.
        """
        self.config = config

    def download_file(self) -> str:
        """
        Download and unzip the dataset named in the config.

        The target directory is taken from ``config.root_dir`` and is created
        if it does not exist yet. Errors from the Kaggle client propagate to
        the caller unchanged.

        Returns:
            The directory the dataset was downloaded into, as a string.
        """
        root_dataset_name = self.config.root_dataset_name
        root_dir = self.config.root_dir
        # Create the configured directory itself rather than a fixed path, so
        # the folder that exists is always the one the download writes to.
        os.makedirs(root_dir, exist_ok=True)
        logger.info(f'Downloading the dataset {root_dataset_name} into directory {root_dir}')

        kaggle.api.dataset_download_files(root_dataset_name, path=root_dir, unzip=True)
        logger.info(f'Dataset Downloaded into {root_dir}')
        return str(root_dir)

In [26]:
# Run the data-ingestion stage: load the configuration, build the ingestion
# settings, then download the dataset.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
except Exception:
    # Record the failure (with traceback) in the project log, then re-raise
    # the same exception so the cell still fails visibly.
    logger.exception('Data ingestion stage failed')
    raise

[ 2026-02-20 19:20:10,040 : INFO : common : yaml file config/config.yaml loaded successfully ]
[ 2026-02-20 19:20:10,042 : INFO : common : yaml file params.yaml loaded successfully ]
[ 2026-02-20 19:20:10,043 : INFO : common : created directory at: artifacts ]
[ 2026-02-20 19:20:10,044 : INFO : common : created directory at: artifacts/data_ingestion ]
[ 2026-02-20 19:20:10,044 : INFO : 1938957132 : Downloading the dataset alxmamaev/flowers-recognition into directory artifacts/data_ingestion ]
Dataset URL: https://www.kaggle.com/datasets/alxmamaev/flowers-recognition
[ 2026-02-20 19:20:15,978 : INFO : 1938957132 : Dataset Downloaded into artifacts/data_ingestion ]
