# Environment Setup

In [1]:
import os
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()

# Switch to the project's main directory so data paths can be given relative to it
working_directory = os.getenv("WORKING_DIRECTORY")
if not working_directory:
    # os.chdir(None) would fail with an opaque TypeError; name the real cause instead
    raise RuntimeError(
        "WORKING_DIRECTORY is not set; define it in .env or in the environment."
    )
os.chdir(working_directory)

# Confirm the new working directory
%pwd

'/workspaces/TumorTracer'

In [2]:
from pathlib import Path

# Verify the directory change: report whether a .git folder exists in the current directory
print("Git folder exists:", Path(".git").exists())

Git folder exists: True


# 1. Data Ingestion

In [3]:
from dataclasses import dataclass
from pathlib import Path
from cnnClassifier import get_logger

# Logger obtained from the cnnClassifier package; used by the classes and cells below
logger = get_logger()

@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Read-only bundle of the paths and dataset identifier that the
    data ingestion stage works with.

    Attributes:
    - root_dir (Path): Base directory for all ingestion outputs.
    - kaggle_dataset (str): Kaggle dataset identifier in "owner/dataset" form.
    - download_zip (Path): Path where the downloaded ZIP file will be saved.
    - extracted_file (Path): Path where the final extracted file will be stored.
    """
    root_dir: Path
    kaggle_dataset: str
    download_zip: Path
    extracted_file: Path

In [4]:
from cnnClassifier.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from cnnClassifier.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """
    Loads the project's YAML files and builds structured, per-stage
    configuration objects from them.
    """

    def __init__(self, config_file_path=CONFIG_FILE_PATH, params_file_path=PARAMS_FILE_PATH) -> None:
        """
        Reads configuration files (config.yaml and params.yaml),
        ensures necessary directories exist, and prepares structured config objects.

        Parameters:
        - config_file_path: Location of the configuration YAML file.
        - params_file_path: Location of the parameters YAML file.

        Raises:
        - FileNotFoundError: If either YAML file does not exist.
        """
        # Fail fast with a clear message before attempting to parse anything.
        # Each file gets its own existence check.
        self._require_file(config_file_path, "Config")
        self.config = read_yaml(config_file_path)

        self._require_file(params_file_path, "Params")
        self.params = read_yaml(params_file_path)

        logger.info(f"Loading configuration from {config_file_path} and parameters from {params_file_path}")

        # Create the root artifacts directory (if not already present)
        create_directories([self.config.artifacts_root])

    @staticmethod
    def _require_file(file_path, label: str) -> None:
        """
        Logs and raises FileNotFoundError when file_path does not exist.

        Parameters:
        - file_path: Path (or path string) to check.
        - label (str): Human-readable file kind used in the error message.
        """
        # A Path object is always truthy, so existence must be tested explicitly
        if not Path(file_path).exists():
            message = f"{label} file not found at: {file_path}"
            logger.error(message)
            raise FileNotFoundError(message)

    def get_ingestion_config(self) -> DataIngestionConfig:
        """
        Creates and returns a DataIngestionConfig object with paths defined
        for downloading and extracting the dataset.

        Returns:
        - DataIngestionConfig: Structured config object for ingestion stage.
        """
        config = self.config.data_ingestion

        # Ensure the data_ingestion directory exists
        create_directories([config.root_dir])

        # Build and return a structured configuration object for ingestion
        ingestion_config = DataIngestionConfig(
            root_dir=Path(config.root_dir),
            kaggle_dataset=config.kaggle_dataset,
            download_zip=Path(config.download_zip),
            extracted_file=Path(config.extracted_file),
        )

        logger.info(f"DataIngestionConfig created with: {ingestion_config}")

        return ingestion_config

In [5]:
import os
import json
from cnnClassifier.utils.common import create_directories, save_json

def setup_kaggle_auth_from_secret(secret_env_var: str = "KAGGLE_JSON") -> None:
    """
    Sets up Kaggle API authentication using a secret stored in an environment variable.

    The secret is parsed as JSON, written to ~/.kaggle/kaggle.json with
    owner-only permissions, and KAGGLE_CONFIG_DIR is pointed at that directory.

    Parameters:
    - secret_env_var (str): The name of the environment variable that contains
                            the Kaggle credentials as a JSON string.

    Raises:
    - ValueError: If the environment variable is missing, contains invalid JSON,
                  or contains JSON that is not an object.
    - Exception: Errors from creating the directory, writing the file or
                 changing its permissions propagate unchanged.
    """
    # Read from environment variable (injected from Codespaces secret)
    kaggle_json_str = os.getenv(secret_env_var)

    if kaggle_json_str is None:
        raise ValueError(f"{secret_env_var} secret not found.")

    try:
        # Validate it's a proper JSON
        kaggle_json_data = json.loads(kaggle_json_str)
    except json.JSONDecodeError as exception:
        # Chain explicitly so the parse error is recorded as the cause
        raise ValueError(f"{secret_env_var} does not contain valid JSON: {exception}") from exception

    # Credentials are key/value pairs; reject valid JSON of any other shape
    # (list, string, number) before it is written to disk.
    if not isinstance(kaggle_json_data, dict):
        raise ValueError(
            f"{secret_env_var} must contain a JSON object, got {type(kaggle_json_data).__name__}."
        )

    # Setting directory path
    kaggle_dir = Path.home() / ".kaggle"
    kaggle_json_path = kaggle_dir / "kaggle.json"

    create_directories([kaggle_dir])
    save_json(kaggle_json_path, kaggle_json_data)

    # Restrict the credentials file to owner read/write.
    # NOTE(review): the file exists with save_json's default mode until this
    # line runs — confirm that is acceptable for this environment.
    os.chmod(kaggle_json_path, 0o600)

    # Set the environment variable explicitly for kaggle to pick up
    os.environ["KAGGLE_CONFIG_DIR"] = str(kaggle_dir)
    

from kaggle.api.kaggle_api_extended import KaggleApi

# Write the Kaggle credentials from the secret to disk before creating the API client
setup_kaggle_auth_from_secret()

# Create a Kaggle API client and authenticate it
api = KaggleApi()
api.authenticate()

[2025-06-30 19:48:38,827: INFO: common: Directory: /home/codespace/.kaggle created successfully.]
[2025-06-30 19:48:38,828: INFO: common: Directory: /home/codespace/.kaggle created successfully.]
[2025-06-30 19:48:38,829: INFO: common: JSON file saved at: /home/codespace/.kaggle/kaggle.json]


In [None]:
import kaggle
import zipfile

class DataIngestion:
    """
    Downloads a Kaggle dataset archive and extracts it, using the paths
    and dataset identifier held in a DataIngestionConfig.
    """

    def __init__(self, config: DataIngestionConfig):
        """
        Parameters:
        - config (DataIngestionConfig): Paths and dataset identifier for this stage.
        """
        self.config = config

    def download_files(self) -> None:
        """
        Downloads dataset from Kaggle using kaggle API.

        The dataset is requested with unzip=False and config.root_dir as the
        destination path.

        Raises:
        - Exception: Any error raised by the download is logged and re-raised.
        """
        try:
            kaggle.api.dataset_download_files(
                dataset=self.config.kaggle_dataset,
                path=self.config.root_dir,
                unzip=False
            )

            logger.info(f"Successfully downloaded dataset {self.config.kaggle_dataset} at: {self.config.root_dir}")

        except Exception as exception_error:
            logger.error(f"Unexpected error file downloading dataset: {exception_error}")
            # Bare raise re-raises the active exception unchanged
            raise

    def extract_files(self) -> None:
        """
        Extracts the downloaded ZIP file (config.download_zip) into config.root_dir.

        A success message is logged only when config.extracted_file exists
        afterwards; otherwise a warning is logged.

        Raises:
        - zipfile.BadZipFile: If the archive is not a valid ZIP file.
        - Exception: Any other extraction error is logged and re-raised.
        """
        try:
            with zipfile.ZipFile(self.config.download_zip, "r") as zip_ref:
                zip_ref.extractall(self.config.root_dir)

            # Only report success at extracted_file when that path is actually there
            if self.config.extracted_file.exists():
                logger.info(f"Successfully extracted dataset {self.config.kaggle_dataset} at: {self.config.extracted_file}")
            else:
                logger.warning(f"Expected file not found after extraction: {self.config.extracted_file}")

        except zipfile.BadZipFile:
            # A corrupt archive means ingestion failed: propagate it like every
            # other extraction error instead of continuing silently.
            logger.error(f"Invalid zip file format: {self.config.download_zip}")
            raise

        except Exception as exception_error:
            logger.error(f"Unexpected error file unziping dataset: {exception_error}")
            raise

In [None]:
# Run the ingestion stage end to end: build the config, download the archive, extract it
try:
    config_manager = ConfigurationManager()
    ingestion_config = config_manager.get_ingestion_config()

    data_ingestor = DataIngestion(config=ingestion_config)
    data_ingestor.download_files()
    data_ingestor.extract_files()

except Exception as exception:
    # Log the failure, then re-raise so the cell still fails visibly
    logger.exception(f"Unexpected error during data ingestion pipeline: {exception}")
    raise exception

[2025-06-30 19:48:38,853: INFO: common: YAML file: config/config.yaml loaded successfully]
[2025-06-30 19:48:38,855: INFO: common: YAML file: params.yaml loaded successfully]
[2025-06-30 19:48:38,856: INFO: 485166113: Loading configuration from config/config.yaml and parameters from params.yaml]
[2025-06-30 19:48:38,857: INFO: common: Directory: artifacts created successfully.]
[2025-06-30 19:48:38,858: INFO: common: Directory: artifacts/data_ingestion created successfully.]
[2025-06-30 19:48:38,859: INFO: 485166113: DataIngestionConfig created with: DataIngestionConfig(root_dir=PosixPath('artifacts/data_ingestion'), kaggle_dataset='mohamedhanyyy/chest-ctscan-images', download_zip=PosixPath('artifacts/data_ingestion/chest-ctscan-images.zip'), extracted_file=PosixPath('artifacts/data_ingestion/Data'))]
Dataset URL: https://www.kaggle.com/datasets/mohamedhanyyy/chest-ctscan-images
[2025-06-30 19:48:48,952: INFO: 2954299612: Successfully downloaded dataset mohamedhanyyy/chest-ctscan-image