In [6]:
# --- Imports & safe project path setup ---
import sys, os, zipfile, urllib.request
from pathlib import Path
from dataclasses import dataclass
import yaml

# Locate the project root: either the parent of a 'research' working directory
# that has a sibling 'src' folder, or the nearest of the first six directories
# (CWD and its ancestors) that contains both 'src' and 'config'.
CWD = Path.cwd()
_launched_from_research = CWD.name.lower() == "research" and (CWD.parent / "src").exists()

if _launched_from_research:
    PROJECT_ROOT = CWD.parent
else:
    # CWD itself plus at most five ancestors, nearest first.
    _candidates = [CWD, *CWD.parents][:6]
    PROJECT_ROOT = next(
        (_c for _c in _candidates if (_c / "src").exists() and (_c / "config").exists()),
        None,
    )
    if PROJECT_ROOT is None:
        raise RuntimeError("Could not locate project root with 'src' and 'config' folders.")

# Put <root>/src at the front of sys.path (once) so project packages resolve.
SRC_DIR = PROJECT_ROOT / "src"
_src_entry = str(SRC_DIR)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

print("Project root:", PROJECT_ROOT)
print("Src path added:", SRC_DIR)


Project root: c:\Users\yogar\OneDrive\Desktop\MLops\Chicken_Disease_Classification\Chicken_Disease_Classification
Src path added: c:\Users\yogar\OneDrive\Desktop\MLops\Chicken_Disease_Classification\Chicken_Disease_Classification\src


In [7]:
# --- Optional: use the packaged helpers, or define local stand-ins ---
try:
    from cnnClassifier.utils.common import create_directories, read_yaml
except Exception as _e:
    package_available = False
    print("Package import failed (will use fallback):", _e)

    def read_yaml(path_to_yaml: Path):
        """Parse the YAML file at ``path_to_yaml`` and return the loaded object."""
        with open(path_to_yaml, "r", encoding="utf-8") as stream:
            loaded = yaml.safe_load(stream)
        return loaded

    def create_directories(paths):
        """Create every directory in ``paths``, including missing parents.

        Directories that already exist are left as they are.
        """
        for directory in map(Path, paths):
            directory.mkdir(parents=True, exist_ok=True)
else:
    package_available = True

Package import failed (will use fallback): No module named 'box'


In [8]:
# --- Configuration dataclass & loader ---
@dataclass
class DataIngestionConfig:
    """Settings consumed by the data-ingestion step.

    The three path fields are stored exactly as given; nothing is resolved
    or validated at construction time.
    """

    # Directory that holds the ingestion artifacts.
    root_dir: Path
    # URL the dataset archive is downloaded from.
    source_URL: str
    # Location where the downloaded archive is saved.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path

def load_data_ingestion_config(config_path: Path = None) -> DataIngestionConfig:
    """Build a DataIngestionConfig from the 'data_ingestion' section of a YAML file.

    Args:
        config_path: YAML file to read. When None, defaults to
            PROJECT_ROOT / "config" / "config.yaml".

    Returns:
        A DataIngestionConfig whose path fields are wrapped in ``Path``. The
        values are taken verbatim from the file; they are not resolved here.

    Raises:
        KeyError: if the file is empty, has no (or an empty) 'data_ingestion'
            section, or that section lacks a value for a required key.
    """
    if config_path is None:
        config_path = PROJECT_ROOT / "config" / "config.yaml"

    # An empty YAML document loads as None with yaml.safe_load; normalise it to
    # an empty mapping so the failure below is a clear KeyError rather than an
    # AttributeError on None.
    cfg = read_yaml(config_path) or {}
    di = cfg.get("data_ingestion") or {}
    if not di:
        raise KeyError(f"'data_ingestion' section not found in {config_path}")

    # Report every absent/null key at once instead of failing on the first.
    required = ("root_dir", "source_URL", "local_data_file", "unzip_dir")
    missing = [key for key in required if di.get(key) is None]
    if missing:
        raise KeyError(
            f"'data_ingestion' section in {config_path} is missing required key(s): "
            + ", ".join(missing)
        )

    return DataIngestionConfig(
        root_dir=Path(di["root_dir"]),
        source_URL=di["source_URL"],
        local_data_file=Path(di["local_data_file"]),
        unzip_dir=Path(di["unzip_dir"]),
    )

# Load the ingestion settings from the default location
# (PROJECT_ROOT / "config" / "config.yaml").
config = load_data_ingestion_config()
# Bare last expression: the notebook displays the dataclass repr as cell output.
config


DataIngestionConfig(root_dir=WindowsPath('artifacts/data_ingestion'), source_URL='https://github.com/entbappy/Branching-tutorial/raw/master/Chicken-fecal-images.zip', local_data_file=WindowsPath('artifacts/data_ingestion/data.zip'), unzip_dir=WindowsPath('artifacts/data_ingestion'))

In [9]:
# --- DataIngestion implementation ---
class DataIngestion:
    """Download the dataset archive and unpack it.

    Every path taken from the config is joined onto PROJECT_ROOT, so the
    artifacts are written to the same place whichever directory the notebook
    is started from.
    """

    def __init__(self, config: DataIngestionConfig):
        """Store ``config`` and make sure the ingestion root directory exists."""
        self.config = config
        # Anchor root_dir at PROJECT_ROOT like download_file/extract_zip_file do.
        # The bare relative path would be created under the current working
        # directory (e.g. research/artifacts/...) when the notebook is not run
        # from the project root.
        create_directories([PROJECT_ROOT / self.config.root_dir])

    def download_file(self):
        """Download the dataset zip to local_data_file. Skips if already exists.

        Returns:
            Path of the local archive.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises when the download
            fails; in that case no file is left at the target path.
        """
        target = PROJECT_ROOT / self.config.local_data_file
        target.parent.mkdir(parents=True, exist_ok=True)

        if target.exists():
            size = target.stat().st_size
            if size > 0:
                print(f"File already exists at {target} (size={size} bytes). Skipping download.")
                return target

        print("Downloading from:", self.config.source_URL)
        print("Saving to:", target)

        # Fetch into a sibling ".part" file and move it into place only after
        # the download succeeded. Writing straight to `target` would leave a
        # truncated, non-empty file behind on failure, which the size check
        # above would then mistake for a finished download.
        partial = target.with_name(target.name + ".part")
        try:
            urllib.request.urlretrieve(self.config.source_URL, partial)
            partial.replace(target)
        finally:
            # Still present only if the download or the move failed.
            if partial.exists():
                partial.unlink()

        print("Download complete.")
        return target

    def extract_zip_file(self):
        """Extract the downloaded zip file into unzip_dir.

        Returns:
            Path of the directory the archive was extracted into.

        Raises:
            FileNotFoundError: if the archive is not present at local_data_file.
        """
        zip_path = PROJECT_ROOT / self.config.local_data_file
        extract_to = PROJECT_ROOT / self.config.unzip_dir
        extract_to.mkdir(parents=True, exist_ok=True)
        if not zip_path.exists():
            raise FileNotFoundError(f"Expected zip file not found at {zip_path}")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
        print(f"Extracted to: {extract_to.resolve()}")
        return extract_to


In [10]:
# --- Run ingestion (download + extract) ---
# Build the ingestion helper from the loaded config.
ingestion = DataIngestion(config)

# The download is best-effort: on failure, report it and still attempt the
# extraction, which works if the archive is already on disk.
try:
    ingestion.download_file()
except Exception as download_error:
    print(
        "Download step failed (possibly due to no internet). Continuing to extraction if zip already exists.\n",
        download_error,
    )

# Last expression of the cell: the extraction directory is shown as output.
ingestion.extract_zip_file()

File already exists at c:\Users\yogar\OneDrive\Desktop\MLops\Chicken_Disease_Classification\Chicken_Disease_Classification\artifacts\data_ingestion\data.zip (size=11616915 bytes). Skipping download.
Extracted to: C:\Users\yogar\OneDrive\Desktop\MLops\Chicken_Disease_Classification\Chicken_Disease_Classification\artifacts\data_ingestion


WindowsPath('c:/Users/yogar/OneDrive/Desktop/MLops/Chicken_Disease_Classification/Chicken_Disease_Classification/artifacts/data_ingestion')