In [1]:
import os

In [2]:
%pwd

"c:\\Arjun's Work\\Chest-Cancer-Detection\\research"

In [3]:
# Move from the notebook folder (research/) up to the project root so that
# relative paths used later resolve from there. Guarded so that re-running
# this cell does not keep climbing the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [4]:
%pwd

"c:\\Arjun's Work\\Chest-Cancer-Detection"

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    The dataclass is frozen, so fields cannot be reassigned after construction.
    """

    # Working directory of the data-ingestion stage.
    root_dir: Path
    # Share link the dataset archive is downloaded from.
    source_URL: str
    # Path the downloaded archive is written to.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path

In [6]:
from chest_cancer_detection.constants import *
from chest_cancer_detection.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and hand the artifacts root to ``create_directories``.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build a ``DataIngestionConfig`` from the ``data_ingestion`` section.

        The section's ``root_dir`` is passed to ``create_directories`` before
        the config object is assembled.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_URL=section.source_URL,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )
      

In [8]:
import os
import zipfile
import gdown
from chest_cancer_detection import logger
from chest_cancer_detection.utils.common import get_size

<p>I used a 7-Zip archive, which is why I needed a specialized package (<code>py7zr</code>) to extract it; otherwise, the standard <code>zipfile</code> module would have worked fine.</p>

In [18]:
!pip install py7zr

Collecting py7zr
  Downloading py7zr-1.0.0-py3-none-any.whl.metadata (17 kB)
Collecting texttable (from py7zr)
  Using cached texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting pycryptodomex>=3.20.0 (from py7zr)
  Downloading pycryptodomex-3.23.0-cp37-abi3-win_amd64.whl.metadata (3.5 kB)
Collecting brotli>=1.1.0 (from py7zr)
  Downloading Brotli-1.1.0-cp310-cp310-win_amd64.whl.metadata (5.6 kB)
Collecting pyzstd>=0.16.1 (from py7zr)
  Downloading pyzstd-0.17.0-cp310-cp310-win_amd64.whl.metadata (2.6 kB)
Collecting pyppmd<1.3.0,>=1.1.0 (from py7zr)
  Downloading pyppmd-1.2.0-cp310-cp310-win_amd64.whl.metadata (5.6 kB)
Collecting pybcj<1.1.0,>=1.0.0 (from py7zr)
  Downloading pybcj-1.0.6-cp310-cp310-win_amd64.whl.metadata (3.8 kB)
Collecting multivolumefile>=0.2.3 (from py7zr)
  Using cached multivolumefile-0.2.3-py3-none-any.whl.metadata (6.3 kB)
Collecting inflate64<1.1.0,>=1.0.0 (from py7zr)
  Downloading inflate64-1.0.3-cp310-cp310-win_amd64.whl.metadata (4.5 kB)
Downl

In [19]:
import py7zr
class DataIngestion:
    """Downloads the dataset archive from Google Drive and unpacks it."""

    def __init__(self, config: "DataIngestionConfig"):
        """Store the ingestion settings.

        Args:
            config: Supplies ``source_URL``, ``local_data_file`` and ``unzip_dir``.
        """
        self.config = config

    @staticmethod
    def _extract_file_id(url: str) -> str:
        """Pull the Google Drive file id out of a share or download link.

        Recognises ``.../d/<id>/...`` paths (e.g. ``/file/d/<id>/view``) and
        ``...?id=<id>`` query strings.

        Raises:
            ValueError: If no file id can be found in ``url``.
        """
        # Local import keeps this class independent of the notebook's import cells.
        from urllib.parse import parse_qs, urlparse

        parsed = urlparse(str(url))
        segments = [part for part in parsed.path.split("/") if part]
        # The id is the path segment right after "d"; "d" must not be the last one.
        if "d" in segments[:-1]:
            return segments[segments.index("d") + 1]

        ids = parse_qs(parsed.query).get("id")
        if ids:
            return ids[0]

        raise ValueError(f"Could not find a Google Drive file id in URL: {url}")

    def download_file(self) -> str:
        """Fetch the dataset archive from ``source_URL`` into ``local_data_file``.

        Returns:
            The path the archive was written to.

        Raises:
            ValueError: If ``source_URL`` contains no Google Drive file id.
            RuntimeError: If gdown reports that nothing was downloaded.
        """
        dataset_url = self.config.source_URL
        # Normalise to str so the config may carry either a str or a Path.
        zip_download_dir = str(self.config.local_data_file)

        # Create the folder the archive is actually written to, instead of a
        # hard-coded "artifacts/data_ingestion".
        os.makedirs(os.path.dirname(zip_download_dir) or ".", exist_ok=True)
        logger.info(f"Downloading data from {dataset_url} into file {zip_download_dir}")

        file_id = self._extract_file_id(dataset_url)
        prefix = 'https://drive.google.com/uc?export=download&id='
        downloaded = gdown.download(prefix + file_id, zip_download_dir)
        # NOTE(review): gdown returns the output path on success; some versions
        # return None on failure instead of raising, so do not log success blindly.
        if downloaded is None:
            raise RuntimeError(f"Download failed for {dataset_url}")

        logger.info(f"Downloaded data from {dataset_url} into file {zip_download_dir}")
        return zip_download_dir

    def extract_zip_file(self):
        """
        Extracts the downloaded archive (.zip or .7z) into the data directory.

        The format is detected from the file content, not from its extension:
        ZIP archives go through ``zipfile``, everything else is handed to
        ``py7zr`` (which raises if the file is not a valid 7z archive).
        """
        unzip_path = self.config.unzip_dir
        archive_path = self.config.local_data_file
        os.makedirs(unzip_path, exist_ok=True)

        if zipfile.is_zipfile(archive_path):
            logger.info(f"Extracting .zip file to {unzip_path}")
            with zipfile.ZipFile(archive_path, mode='r') as archive:
                archive.extractall(path=unzip_path)
        else:
            logger.info(f"Extracting .7z file to {unzip_path}")
            with py7zr.SevenZipFile(archive_path, mode='r') as archive:
                archive.extractall(path=unzip_path)
        logger.info(f"Extraction complete: {unzip_path}")

In [20]:
import zipfile

# Run the data-ingestion stage end to end: load config, download, extract.
# Unexpected errors propagate unchanged; no wrapper try/except is needed for that.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
try:
    data_ingestion.extract_zip_file()
except (zipfile.BadZipFile, py7zr.Bad7zFile):
    # py7zr signals a corrupt or non-7z file with Bad7zFile, not BadZipFile,
    # so both archive errors are reported here.
    print("Downloaded file is not a valid archive (.zip or .7z). Please check the download URL or file content.")

[2025-06-18 12:19:47,598: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-06-18 12:19:47,600: INFO: common: yaml file: params.yaml loaded successfully]
[2025-06-18 12:19:47,603: INFO: common: created directory at: artifacts]
[2025-06-18 12:19:47,605: INFO: common: created directory at: artifacts/data_ingestion]
[2025-06-18 12:19:47,607: INFO: 4101888485: Downloading data from https://drive.google.com/file/d/1-qbnpa-TAquDxN3rNyQ7h6nF65RK3F-I/view?usp=sharing into file artifacts/data_ingestion/data.zip]


Downloading...
From: https://drive.google.com/uc?/export=download&id=1-qbnpa-TAquDxN3rNyQ7h6nF65RK3F-I
To: c:\Arjun's Work\Chest-Cancer-Detection\artifacts\data_ingestion\data.zip
100%|██████████| 44.1M/44.1M [00:12<00:00, 3.60MB/s]


[2025-06-18 12:20:11,461: INFO: 4101888485: Downloaded data from https://drive.google.com/file/d/1-qbnpa-TAquDxN3rNyQ7h6nF65RK3F-I/view?usp=sharing into file artifacts/data_ingestion/data.zip]
[2025-06-18 12:20:11,463: INFO: 4101888485: Extracting .7z file to artifacts/data_ingestion]
[2025-06-18 12:20:14,564: INFO: 4101888485: Extraction complete: artifacts/data_ingestion]
