In [1]:
import os


In [2]:
%pwd

'c:\\Users\\Shabbir\\Desktop\\YT_Prac\\Text-Summarization-Project\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Shabbir\\Desktop\\YT_Prac\\Text-Summarization-Project'

# Entity: typed view of the `data_ingestion` section of config.yaml

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion step.

    Instances are built by ``configurationManager.get_data_ingestion_config``
    from the ``data_ingestion`` section of the loaded config. ``frozen=True``
    makes attribute assignment after construction raise an error.
    """
    # Working directory for this step; it is created before the config is returned.
    root_dir: Path
    # URL the dataset archive is downloaded from.
    source_URL: str
    # Destination of the downloaded archive, and the ZIP that is later extracted.
    local_data_file: Path
    # Directory the archive contents are extracted into.
    unzip_dir: Path

# Configuration:

In [6]:
from TextSummarizer.constants import *
from TextSummarizer.utils.common import create_directories, read_yaml

In [7]:
class configurationManager:
    """Load the project's YAML config and params, and build per-stage configs."""

    def __init__(
        self,
        config_filepath=None,
        params_filepath=None,
    ):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path to the config YAML. Falls back to
                ``CONFIG_FILE_PATH`` when ``None``.
            params_filepath: Path to the params YAML. Falls back to
                ``PARAMS_FILE_PATH`` when ``None``.
        """
        if config_filepath is None:
            config_filepath = CONFIG_FILE_PATH
        if params_filepath is None:
            params_filepath = PARAMS_FILE_PATH

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> "DataIngestionConfig":
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Creates ``root_dir`` as a side effect.

        Returns:
            DataIngestionConfig: ``root_dir``, ``local_data_file`` and
            ``unzip_dir`` are converted to ``Path`` so the values match the
            field annotations; ``source_URL`` is passed through unchanged.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        # The YAML loader hands back plain values; wrap the path-like ones so
        # consumers really receive the Path objects the dataclass declares.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )

# Component: DataIngestion (download and extract the dataset archive)

In [8]:
import os
import urllib.request as request
import zipfile
from TextSummarizer.logging import logger
from TextSummarizer.utils.common import get_size

In [None]:
import requests
from pathlib import Path

class DataIngestion:
    """Download the dataset archive and unpack it into the configured directory.

    The config object supplies ``source_URL`` (where to download from),
    ``local_data_file`` (where the archive is stored) and ``unzip_dir``
    (where its contents are extracted).
    """

    def __init__(self, config: "DataIngestionConfig"):
        self.config = config

    def download_file(self):
        """Stream the archive from ``source_URL`` to ``local_data_file``.

        Does nothing when the destination file already exists. Data is written
        to a sibling ``.part`` file first and moved into place only after a
        complete, non-empty download, so an interrupted run never leaves a
        truncated archive under the final name.

        Raises:
            RuntimeError: The download finished but produced a 0-byte file.
            Exception: Any HTTP or I/O error is logged, the partial file is
                removed, and the original exception is re-raised.
        """
        url = self.config.source_URL
        out_path = Path(self.config.local_data_file)
        tmp_path = out_path.with_suffix(out_path.suffix + ".part")

        # if file exists, skip download
        if out_path.exists():
            logger.info(f"File already exists of size: {get_size(out_path)}")
            return

        logger.info(f"Downloading from: {url}")

        try:
            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()  # raises on 4xx/5xx responses

                out_path.parent.mkdir(parents=True, exist_ok=True)

                # write to .part temporary file first
                with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)

            # ensure non-zero file
            if tmp_path.stat().st_size == 0:
                raise RuntimeError("Downloaded file is empty (0 bytes).")

            # move .part → final filename
            tmp_path.replace(out_path)

            logger.info(
                f"{out_path} downloaded successfully! "
                f"Size: {get_size(out_path)}"
            )

        except Exception as e:
            logger.exception(f"Download failed: {e}")
            if tmp_path.exists():
                tmp_path.unlink()  # remove partial file
            raise  # bare raise keeps the original traceback untouched

    @staticmethod
    def _resolve_member(base_dir: Path, member: str) -> Path:
        """Return the absolute destination of ``member`` inside ``base_dir``.

        ``base_dir`` must already be an absolute, symlink-resolved path.

        Raises:
            ValueError: The entry name (e.g. ``../x`` or an absolute path)
                would resolve to a location outside ``base_dir``.
        """
        target = Path(os.path.realpath(base_dir / member))
        if target != base_dir and base_dir not in target.parents:
            raise ValueError(
                f"Unsafe path in ZIP archive (escapes {base_dir}): {member!r}"
            )
        return target

    def extract_zip_file(self):
        """Extract ``local_data_file`` into ``unzip_dir`` member by member.

        Every entry name is validated before anything is written: the archive
        comes from a remote URL, so an entry that would resolve outside
        ``unzip_dir`` aborts the whole extraction instead of being written.

        Raises:
            FileNotFoundError: The archive does not exist.
            ValueError: An entry would be written outside ``unzip_dir``.
            Exception: Any other extraction error is logged and re-raised.
        """
        zip_path = Path(self.config.local_data_file)
        extract_dir = Path(self.config.unzip_dir)

        if not zip_path.exists():
            logger.error(f"ZIP file not found at: {zip_path}")
            raise FileNotFoundError(f"{zip_path} does not exist")

        logger.info(f"Extracting ZIP: {zip_path}")

        try:
            with zipfile.ZipFile(zip_path, "r") as z:
                members = z.namelist()
                logger.info(f"Total files/folders in ZIP: {len(members)}")

                # Resolve and check all destinations up front so a hostile
                # archive is rejected before a single byte is extracted.
                base_dir = Path(os.path.realpath(extract_dir))
                targets = [self._resolve_member(base_dir, m) for m in members]

                for i, (member, target) in enumerate(zip(members, targets), start=1):
                    # Create directories as needed
                    if member.endswith(("/", "\\")):
                        target.mkdir(parents=True, exist_ok=True)
                        logger.debug(f"[{i}/{len(members)}] Dir created: {member}")
                        continue

                    target.parent.mkdir(parents=True, exist_ok=True)

                    # Extract file in chunks
                    with z.open(member) as src, open(target, "wb") as dst:
                        while True:
                            chunk = src.read(1024 * 64)  # 64 KB
                            if not chunk:
                                break
                            dst.write(chunk)
                        # push the data to disk before reporting the member as extracted
                        dst.flush()
                        os.fsync(dst.fileno())

                    logger.debug(f"[{i}/{len(members)}] Extracted: {member}")

            logger.info(f"Extraction completed successfully → {extract_dir}")

        except Exception as e:
            logger.exception(f"Error while extracting ZIP: {e}")
            raise


# Creating the Pipeline

In [22]:
# Run the ingestion stage end to end: load the configuration, build the
# component, download the archive, then unpack it.
try:
    config = configurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()  # returns early when the archive is already on disk
    data_ingestion.extract_zip_file()

except Exception as e:
    # Re-raised unchanged so the failure surfaces in the notebook output.
    raise e

[2025-11-23 18:13:19,079: INFO: common]: YAML file: config\config.yaml loaded successfully
[2025-11-23 18:13:19,080: INFO: common]: YAML file: params.yaml loaded successfully
[2025-11-23 18:13:19,081: INFO: common]: Created directory at: artifacts
[2025-11-23 18:13:19,082: INFO: common]: Created directory at: artifacts/data_ingestion
[2025-11-23 18:13:19,084: INFO: 1713455135]: Downloading from: https://github.com/entbappy/Branching-tutorial/raw/refs/heads/master/samsumdata.zip
[2025-11-23 18:13:22,564: INFO: 1713455135]: artifacts\data_ingestion\data.zip downloaded successfully! Size: 23073.25 KB
