In [1]:
import os

In [2]:
# Show the kernel's current working directory before it is changed.
%pwd

'/Users/sheetal/Desktop/Study/NLP/Projects/textSummarization/research'

In [3]:
# Work from the project root so relative paths such as config/config.yaml resolve.
# The notebook starts in research/, one level below the root, so step up
# relative to the current directory instead of using a machine-specific
# absolute path. The guard makes the cell safe to re-run: once we are out of
# research/ it does nothing.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

In [4]:
# Confirm the working directory after the change above.
%pwd

'/Users/sheetal/Desktop/Study/NLP/Projects/textSummarization'

In [5]:
"""
This block will go in the entity module.
It defines the DataIngestionConfig class, which is used to configure data ingestion settings.
This class is frozen, meaning its instances are immutable after creation.
"""
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings for the data-ingestion step.

    Attributes:
        root_dir: Directory that holds the ingestion artifacts.
        source_URL: Address the archive is downloaded from.
        local_data_file: Path where the downloaded archive is stored.
        unzip_dir: Directory the archive is extracted into.
    """

    # frozen=True: assigning to any field after construction raises.
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

In [6]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories


In [None]:
"""
This block will go in the configuration module.
It defines the ConfigurationManager class, which is responsible for managing configuration settings.
It reads configuration from YAML files and provides methods to access specific configurations.
"""
class ConfigurationManager:
    """Read the project's YAML settings and hand out typed config objects.

    Constructing an instance loads both YAML files and passes the configured
    ``artifacts_root`` to ``create_directories``.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load the config and params files.

        Args:
            config_filepath: YAML file with the pipeline configuration.
            params_filepath: YAML file with the parameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        The section's ``root_dir`` is passed to ``create_directories`` first.

        Returns:
            A ``DataIngestionConfig`` whose path fields are ``Path`` objects.
        """
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        # The YAML values arrive as plain strings, but DataIngestionConfig
        # declares these three fields as Path; convert so the object matches
        # its own annotations.
        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            source_URL=config.source_URL,
            local_data_file=Path(config.local_data_file),
            unzip_dir=Path(config.unzip_dir),
        )

In [8]:
import os
import urllib.request as request
import zipfile
from textSummarizer.utils.common import get_size
from textSummarizer.logging import logger


In [9]:
"""
This block will go in the component.
It defines the DataIngestion class, which is responsible for downloading and extracting data files.
It uses the DataIngestionConfig class to access configuration settings.
"""
class DataIngestion:
    """Download the dataset archive and unpack it.

    Every location comes from the ``DataIngestionConfig`` given at
    construction: ``source_URL`` (where to fetch from), ``local_data_file``
    (where the archive is saved) and ``unzip_dir`` (where it is extracted).
    """

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings for the later download/extract calls."""
        self.config = config

    def download_file(self):
        """Fetch ``source_URL`` into ``local_data_file`` unless a non-empty copy exists.

        A missing or zero-byte local file triggers a download; otherwise the
        existing file is left alone and only its size is logged.
        """
        local_file = self.config.local_data_file

        # The config field is `local_data_file`; there is no `local_data_path`.
        # os.path.getsize returns the exact byte count, so the emptiness check
        # does not depend on how the get_size helper formats its result.
        needs_download = (
            not os.path.exists(local_file)
            or os.path.getsize(local_file) == 0
        )

        if needs_download:
            filename, header = request.urlretrieve(
                url=self.config.source_URL,
                filename=local_file
            )
            logger.info(f"Downloaded file: {filename} with headers: {header}")
        else:
            logger.info(f"File already exists of size {get_size(local_file)}")

    def extract_zip_file(self):
        """Extract every member of ``local_data_file`` into ``unzip_dir``.

        ``unzip_dir`` is created first if it does not exist yet.
        """
        unzip_dir = self.config.unzip_dir
        os.makedirs(unzip_dir, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_dir)
        logger.info(f"Extracted zip file to {unzip_dir}")

In [None]:
try:
    # Load the YAML settings, then run the two ingestion stages in order:
    # download the archive, then unpack it.
    config = ConfigurationManager()

    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except Exception as e:
    # Record the traceback in the project log, then re-raise the same
    # exception unchanged so the failure still surfaces in the notebook.
    logger.exception(e)
    raise

[2025-06-10 14:05:48,335: INFO: common: YAML file config/config.yaml loaded successfully.]
[2025-06-10 14:05:48,336: INFO: common: YAML file params.yaml loaded successfully.]
Directory already exists: artifacts
1
**************************************************
Directory already exists: artifacts/data_ingestion
2
3
[2025-06-10 14:05:49,587: INFO: 2097861503: Downloaded file: artifacts/data_ingestion/data.zip with headers: Connection: close
Content-Length: 7903594
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "dbc016a060da18070593b83afff580c9b300f0b6ea4147a7988433e04df246ca"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: 9219:1993B3:7098B6:84021B:68488FCD
Accept-Ranges: bytes
Date: Tue, 10 Jun 2025 20:05:48 GMT
Via: 1.1 varnish
X-Served-By: cache-den-kden1300042-DEN
X-Cache: HIT
X-Cache-