## Data Ingestion Module

In [1]:
import os
# Display the kernel's starting working directory before it is changed below.
%pwd

'c:\\Users\\sanja\\Desktop\\Data_science\\portfolio-projects\\04_NLP_LLMs\\text-summarization-mlops-hf\\research'

In [2]:
os.chdir("../")
%pwd

'c:\\Users\\sanja\\Desktop\\Data_science\\portfolio-projects\\04_NLP_LLMs\\text-summarization-mlops-hf'

### Basic configuration

In [None]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataIngestionConfig:
    """Settings consumed by the data-ingestion stage.

    Attributes:
        root_dir: Directory that receives the per-split CSV exports.
        dataset_name: Dataset identifier handed to ``load_dataset``.
        raw_dataset_dir: Destination passed to ``save_to_disk`` for the
            raw dataset.
    """
    root_dir: Path
    dataset_name: str
    raw_dataset_dir: Path


### Configuration manager

In [4]:
from src.textSummarizer.constants import *
from src.textSummarizer.utils.common import read_yaml, create_directories


class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    Instantiating the manager reads the config and params files and creates
    the top-level artifacts directory named in the config.
    """

    def __init__(self,
                 config_path = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_path: Location of the main configuration YAML file.
            params_filepath: Location of the parameters YAML file.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Creates the stage's root directory as a side effect.

        Returns:
            A ``DataIngestionConfig`` whose directory fields are ``Path``
            objects, matching the dataclass annotations.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        # Values read from YAML are presumably plain strings; convert the
        # directory entries so they agree with the declared ``Path`` fields.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            dataset_name=section.dataset_name,
            raw_dataset_dir=Path(section.raw_dataset_dir),
        )


### Components

In [5]:
import os
from datasets import load_dataset
from src.textSummarizer.logging import logger


class DataIngestion:
    """Download a Hugging Face dataset and persist it locally."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings for later use.

        Args:
            config: Dataset name and output locations for this stage.
        """
        self.config = config

    def fetch_and_save_dataset(self):
        """Load the configured dataset, save it raw, and export each split as CSV.

        The raw dataset is written to ``config.raw_dataset_dir`` via
        ``save_to_disk``. Every split the dataset provides is then written
        to ``config.root_dir`` as ``samsum-<split>.csv``, so the usual
        train/test/validation files keep their existing names while a
        dataset lacking one of those splits no longer fails with ``KeyError``.
        """
        logger.info("Loading dataset from Hugging Face...")
        dataset = load_dataset(self.config.dataset_name)

        logger.info("Saving raw dataset locally...")
        dataset.save_to_disk(self.config.raw_dataset_dir)

        logger.info("Saving dataset splits locally...")
        # Iterate over whatever splits exist instead of hard-coding three
        # names; the "samsum-" prefix is kept so existing file names are
        # unchanged for downstream readers.
        for split_name, split in dataset.items():
            csv_path = os.path.join(self.config.root_dir, f"samsum-{split_name}.csv")
            split.to_csv(csv_path, index=False)

        logger.info("Data ingestion from Hugging Face completed.")


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Read the YAML settings (this also creates the artifacts directories).
config = ConfigurationManager()
# Extract the settings for the data-ingestion stage.
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)

# Download the dataset, save it raw, and export the splits as CSV files.
data_ingestion.fetch_and_save_dataset()


[2025-07-02 17:04:55,474: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-02 17:04:55,476: INFO: common: yaml file: params.yaml loaded successfully]
[2025-07-02 17:04:55,478: INFO: common: created directory at: artifacts]
[2025-07-02 17:04:55,479: INFO: common: created directory at: artifacts/data_ingestion]
[2025-07-02 17:04:55,480: INFO: 1442623699: Loading dataset from Hugging Face...]
[2025-07-02 17:04:57,863: INFO: 1442623699: Saving raw dataset locally...]


Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 794773.83 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 125011.50 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 154554.80 examples/s]

[2025-07-02 17:04:57,917: INFO: 1442623699: Saving dataset splits locally...]



Creating CSV from Arrow format: 100%|██████████| 15/15 [00:00<00:00, 64.86ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 75.59ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 68.20ba/s]

[2025-07-02 17:04:58,195: INFO: 1442623699: Data ingestion from Hugging Face completed.]



