In [19]:
import os
os.chdir("c:/Text-Summarization")

In [20]:
%pwd

'c:\\Text-Summarization'

In [21]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    tokenizer: str

In [22]:
import sys
sys.path.append("src")

In [23]:
%pip install -e src

import sys
from pathlib import Path

src_path = str((Path.cwd().parent / "src").resolve())
if src_path not in sys.path:
	sys.path.append(src_path)

from Text_Summarization.constants import *
from Text_Summarization.utils.common import read_yaml, create_dictionaries

Obtaining file:///C:/Text-Summarization/src
Note: you may need to restart the kernel to use updated packages.


ERROR: file:///C:/Text-Summarization/src does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.


In [24]:
from pathlib import Path
CONFIG_FILE_PATH = Path("config/config.yaml")
PARAMS_FILE_PATH = Path("params.yaml")

class ConfigurationManager:
    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_dictionaries([self.config.artifacts_root],verbose=True)

    def get_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_dictionaries([config.root_dir], verbose=True)
        
        data_transformation_config =  DataTransformationConfig(
            root_dir=Path(self.config.data_transformation.root_dir),
            data_path=Path(self.config.data_transformation.data_path),
            tokenizer=self.config.data_transformation.tokenizer
        )
        return data_transformation_config

In [25]:
import os
import logging
from transformers import AutoTokenizer
from datasets import load_dataset,load_from_disk

In [26]:
import shutil

class DataTransformation:
    def __init__(self,config : DataTransformationConfig):
        self.config=config
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer)

    def convert_to_features(self, example_batch):
        input_encodings = self.tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)
        with self.tokenizer.as_target_tokenizer():
            target_encodings = self.tokenizer(example_batch['summary'], max_length=128, truncation=True)
        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids'],
        }
    
    def convert(self):
        dataset_samsum = load_from_disk(self.config.data_path)
        dataset_samsum_pt = dataset_samsum.map(self.convert_to_features, batched=True)
        output_dir = os.path.abspath(os.path.join(self.config.root_dir, "samsum_dataset"))
        # Remove output directory if it exists
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
        os.makedirs(output_dir, exist_ok=True)
        dataset_samsum_pt.save_to_disk(output_dir)
        print(f"Saved tokenized dataset to: {output_dir}")

In [27]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_transformation_config()

    data_transformation = DataTransformation(config=data_transformation_config)
    status = data_transformation.convert()
except Exception as e:
    raise e

[2025-08-25 14:28:37,814] INFO - common - YAML file loaded successfully from config\config.yaml
[2025-08-25 14:28:37,816] INFO - common - YAML file loaded successfully from params.yaml
[2025-08-25 14:28:37,817] INFO - common - Created directory at artifacts
[2025-08-25 14:28:37,817] INFO - common - Created directory at artifacts/data_transformation
[2025-08-25 14:28:37,816] INFO - common - YAML file loaded successfully from params.yaml
[2025-08-25 14:28:37,817] INFO - common - Created directory at artifacts
[2025-08-25 14:28:37,817] INFO - common - Created directory at artifacts/data_transformation


Map: 100%|██████████| 14732/14732 [00:01<00:00, 8587.63 examples/s] 
Map:   0%|          | 0/819 [00:00<?, ? examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 6042.30 examples/s]
Map:   0%|          | 0/818 [00:00<?, ? examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 5586.15 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 5586.15 examples/s]0<?, ? examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 460353.49 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 460353.49 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 98969.57 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<?, ? examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<?, ? examples/s]

Saved tokenized dataset to: c:\Text-Summarization\artifacts\data_transformation\samsum_dataset



