In [1]:
import os

In [None]:
%pwd

In [3]:
# Move up to the project root so that the `src` package imported in later cells
# resolves. The guard keeps this cell idempotent: re-running it after the
# working directory already contains `src` no longer climbs another level.
if not os.path.isdir("src"):
    os.chdir("../")

In [None]:
%pwd

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Attributes:
        root_dir: Directory belonging to this stage.
        data_path: Location of the dataset to transform.
        tokenizer_name: Identifier handed to ``AutoTokenizer.from_pretrained``.
    """

    root_dir: Path
    data_path: Path
    # A tokenizer identifier is text, not a filesystem path: routing something
    # like "org/model" through Path would rewrite its separator on Windows.
    # NOTE(review): presumably a Hugging Face Hub model id — confirm in config.yaml.
    tokenizer_name: str

In [18]:
from src.constants import *
from src.utils.modular import read_yaml_file, create_dir

In [7]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and prepare the artifacts root.

        Args:
            config_filepath: YAML file read into ``self.config``.
            params_filepath: YAML file read into ``self.params``.
        """
        self.config = read_yaml_file(config_filepath)
        self.params = read_yaml_file(params_filepath)

        # NOTE(review): create_dir presumably creates every directory in the
        # list it is given — confirm against src.utils.modular.
        create_dir([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Return the settings for the data-transformation stage.

        The stage's ``root_dir`` is passed to ``create_dir`` before the
        config object is built.
        """
        section = self.config.data_transformation
        create_dir([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
        )

In [8]:
!pip install --upgrade datasets


Collecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting filelock (from datasets)
  Using cached filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting numpy>=1.17 (from datasets)
  Using cached numpy-2.0.2-cp39-cp39-win_amd64.whl.metadata (59 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Using cached pyarrow-21.0.0-cp39-cp39-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Using cached dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Using cached pandas-2.3.2-cp39-cp39-win_amd64.whl.metadata (19 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp39-cp39-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multipr

In [20]:
import os
from src.logging import logging
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
class DataTransformation:
    """Tokenizes a dialogue/summary dataset and saves the encoded copy to disk."""

    # Truncation limits passed as ``max_length`` for the dialogue (model input)
    # and for the summary (label) respectively.
    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 128

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage config and load the tokenizer it names.

        Args:
            config: Settings providing ``tokenizer_name``, ``data_path`` and
                ``root_dir``.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def convert_examples_to_features(self, example_batch):
        """Encode one batch of examples into model features.

        Args:
            example_batch: Mapping with a ``'dialogue'`` and a ``'summary'``
                entry, each holding the batch's texts.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
            encoded dialogues and ``'labels'`` taken from the ``'input_ids'``
            of the encoded summaries.
        """
        input_encodings = self.tokenizer(
            example_batch['dialogue'],
            max_length=self.MAX_INPUT_LENGTH,
            truncation=True,
        )

        # `text_target=` is the supported way to encode labels; it replaces the
        # deprecated `as_target_tokenizer()` context manager.
        target_encodings = self.tokenizer(
            text_target=example_batch['summary'],
            max_length=self.MAX_TARGET_LENGTH,
            truncation=True,
        )

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids'],
        }

    def convert(self):
        """Load the dataset, tokenize it in batches and save the result.

        Reads from ``config.data_path`` and writes to
        ``<config.root_dir>/samsum_dataset``.
        """
        dataset_samsum = load_from_disk(self.config.data_path)
        dataset_samsum_pt = dataset_samsum.map(
            self.convert_examples_to_features, batched=True
        )
        dataset_samsum_pt.save_to_disk(
            os.path.join(self.config.root_dir, "samsum_dataset")
        )

In [16]:
from src.exception import CustomException
import sys

In [21]:
# Run the data-transformation stage end to end: load the configuration, build
# the transformer (which loads the tokenizer) and tokenize + save the dataset.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.convert()
except Exception as e:
    # Chain explicitly so the traceback marks the original error as the direct
    # cause of the project's CustomException.
    raise CustomException(e, sys) from e

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 14732/14732 [00:03<00:00, 4207.13 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 4010.94 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 2360.02 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 348553.32 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 72901.85 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 75887.30 examples/s]
