In [2]:
import os

In [3]:
%pwd

'/Users/soogeunpark/Documents/text_summarizer_cicd/Text-Summarizer/research'

In [4]:
# Move from the research/ folder up to the project root so that relative
# paths used further down (e.g. config/config.yaml) resolve.
# The guard keeps the cell idempotent: without it, every re-run of this cell
# would climb one directory higher.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

First, I update `config.yaml`.

Then, I set up the entity:

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen = True)
class DataTransformationConfig:
    """
    Immutable settings for the data transformation stage.

    Attributes:
        root_dir: Directory in which this stage stores its output.
        data_path: Location of the dataset that gets loaded and tokenized.
        result_file_name: Name of the saved result, placed inside ``root_dir``.
        tokenizer_name: Identifier handed to the tokenizer's
            ``from_pretrained`` call.
    """
    root_dir: Path
    data_path: Path
    result_file_name: str
    # Typed as str rather than Path: the value used in this notebook is a
    # Hugging Face Hub model id ("<namespace>/<model>"), not a filesystem
    # path, and Path would rewrite the "/" separator on Windows.
    tokenizer_name: str

Next, the configuration manager, which will live in `config/configuration.py`:

In [6]:
from TextSummarizer.constants import *
from TextSummarizer.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """
    Loads the project's YAML settings and hands out per-stage config objects.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        """
        Read the config and params YAML files and create the artifacts folder.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # `artifacts_roots` is a key of config.yaml; read_yaml returns a
        # ConfigBox, which is why plain dot notation works here.
        artifacts_dir = self.config.artifacts_roots
        create_directories([artifacts_dir])

    def data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the settings object for the data transformation stage.

        The stage's root directory is created before the object is returned.

        Returns:
            A DataTransformationConfig filled from the ``data_transformation``
            section of the loaded configuration.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        # Copy the entity's fields one by one from the YAML section.
        field_names = ("root_dir", "data_path", "result_file_name", "tokenizer_name")
        return DataTransformationConfig(
            **{name: getattr(section, name) for name in field_names}
        )
 


In [7]:
import os
from TextSummarizer.logging import logger
from transformers import AutoTokenizer
from transformers import BertTokenizerFast, EncoderDecoderModel
from datasets import load_dataset, load_from_disk

Now the components:

In [8]:


class DataTransformation:
    """
    Tokenizes the dialogue/summary dataset and stores the result on disk.
    """

    # Token budgets applied (with truncation) to the inputs and the targets.
    # NOTE(review): BERT checkpoints usually accept at most 512 positions -
    # confirm that 1024 is intended for the configured tokenizer/model.
    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 128

    def __init__(self, config: DataTransformationConfig):
        """
        Keep the stage settings and load the tokenizer they name.

        Args:
            config: Settings of the data transformation stage. This class
                reads ``tokenizer_name``, ``data_path``, ``root_dir`` and
                ``result_file_name`` from it.
        """
        self.config = config

        # The tokenizer is the one named in config.yaml.
        self.tokenizer = BertTokenizerFast.from_pretrained(self.config.tokenizer_name)

        logger.info(f"Tokenizer initalized: {self.config.tokenizer_name}.")

    def convert_examples_to_features(self, example_batch):
        """
        Turn a batch of raw examples into token features.

        Args:
            example_batch: Batch with a ``'dialogue'`` entry (model input) and
                a ``'summary'`` entry (target).

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
            tokenized dialogues and ``'labels'`` holding the token ids of the
            tokenized summaries.
        """
        input_encodings = self.tokenizer(
            example_batch['dialogue'],
            max_length = self.MAX_INPUT_LENGTH,
            truncation = True,
        )

        # `text_target=` is the supported replacement for the deprecated
        # `as_target_tokenizer()` context manager.
        target_encodings = self.tokenizer(
            text_target = example_batch['summary'],
            max_length = self.MAX_TARGET_LENGTH,
            truncation = True,
        )

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids'],
        }

    def convert(self):
        """
        Load the dataset, tokenize it and save the tokenized version.

        The dataset is read from ``config.data_path`` and written to
        ``config.root_dir`` / ``config.result_file_name``.
        """
        dataset_samsum = load_from_disk(self.config.data_path)
        logger.info(f"Dataset loaded from {self.config.data_path}")

        # batched = True hands whole batches to the converter instead of
        # single examples, which is more efficient.
        dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched = True)

        logger.info("Dataset converted to features success")

        # Build the output location once and reuse it for saving and logging.
        output_path = os.path.join(self.config.root_dir, self.config.result_file_name)
        dataset_samsum_pt.save_to_disk(output_path)

        logger.info(f"Dataset saved at {output_path}")

Now working on the pipeline:

In [9]:
# Run the data transformation stage end to end:
# load the configuration, build the component, then convert the dataset.
try:
    config = ConfigurationManager()

    data_transformation_config = config.data_transformation_config()

    # The component is built from the stage-specific settings object.
    data_transformation = DataTransformation(config = data_transformation_config)

    # Tokenize the dataset and write it to disk.
    data_transformation.convert()

except Exception as e:
    # logger.exception logs at ERROR level with the same message as before
    # and additionally records the traceback.
    logger.exception(e)
    # A bare raise re-raises the active exception unchanged.
    raise


[2024-01-03 15:27:14,952: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-01-03 15:27:14,954: INFO: common: yaml file: params.yaml loaded successfully]
[2024-01-03 15:27:14,954: INFO: common: created directory at: artifacts]
[2024-01-03 15:27:14,955: INFO: common: created directory at: artifacts/data_transformation]
[2024-01-03 15:27:15,506: INFO: 3822845383: Tokenizer initalized: mrm8488/bert-small2bert-small-finetuned-cnn_daily_mail-summarization.]
[2024-01-03 15:27:15,514: INFO: 3822845383: Dataset loaded from artifacts/data_ingestion/samsum_dataset]


Map: 100%|██████████| 14732/14732 [00:01<00:00, 14657.20 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 14560.47 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 8373.15 examples/s]

[2024-01-03 15:27:16,704: INFO: 3822845383: Dataset converted to features success]



Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 300929.65 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 237233.08 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 199368.97 examples/s]

[2024-01-03 15:27:16,766: INFO: 3822845383: Dataset saved at artifacts/data_transformation/samsumdata_transformed]





Now let's convert this into modular code.