In [1]:
%pwd

'e:\\Additional Projects\\ML Projects for Resume\\text-summarization-english-end-to-end-project\\research'

In [2]:
import os
# Step up from research/ to the project root so that project-relative paths
# used later in the notebook resolve from there. The guard makes the cell
# idempotent: an unconditional os.chdir("../") climbs one more level every
# time the cell is re-run in the same kernel.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
%pwd

'e:\\Additional Projects\\ML Projects for Resume\\text-summarization-english-end-to-end-project'

In [4]:
#entity
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage.

    Attributes:
        root_dir: Directory that holds this stage's artifacts.
        data_path: Location of the dataset that the stage loads from disk.
        tokenizer_name: Identifier handed to ``AutoTokenizer.from_pretrained``.
            This is a model name such as ``google/pegasus_cnn_dailymail``,
            not a filesystem path, so it is typed as ``str``; treating it as
            a ``Path`` would rewrite the ``/`` separator on Windows.
        transformed_data_path: Destination where the tokenized dataset is saved.

    Note:
        Dataclass annotations are not enforced at runtime, so the fields hold
        whatever the caller passes in (plain strings are accepted as well).
    """

    root_dir: Path
    data_path: Path
    tokenizer_name: str
    transformed_data_path: Path

In [10]:
#ConfigurationManager
from textSummarizer.constant import *
from textSummarizer.utils.common import *
from textSummarizer.logging import logging

class ConfigurationManager:
    """Loads the project's YAML settings and hands out per-stage config objects."""

    def __init__(self, config_file_path = CONFIG_FILE_PATH, params_file_path = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_file_path: Path of the main configuration YAML.
            params_file_path: Path of the parameters YAML.
        """
        self.config = read_yaml_file(config_file_path)
        self.params = read_yaml_file(params_file_path)

        logging.info("Read successfully the yaml files")

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the loaded configuration.

        The stage's root directory is created before the settings object is
        assembled.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the configuration.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
            transformed_data_path=section.transformed_data_path,
        )
        

In [6]:
# Smoke-check: build the manager and show the resolved data-transformation
# settings. The accessor is named get_data_transformation_config; the shorter
# data_transformation_config is not defined on ConfigurationManager and would
# raise AttributeError on a fresh kernel.
cg = ConfigurationManager()
print(cg.get_data_transformation_config())

[2024-04-28 00:55:44,177 INFO root common 17 - yaml file- config\config.yaml is loaded successfully]
[2024-04-28 00:55:44,177 INFO root common 17 - yaml file- params.yaml is loaded successfully]
[2024-04-28 00:55:44,177 INFO root 2694761416 11 - Read successfully the yaml files]


Directory created: artifacts
Directory created: artifacts/data_transformation
DataTransformationConfig(root_dir='artifacts/data_transformation', data_path='artifacts/data_ingestion/samsum_dataset', tokenizer_name='google/pegasus_cnn_dailymail', transformed_data_path='artifacts/data_transformation/samsum_data')


In [7]:
#Components
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk

class DataTransformation:
    """Tokenizes a dialogue/summary dataset and stores the result on disk."""

    # Truncation limits (in tokens) for the source dialogue and target summary.
    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 128

    def __init__(self, config: "DataTransformationConfig"):
        """Keep the stage settings and load the tokenizer they name.

        Args:
            config: Stage settings; ``tokenizer_name`` is passed to
                ``AutoTokenizer.from_pretrained``.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)

    def convert_text_to_features(self, text_batch):
        """Tokenize one batch of examples into model features.

        Args:
            text_batch: Mapping with a ``'dialogue'`` entry (source texts) and
                a ``'summary'`` entry (target texts).

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
            encoded dialogues, and ``'labels'`` holding the ``input_ids`` of
            the encoded summaries.
        """
        input_encodings = self.tokenizer(
            text_batch['dialogue'],
            max_length=self.MAX_INPUT_LENGTH,
            truncation=True,
        )

        # `text_target=` replaces the deprecated `as_target_tokenizer()`
        # context manager for encoding the target side.
        target_encodings = self.tokenizer(
            text_target=text_batch['summary'],
            max_length=self.MAX_TARGET_LENGTH,
            truncation=True,
        )

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }

    def initiate_data_transformation(self):
        """Load the dataset, tokenize it in batches, and save the result.

        Reads from ``config.data_path`` and writes the mapped dataset to
        ``config.transformed_data_path``.
        """
        data = load_from_disk(self.config.data_path)
        data_pt = data.map(self.convert_text_to_features, batched=True)
        data_pt.save_to_disk(self.config.transformed_data_path)

  from .autonotebook import tqdm as notebook_tqdm
[2024-04-28 00:55:55,252 INFO datasets config 58 - PyTorch version 2.3.0 available.]


# Pipeline

In [24]:
class DataTransformationPipeline:
    """Runs the data-transformation stage from configuration to saved output."""

    def main(self):
        """Build the stage configuration and execute the transformation.

        Any exception raised along the way propagates to the caller unchanged;
        wrapping the body in ``try/except Exception as e: raise e`` adds
        nothing but an extra traceback frame, so it is omitted.
        """
        config = ConfigurationManager()
        data_transformation_config = config.get_data_transformation_config()
        data_transformation = DataTransformation(data_transformation_config)
        data_transformation.initiate_data_transformation()

In [25]:
# Run the data-transformation stage end to end. main() has no return value,
# so anything shown under this cell is emitted by the stage while it runs.
dt = DataTransformationPipeline()
dt.main()

[2024-04-28 01:36:17,858 INFO root common 17 - yaml file- config\config.yaml is loaded successfully]
[2024-04-28 01:36:17,860 INFO root common 17 - yaml file- params.yaml is loaded successfully]
[2024-04-28 01:36:17,862 INFO root 2767533730 11 - Read successfully the yaml files]


Directory created: artifacts
Directory created: artifacts/data_transformation
<__main__.DataTransformation object at 0x0000018912D80A10>


Map: 100%|██████████| 14732/14732 [00:04<00:00, 3319.13 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 675010.78 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 52420.80 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 87239.13 examples/s]
