In [1]:
import os

In [2]:
%pwd

'/home/priyanshu1303d/Projects/Text_Summarizer/research'

In [3]:
os.chdir("../")

In [5]:
%pwd

'/home/priyanshu1303d/Projects/Text_Summarizer'

In [None]:
from pathlib import Path
from dataclasses import dataclass

@dataclass(frozen= True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    tokenizer_name: Path

In [7]:
from textSummarizer.constants  import *
from textSummarizer.utils.common import read_yaml, create_directories

In [39]:
class ConfigurationManager:
    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath= PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # self.config.artifacts_root = Path(self.config.artifacts_root)
        print(self.config.artifacts_root)
        print(type(self.config.artifacts_root))
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig: 
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path= config.data_path,
            tokenizer_name= config.tokenizer_name,
        )
        return data_transformation_config


In [79]:
from textSummarizer.logging import logger
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk,DatasetDict

In [82]:
from transformers import AutoTokenizer
from datasets import load_from_disk, DatasetDict
import os

class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
    
    def convert_examples_to_features(self, example_batch):
        # Input validation
        if 'dialogue' not in example_batch or 'summary' not in example_batch:
            raise KeyError("Required keys 'dialogue' and 'summary' not found in the dataset")
        
        # Process dialogues
        dialogues = example_batch['dialogue']
        # Ensure dialogues are in list format and all elements are strings
        if isinstance(dialogues, (list, tuple)):
            dialogues = [str(d) for d in dialogues]
        else:
            dialogues = [str(dialogues)]
            
        # Tokenize dialogues
        input_encodings = self.tokenizer(
            dialogues,
            max_length=1024,
            truncation=True,
            padding='max_length',
            return_tensors=None  # Return lists instead of tensors
        )
        
        # Process summaries
        summaries = example_batch['summary']
        # Ensure summaries are in list format and all elements are strings
        if isinstance(summaries, (list, tuple)):
            summaries = [str(s) for s in summaries]
        else:
            summaries = [str(summaries)]
        
        # Tokenize summaries
        with self.tokenizer.as_target_tokenizer():
            target_encodings = self.tokenizer(
                summaries,
                max_length=128,
                truncation=True,
                padding='max_length',
                return_tensors=None  # Return lists instead of tensors
            )
        
        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }

    def convert(self):
        try:
            # Load dataset
            dataset_samsum = load_from_disk(self.config.data_path)
            print("Dataset loaded successfully")
            print("Dataset structure:", dataset_samsum)
            
            # Create output directory
            output_dir = os.path.join(self.config.root_dir, "samsum_dataset")
            os.makedirs(output_dir, exist_ok=True)
            
            # Initialize transformed dataset
            transformed_dataset = DatasetDict()
            
            # Process each split
            for split in dataset_samsum.keys():
                print(f"\nProcessing {split} split...")
                
                # Print sample for debugging
                print(f"Sample entry from {split}:", dataset_samsum[split][0])
                
                # Transform the split with error handling
                try:
                    transformed_split = dataset_samsum[split].map(
                        self.convert_examples_to_features,
                        batched=True,
                        batch_size=8,  # Smaller batch size for better error handling
                        desc=f"Processing {split} split",
                        remove_columns=dataset_samsum[split].column_names  # Remove original columns
                    )
                    
                    # Add to transformed dataset
                    transformed_dataset[split] = transformed_split
                    print(f"Successfully processed {split} split")
                    
                except Exception as e:
                    print(f"Error processing {split} split: {str(e)}")
                    raise
            
            # Save transformed dataset
            transformed_dataset.save_to_disk(output_dir)
            print(f"\nTransformed dataset saved to {output_dir}")
            
            return transformed_dataset
            
        except Exception as e:
            print(f"Error in convert method: {str(e)}")
            raise

### Converting csv to .json such that it supports hugging face Datasets

In [59]:
# Did this because in my dataset i only had Test Train Valid folder so first i changed the filename to test 
#train, validation as hugging face supports only lowercase folders in this case now after that i had missing
#dataset_dict.json which is necessary for load_from_disk and save_to_disk function
#

#------------------------------------------------------------------------
# from datasets import load_dataset, Dataset, DatasetDict

# # Define file paths
# data_files = {
#     "train": "artifacts/data_ingestion/Data/train/samsum_train.csv",
#     "validation": "artifacts/data_ingestion/Data/validation/samsum_valid.csv",
#     "test": "artifacts/data_ingestion/Data/test/samsum_test.csv"
# }

# # Load CSV files correctly as Hugging Face datasets
# dataset_dict = DatasetDict({
#     split: load_dataset("csv", data_files=path, split="train") for split, path in data_files.items()
# })

# # Save the entire dataset in a single directory
# dataset_dict.save_to_disk("artifacts/data_ingestion/Data")

# print("Dataset saved correctly!")


In [60]:
from datasets import load_from_disk

data_path = "artifacts/data_ingestion/Data"

try:
    dataset = load_from_disk(data_path)
    print("Dataset loaded successfully!")
    print(dataset)
except Exception as e:
    print(f"Error: {e}")


Dataset loaded successfully!
DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})


In [61]:
import os

dataset_path = "artifacts/data_ingestion/Data"
files = os.listdir(dataset_path)
print(files)

['dataset_dict.json', 'validation', 'train', 'test']


In [83]:
try:
    config = ConfigurationManager()
    data_transformation_config =config.get_data_transformation_config()
    data_transformation = DataTransformation(config = data_transformation_config)
    data_transformation.convert()
except Exception as e:
    raise e

[2025-02-21 19:32:56,235 : INFO : common  : yaml file config/config.yaml was read succesfully]
[2025-02-21 19:32:56,237 : INFO : common  : yaml file params.yaml was read succesfully]
artifacts
<class 'str'>
[2025-02-21 19:32:56,238 : INFO : common  : Created directory at : artifacts]
[2025-02-21 19:32:56,239 : INFO : common  : Created directory at : artifacts/data_transformation]
Dataset loaded successfully
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})

Processing train split...
Sample entry from train: {'id': '13818513', 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)", 'summary': 'Amanda baked cookies and will bring Jerry some tomorr

Processing train split: 100%|██████████| 14732/14732 [00:13<00:00, 1083.66 examples/s]


Successfully processed train split

Processing validation split...
Sample entry from validation: {'id': '13817023', 'dialogue': "A: Hi Tom, are you busy tomorrow’s afternoon?\r\nB: I’m pretty sure I am. What’s up?\r\nA: Can you go with me to the animal shelter?.\r\nB: What do you want to do?\r\nA: I want to get a puppy for my son.\r\nB: That will make him so happy.\r\nA: Yeah, we’ve discussed it many times. I think he’s ready now.\r\nB: That’s good. Raising a dog is a tough issue. Like having a baby ;-) \r\nA: I'll get him one of those little dogs.\r\nB: One that won't grow up too big;-)\r\nA: And eat too much;-))\r\nB: Do you know which one he would like?\r\nA: Oh, yes, I took him there last Monday. He showed me one that he really liked.\r\nB: I bet you had to drag him away.\r\nA: He wanted to take it home right away ;-).\r\nB: I wonder what he'll name it.\r\nA: He said he’d name it after his dead hamster – Lemmy  - he's  a great Motorhead fan :-)))", 'summary': 'A will go to the anim

Processing validation split: 100%|██████████| 818/818 [00:00<00:00, 1124.84 examples/s]


Successfully processed validation split

Processing test split...
Sample entry from test: {'id': '13862856', 'dialogue': "Hannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye", 'summary': "Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry."}


Processing test split: 100%|██████████| 819/819 [00:00<00:00, 1067.44 examples/s]


Successfully processed test split


Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 119376.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 47304.40 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 30686.47 examples/s]



Transformed dataset saved to artifacts/data_transformation/samsum_dataset
