In [1]:
%pwd

'd:\\Capestone projects\\Project_2\\document_summarization_for_legal_texts\\research'

In [2]:
import os
os.chdir("../")
%pwd

'd:\\Capestone projects\\Project_2\\document_summarization_for_legal_texts'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """
    Immutable settings for the data-transformation stage.

    The dataclass is frozen, so fields cannot be reassigned after construction.
    """
    # Output directory of this stage; the tokenized dataset is saved beneath it.
    root_dir: Path
    # Input data location, read from the `data_transformation` config section.
    data_path: Path
    # Identifier passed to AutoTokenizer.from_pretrained. A model id such as
    # "org/model" is a plain string, not a filesystem path, hence `str`.
    tokenizer_name: str

In [4]:
from datascience.constants import *
from datascience.utils.common import read_yaml, create_directories

In [5]:
class configuration_manager:
    """
    Loads the project's YAML config and params files and builds stage configs.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """
        Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: path of the main config YAML file.
            params_filepath: path of the params YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the data-transformation config from the `data_transformation` section.

        Also makes sure the stage's output directory exists.

        Returns:
            DataTransformationConfig with `root_dir` and `data_path` as Path
            objects and `tokenizer_name` passed through unchanged.
        """
        section = self.config.data_transformation

        # Create the stage's output directory up front so later writes cannot
        # fail on a missing parent.
        create_directories([section.root_dir])

        return DataTransformationConfig(
            # Coerced so the values match the field types the dataclass declares.
            root_dir = Path(section.root_dir),
            data_path = Path(section.data_path),
            tokenizer_name = section.tokenizer_name,
        )

In [6]:
import os
from datascience.logging import logger
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


[2025-07-05 10:08:06,861] -> [INFO]: config - PyTorch version 2.7.1 available.


In [9]:
class DataTransformation:
    """
    Tokenizes the judgement/summary CSV splits and saves the result to disk.
    """

    # Token limits used when truncating source judgements and target summaries.
    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 128

    # CSV splits loaded when `convert` is called without explicit `data_files`.
    DEFAULT_DATA_FILES = {
        "train": "artifacts/data_ingestion/train_dataset.csv",
        "test": "artifacts/data_ingestion/test_dataset.csv",
    }

    def __init__(self, config: "DataTransformationConfig"):
        """
        Store the config and load the tokenizer named by `config.tokenizer_name`.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)

    def convert_examples_to_features(self, example_batch):
        """
        Tokenize one batch of examples.

        Args:
            example_batch: mapping with 'judgement' and 'summary' columns, each a
                list of values. None entries are treated as empty strings and
                everything else is converted with `str`.

        Returns:
            dict with 'input_ids' and 'attention_mask' from the tokenized
            judgements and 'labels' holding the tokenized summaries' input ids.
        """
        judgements = [str(j) if j is not None else "" for j in example_batch['judgement']]
        summaries = [str(s) if s is not None else "" for s in example_batch['summary']]

        input_encodings = self.tokenizer(
            judgements, max_length=self.MAX_INPUT_LENGTH, truncation=True
        )
        # `text_target=` replaces the deprecated `as_target_tokenizer()` context
        # manager for tokenizing labels.
        target_encodings = self.tokenizer(
            text_target=summaries, max_length=self.MAX_TARGET_LENGTH, truncation=True
        )

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }

    def convert(self, data_files=None):
        """
        Load the CSV splits, tokenize them in batches and save the dataset.

        Args:
            data_files: optional mapping of split name to CSV path, forwarded to
                `load_dataset`. Defaults to DEFAULT_DATA_FILES.

        The tokenized dataset is written to `<config.root_dir>/legal_summary`.
        """
        if data_files is None:
            # Copy so the class-level default can never be mutated downstream.
            data_files = dict(self.DEFAULT_DATA_FILES)

        dataset_legal = load_dataset("csv", data_files=data_files)
        dataset_legal_pt = dataset_legal.map(self.convert_examples_to_features, batched = True)
        dataset_legal_pt.save_to_disk(os.path.join(self.config.root_dir,"legal_summary"))

In [10]:
# Run the data-transformation stage end to end. There is deliberately no
# try/except here: a handler that only re-raises adds nothing, and any failure
# should surface with its original traceback.
config = configuration_manager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.convert()

[2025-07-05 10:11:46,000] -> [INFO]: common - yaml file: config\config.yaml loaded successfully
[2025-07-05 10:11:46,006] -> [INFO]: common - yaml file: params.yaml loaded successfully
[2025-07-05 10:11:46,010] -> [INFO]: common - created directory at: artifacts


Map: 100%|██████████| 6000/6000 [01:18<00:00, 76.35 examples/s] 
Map: 100%|██████████| 100/100 [00:01<00:00, 54.69 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6000/6000 [00:00<00:00, 22121.88 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 6637.92 examples/s]
