In [1]:
import os

In [2]:
# Show the kernel's working directory; the notebook starts inside research/.
%pwd

'c:\\Users\\prati\\Desktop\\Project\\English-to-Hindi-Translator\\research'

In [3]:
# Step up to the project root so that `src.*` imports and the relative
# config/artifact paths resolve. The guard makes the cell idempotent: once the
# `src` package directory is visible from the working directory, re-running
# this cell no longer climbs another level up the tree.
if not os.path.isdir("src"):
    os.chdir("../")

In [4]:
# Confirm the working directory is now the project root.
%pwd

'c:\\Users\\prati\\Desktop\\Project\\English-to-Hindi-Translator'

In [10]:
from src.translator.constants import *
from src.translator.utils.common import read_yaml, create_directories

In [32]:
#update in entity
from dataclasses import dataclass
from pathlib import Path
from transformers import AutoTokenizer
import yaml
import pandas as pd
from datasets import load_dataset

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the YAML config.

    NOTE(review): the path fields are annotated ``Path``, but the printed repr
    in this notebook shows plain strings, so no conversion happens on the way
    in. Dataclasses do not enforce annotations.
    """
    # Output directory of this stage; created by the configuration manager.
    root_dir: Path
    # Location passed to `load_from_disk` to read the raw dataset.
    data_path: Path
    # Name handed to `AutoTokenizer.from_pretrained`.
    tokenizer_name: str
    # Presumably the identifier of the source dataset -- TODO confirm.
    source: str
    

In [33]:
#update configuration manager
class ConfigurationManager:
    """Reads the project YAML files and builds per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the params YAML.
        """
        # Config is read before params, matching the order of the log lines.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Return the settings of the ``data_transformation`` config section.

        The stage's ``root_dir`` is created as a side effect.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
            source=section.source,
        )

In [34]:
from datasets import load_from_disk
# Build the stage configuration and show it for inspection.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
print(data_transformation_config)

# Raw dataset written by the ingestion stage.
dataset = load_from_disk(data_transformation_config.data_path)

# The stand-alone `preprocess_function` in this notebook reads a module-level
# `tokenizer`. Create it here so the notebook survives Restart & Run All
# instead of relying on a variable left over from an earlier kernel session.
tokenizer = AutoTokenizer.from_pretrained(data_transformation_config.tokenizer_name)
#data_transformation = DataTransformation(config=data_transformation_config)



[2023-08-10 22:40:58,903: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-08-10 22:40:58,909: INFO: common: yaml file: params.yaml loaded successfully]
[2023-08-10 22:40:58,913: INFO: common: created directory at: artifacts]
[2023-08-10 22:40:58,916: INFO: common: created directory at: artifacts/data_transformation]
DataTransformationConfig(root_dir='artifacts/data_transformation', data_path='artifacts/data_ingestion', tokenizer_name='Helsinki-NLP/opus-mt-en-hi', source='cfilt/iitb-english-hindi')


In [35]:
# Display the loaded DatasetDict (splits, features, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [26]:
# Truncation limits (in tokens) for the source and target sequences.
max_input_length = 128
max_target_length = 128

# Keys of each "translation" record: source text and target text.
source_lang = "en"
target_lang = "hi"

def preprocess_function(examples):
    """Tokenize a batch of translation pairs into model inputs and labels.

    Relies on a module-level ``tokenizer`` that must already exist in the
    kernel when this function is called.

    Args:
        examples: Batch mapping whose ``"translation"`` entry is a list of
            dicts keyed by language code (``source_lang`` / ``target_lang``).

    Returns:
        The tokenizer output for the source sentences, with an added
        ``"labels"`` entry holding the token ids of the target sentences.
    """
    pairs = examples["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: the tokenizer switches to target mode for this call only. A
    # separate call keeps the target-side length limit independent of the
    # source-side one.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [45]:
# Sanity check: tokenize the first two training examples and inspect the ids.
preprocess_function(dataset["train"][:2])



{'input_ids': [[3872, 85, 2501, 132, 15441, 36398, 0], [32643, 28541, 36253, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]], 'labels': [[63, 2025, 18, 16155, 346, 20311, 24, 2279, 679, 0], [26618, 16155, 346, 33383, 0]]}

In [48]:
# Tokenize the whole DatasetDict; batched=True hands preprocess_function a
# batch of examples per call rather than a single row.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [36]:
import os
from transformers import AutoTokenizer
from datasets import load_from_disk
from src.translator.logging import logger

class DataTransformation:
    """Tokenizes translation pairs and persists the tokenized dataset.

    The truncation limits and language keys are class attributes so they are
    defined once and can be overridden on a subclass or instance, instead of
    being rebuilt inside every ``preprocess_function`` call.
    """

    # Truncation limits (in tokens) for the source and target sequences.
    max_input_length = 128
    max_target_length = 128

    # Keys of each "translation" record: source text and target text.
    source_lang = "en"
    target_lang = "hi"

    def __init__(self, config: "DataTransformationConfig"):
        """Store the config and load the tokenizer named in it.

        Args:
            config: Stage settings; ``tokenizer_name``, ``source`` and
                ``root_dir`` are read here.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
        self.source = config.source
        self.save_dir = config.root_dir

    def preprocess_function(self, examples):
        """Tokenize a batch of translation pairs into model inputs and labels.

        Args:
            examples: Batch mapping whose ``"translation"`` entry is a list of
                dicts keyed by language code.

        Returns:
            The tokenizer output for the source sentences, with an added
            ``"labels"`` entry holding the token ids of the target sentences.
        """
        pairs = examples["translation"]
        inputs = [pair[self.source_lang] for pair in pairs]
        targets = [pair[self.target_lang] for pair in pairs]
        model_inputs = self.tokenizer(
            inputs, max_length=self.max_input_length, truncation=True
        )

        # `text_target=` replaces the deprecated `as_target_tokenizer()`
        # context manager. A separate call keeps the target-side length limit
        # independent of the source-side one.
        labels = self.tokenizer(
            text_target=targets, max_length=self.max_target_length, truncation=True
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def save_tokenized_datasets(self, tokenized_datasets):
        """Write ``tokenized_datasets`` to ``self.save_dir``, creating it if needed."""
        os.makedirs(self.save_dir, exist_ok=True)
        tokenized_datasets.save_to_disk(self.save_dir)



In [38]:
from datasets import load_from_disk
# Rebuild the stage configuration and print it for inspection.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
print(data_transformation_config)
# Reload the raw dataset from the configured data path.
dataset = load_from_disk(data_transformation_config.data_path)
#data_transformation = DataTransformation(config=data_transformation_config)



[2023-08-10 22:42:45,926: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-08-10 22:42:45,932: INFO: common: yaml file: params.yaml loaded successfully]
[2023-08-10 22:42:45,936: INFO: common: created directory at: artifacts]
[2023-08-10 22:42:45,940: INFO: common: created directory at: artifacts/data_transformation]
DataTransformationConfig(root_dir='artifacts/data_transformation', data_path='artifacts/data_ingestion', tokenizer_name='Helsinki-NLP/opus-mt-en-hi', source='cfilt/iitb-english-hindi')


In [39]:
# End-to-end run of the transformation stage: build the config, load the raw
# dataset, tokenize every split and persist the result.
# No try/except here: a handler that only re-raises adds nothing, so failures
# surface directly with their original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
raw_dataset = load_from_disk(data_transformation_config.data_path)

tokenized_datasets = raw_dataset.map(data_transformation.preprocess_function, batched=True)

# Save tokenized datasets to the specified directory
data_transformation.save_tokenized_datasets(tokenized_datasets)

[2023-08-10 22:43:24,839: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-08-10 22:43:24,845: INFO: common: yaml file: params.yaml loaded successfully]
[2023-08-10 22:43:24,849: INFO: common: created directory at: artifacts]
[2023-08-10 22:43:24,853: INFO: common: created directory at: artifacts/data_transformation]


Map:  16%|█▌        | 268000/1659083 [00:42<07:29, 3093.55 examples/s]