In [1]:
import os
# Move the working directory up one level so that relative paths used later
# resolve from the parent folder.
# NOTE(review): this is not idempotent — re-running the cell without a kernel
# restart climbs one more directory each time.
os.chdir("../")

In [2]:
# Show the current working directory to confirm where relative paths resolve from.
%pwd

'c:\\Users\\sajit\\OneDrive\\Documents\\Desktop\\Pythonn\\Git\\Named-Entity-Recognition-News\\backend'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the YAML config and params files.
    """
    # Directory created for this stage; the transformed dataset is saved beneath it.
    root_dir: Path
    # Name taken from the data-ingestion config; used as the sub-directory name
    # under ``root_dir`` when the transformed dataset is saved.
    dataset_name: str
    # Location of the on-disk dataset that the stage loads as its input.
    data_path: Path
    # Location of the pickled tokenizer that the stage loads.
    tokenizer_path: Path
    # Length that ``input_ids`` and ``labels`` are padded/truncated to.
    params_max_sequence_length: int
    # When True, every id produced for a word receives the word's tag; when
    # False only the first id does and the remaining ids get a filler label.
    params_label_all_tokens: bool

In [4]:
from named_entity_recognition.constants import *
from named_entity_recognition.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Reads the project's YAML config/params files and builds stage configs."""

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: path of the YAML file holding paths and names.
            params_filepath: path of the YAML file holding hyper-parameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage directory and return the data-transformation settings.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the config, the dataset name of
            the ``data_ingestion`` section, and the params file.
        """
        stage_section = self.config.data_transformation
        hyper_params = self.params

        # The stage's output directory must exist before anything is written to it.
        create_directories([stage_section.root_dir])

        return DataTransformationConfig(
            root_dir=Path(stage_section.root_dir),
            dataset_name=self.config.data_ingestion.dataset_name,
            data_path=Path(stage_section.data_path),
            tokenizer_path=Path(stage_section.tokenizer_path),
            params_max_sequence_length=hyper_params.MAX_SEQUENCE_LENGTH,
            params_label_all_tokens=hyper_params.LABEL_ALL_TOKENS,
        )

In [6]:
import os
import pickle
from datasets import load_from_disk
import pickle
from named_entity_recognition import logger
import numpy as np
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class DataTransformation:
    """Turn the saved NER dataset into padded, model-ready features.

    ``transform_data`` runs three batched mapping stages in order: words are
    converted to token ids and the per-word NER tags are aligned with those
    ids; ids and labels are padded/truncated to
    ``config.params_max_sequence_length``; an attention mask is derived from
    the padded ids.
    """

    # Label written at positions that carry no real tag: padding and, when
    # ``params_label_all_tokens`` is false, the extra ids one word expands to.
    # Using one value for both keeps a real class id (0) from being assigned
    # to positions that are not meant to be labelled.
    IGNORE_LABEL_ID = -1
    # Id used to pad ``input_ids``; the attention mask is 0 wherever it occurs.
    PAD_TOKEN_ID = 0

    def __init__(self, config: "DataTransformationConfig"):
        """Store ``config`` and eagerly load the tokenizer from disk."""
        self.config = config
        self.tokenizer = self.load_tokenizer()

    def transform_data(self):
        """Load the dataset, apply the mapping stages and save the result.

        The transformed dataset is written to ``<root_dir>/<dataset_name>``.
        """
        dataset = load_from_disk(self.config.data_path)
        # Order matters: padding consumes the columns produced by tokenisation
        # and the mask is computed from the padded ``input_ids``.
        stages = (
            self.tokenize_and_align_labels,
            self.input_and_label_pad_sequence,
            self.create_attention_mask,
        )
        for stage in stages:
            dataset = dataset.map(stage, batched=True)
        dataset.save_to_disk(os.path.join(self.config.root_dir, self.config.dataset_name))

    def load_tokenizer(self):
        """Unpickle and return the object stored at ``config.tokenizer_path``.

        SECURITY: ``pickle.load`` can execute arbitrary code contained in the
        file. Only point ``tokenizer_path`` at an artifact produced by this
        project's own pipeline, never at an untrusted download.
        """
        with open(self.config.tokenizer_path, 'rb') as f:
            return pickle.load(f)

    def tokenize_and_align_labels(self, examples):
        """Convert each sentence's words to ids and align the NER tags to them.

        Args:
            examples: batch mapping with ``'tokens'`` (one list of word strings
                per sentence) and ``'ner_tags'`` (one tag id per word).

        Returns:
            dict with three per-sentence lists of equal inner length:
            ``'input_ids'`` (flattened ids), ``'word_ids'`` (index of the
            source word for every id) and ``'labels'``. The first id of a word
            carries the word's tag; any further ids of the same word carry the
            tag again when ``params_label_all_tokens`` is true and
            ``IGNORE_LABEL_ID`` otherwise. Words for which the tokenizer
            returns no ids are dropped together with their tag.
        """
        label_all_tokens = self.config.params_label_all_tokens
        batch_input_ids, batch_word_ids, batch_labels = [], [], []

        for words, ner_tags in zip(examples['tokens'], examples['ner_tags']):
            # One id list per word of the sentence.
            ids_per_word = self.tokenizer.texts_to_sequences(words)
            sentence_ids, word_ids, label_ids = [], [], []

            for word_index, ids in enumerate(ids_per_word):
                if not ids:
                    # Nothing was produced for this word: skip it and its tag.
                    continue
                tag = ner_tags[word_index]
                continuation_label = tag if label_all_tokens else self.IGNORE_LABEL_ID

                sentence_ids.extend(ids)
                word_ids.extend([word_index] * len(ids))
                label_ids.append(tag)
                label_ids.extend([continuation_label] * (len(ids) - 1))

            batch_input_ids.append(sentence_ids)
            batch_word_ids.append(word_ids)
            batch_labels.append(label_ids)

        return {'input_ids': batch_input_ids, 'word_ids': batch_word_ids, 'labels': batch_labels}

    def input_and_label_pad_sequence(self, examples):
        """Pad/truncate ``input_ids`` and ``labels`` at the end of each sequence.

        Both columns end up ``config.params_max_sequence_length`` long;
        ``input_ids`` is filled with ``PAD_TOKEN_ID`` and ``labels`` with
        ``IGNORE_LABEL_ID``.
        """
        max_len = self.config.params_max_sequence_length
        pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
        return {
            'input_ids': pad_sequences(examples['input_ids'], maxlen=max_len,
                                       padding='post', truncating='post',
                                       value=self.PAD_TOKEN_ID),
            'labels': pad_sequences(examples['labels'], maxlen=max_len,
                                    padding='post', truncating='post',
                                    value=self.IGNORE_LABEL_ID),
        }

    def create_attention_mask(self, examples):
        """Return a 0/1 mask that is 0 exactly where ``input_ids`` is the pad id.

        NOTE(review): assumes no real token is assigned id ``PAD_TOKEN_ID`` by
        the tokenizer — confirm against how the tokenizer was fitted.
        """
        input_ids = np.array(examples['input_ids'])
        return {"attention_mask": (input_ids != self.PAD_TOKEN_ID).astype(int)}
    

In [8]:
# Run the data-transformation stage end to end: read the configuration, build
# the stage object (which loads the tokenizer) and write the transformed dataset.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.transform_data()
except Exception:
    # Record the failure with its traceback, then re-raise the same exception.
    # A bare ``raise`` keeps the original traceback instead of adding a frame.
    # NOTE(review): assumes ``logger`` is a standard ``logging.Logger`` — confirm.
    logger.exception("Data transformation stage failed")
    raise

[2024-04-24 19:23:59,357: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-04-24 19:23:59,360: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-24 19:23:59,360: INFO: common: created directory at: artifacts]
[2024-04-24 19:23:59,362: INFO: common: created directory at: artifacts/data_transformation]


                                                                                                 