In [1]:
import os

In [2]:
# Show the kernel's working directory before it is changed in the next cell.
%pwd

'c:\\Users\\rahul\\Desktop\\Project\\CaptionAI\\research'

In [3]:
# The notebook lives in research/; step up one level so project-relative paths
# (e.g. config/config.yaml, artifacts/...) resolve. The guard keeps the cell
# idempotent: re-running it will not climb out of the project root.
if os.path.basename(os.getcwd()).lower() == "research":
    os.chdir("../")

In [4]:
# Confirm the working directory after the chdir above.
%pwd

'c:\\Users\\rahul\\Desktop\\Project\\CaptionAI'

Entity

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TokenizationConfig:
    """Immutable settings bundle for the tokenization stage.

    Instances are frozen: assigning to a field after construction raises.
    Field order is part of the positional constructor signature.
    """

    # --- filesystem locations ---
    root_dir: Path        # directory holding this stage's artifacts
    token_file: Path      # file the pickled vocabulary is written to
    caption_file: Path    # CSV whose "caption" column is tokenized

    # --- tokenizer selection ---
    tokenizer_type: str   # forwarded to torchtext's get_tokenizer

    # --- special-token strings ---
    unk_token: str        # unknown-word token; also the vocab's default index
    pad_token: str
    sos_token: str
    eos_token: str

Configuration

In [6]:
from CaptionAI.constants import *
from CaptionAI.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(self,
                 config_file_path = CONFIG_FILE_PATH,
                 params_file_path = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_file_path: Location of the main configuration YAML.
            params_file_path: Location of the parameters YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        create_directories([self.config.artifacts_root])

    def get_tokenization_config(self) -> TokenizationConfig:
        """Build the tokenization settings from the ``tokenization`` section.

        The stage's root directory is passed to ``create_directories`` as a
        side effect. The special-token strings are fixed here rather than
        read from the YAML.

        Returns:
            TokenizationConfig: Frozen settings for the tokenization stage.
        """
        section = self.config.tokenization
        create_directories([section.root_dir])

        # The entity declares these three fields as Path, but the YAML yields
        # plain strings; coerce so the values match the declared types.
        return TokenizationConfig(
            root_dir = Path(section.root_dir),
            token_file = Path(section.token_file),
            caption_file = Path(section.caption_file),
            tokenizer_type = section.tokenizer_type,
            unk_token = "<unk>",
            pad_token = "<pad>",
            sos_token = "<sos>",
            eos_token = "<eos>",
        )

Components

In [8]:
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer
from CaptionAI import logger
import pandas as pd
import pickle

from collections import Counter

In [9]:
class Tokenization:
    """Build a token vocabulary from a captions CSV and pickle it to disk.

    Typical use: ``init_tokenizer()`` -> ``build_vocab()`` ->
    ``save_vocab_pickle()``.
    """

    def __init__(self, config: TokenizationConfig):
        """Store the settings; tokenizer and vocab are created by later calls.

        Args:
            config: Paths, tokenizer type and special-token strings.
        """
        self.config = config
        # Declare lazily-populated state up front so methods can detect an
        # out-of-order call instead of failing on a missing attribute.
        self.tokenizer = None
        self.counter = Counter()
        self.vocab = None

    def init_tokenizer(self):
        """Create the tokenizer named by ``config.tokenizer_type`` and reset counts."""
        logger.info("Initializing the Tokenizer.")
        self.tokenizer = get_tokenizer(self.config.tokenizer_type)
        self.counter = Counter()

    def build_vocab(self, min_freq: int = 5):
        """Count caption tokens and build the vocabulary.

        Reads the ``caption`` column of ``config.caption_file``, tokenizes
        every non-missing caption, and builds a vocab from tokens seen at
        least ``min_freq`` times. The special tokens are then inserted at
        indices 0-3 (unk, pad, sos, eos) and unk becomes the default index.

        Args:
            min_freq: Minimum occurrence count for a token to be kept.
        """
        # Allow build_vocab() to be called without a prior init_tokenizer().
        if self.tokenizer is None:
            self.init_tokenizer()

        logger.info("Building the vocab.")
        captions = pd.read_csv(self.config.caption_file)["caption"]

        # Start from a fresh counter so re-running this method (e.g. re-running
        # the notebook cell) does not double-count and shift the min_freq cut.
        self.counter = Counter()
        # Missing captions are read as NaN (a float), which the tokenizer
        # cannot process; skip them.
        for caption in captions.dropna().tolist():
            self.counter.update(self.tokenizer(caption))

        special_tokens = (
            self.config.unk_token,
            self.config.pad_token,
            self.config.sos_token,
            self.config.eos_token,
        )
        # insert_token raises if the token is already in the vocab, so drop
        # any literal special token that happened to appear in the captions.
        for token in special_tokens:
            self.counter.pop(token, None)

        self.vocab = vocab(self.counter, min_freq = min_freq)

        for index, token in enumerate(special_tokens):
            self.vocab.insert_token(token, index)

        self.vocab.set_default_index(self.vocab[self.config.unk_token])

        logger.info("Finished Creating the vocab.")

    def save_vocab_pickle(self):
        """Pickle the built vocabulary to ``config.token_file``.

        Raises:
            RuntimeError: If ``build_vocab()`` has not been called yet.
        """
        # Check before opening: "wb" truncates the target, which would
        # destroy a previously saved vocab if there is nothing to write.
        if self.vocab is None:
            raise RuntimeError("build_vocab() must be called before save_vocab_pickle().")

        with open(self.config.token_file, "wb") as f:
            pickle.dump(self.vocab, f)
        logger.info(f"Vocabulary saved to {self.config.token_file}.")

Pipeline

In [10]:
# Run the tokenization stage end to end: load the configuration, build the
# vocabulary from the captions file, and pickle it. Any exception propagates
# as-is; the previous catch-and-reraise only added a traceback frame.
config = ConfigurationManager()
tokenization_config = config.get_tokenization_config()
tokenization = Tokenization(config = tokenization_config)
tokenization.init_tokenizer()
tokenization.build_vocab()
tokenization.save_vocab_pickle()

[2024-12-07 22:42:35,959: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-12-07 22:42:35,962: INFO: common: yaml file: params.yaml loaded successfully]
[2024-12-07 22:42:35,963: INFO: common: created directory at: artifacts]
[2024-12-07 22:42:35,965: INFO: common: created directory at: artifacts/tokenization]
[2024-12-07 22:42:35,966: INFO: 789261004: Initializing the Tokenizer.]
[2024-12-07 22:42:35,967: INFO: 789261004: Building the vocab.]
[2024-12-07 22:42:36,560: INFO: 789261004: Finished Creating the vocab.]
[2024-12-07 22:42:36,563: INFO: 789261004: Vocabulary saved to artifacts/tokenization/data.]
