In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

class DataProcessor:
    """Turn raw texts into fixed-length integer sequences with a Keras Tokenizer.

    The instance owns a ``Tokenizer`` plus the padding configuration, and can
    write the tokenizer to disk as JSON and read it back.
    """

    def __init__(self, num_words: int = 10000, oov_token: str = "<OOV>",
                 max_length: int = 100, padding_type: str = 'post',
                 truncating_type: str = 'post'):
        """Create the tokenizer and remember the padding settings.

        Args:
            num_words: Forwarded to ``Tokenizer(num_words=...)``.
            oov_token: Forwarded to ``Tokenizer(oov_token=...)``.
            max_length: Passed as ``maxlen`` when padding sequences.
            padding_type: Passed as ``padding`` when padding sequences.
            truncating_type: Passed as ``truncating`` when padding sequences.
        """
        self.tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
        self.max_length = max_length
        self.padding_type = padding_type
        self.truncating_type = truncating_type

    def fit_tokenizer(self, texts) -> None:
        """Fit the tokenizer's vocabulary on ``texts``."""
        self.tokenizer.fit_on_texts(texts)

    def text_to_sequences(self, texts):
        """Return ``texts`` encoded by the tokenizer's ``texts_to_sequences``."""
        return self.tokenizer.texts_to_sequences(texts)

    def pad_sequences(self, sequences):
        """Pad/truncate ``sequences`` using the configured length and modes."""
        # Inside a method body the bare name resolves to the module-level Keras
        # ``pad_sequences`` function, not to this method, so this is not recursive.
        return pad_sequences(
            sequences,
            maxlen=self.max_length,
            padding=self.padding_type,
            truncating=self.truncating_type,
        )

    def process_texts(self, texts):
        """Encode ``texts`` and pad the result in one call."""
        return self.pad_sequences(self.text_to_sequences(texts))

    def get_word_index(self):
        """Return the tokenizer's ``word_index`` mapping."""
        return self.tokenizer.word_index

    def save_tokenizer(self, path: str = 'tokenizer.json') -> None:
        """Write the tokenizer's JSON representation to ``path``.

        The file is always written as UTF-8 so that it does not depend on the
        platform's default locale encoding.
        """
        tokenizer_json = self.tokenizer.to_json()
        with open(path, 'w', encoding='utf-8') as f:
            f.write(tokenizer_json)

    def load_tokenizer(self, path: str = 'tokenizer.json') -> None:
        """Replace ``self.tokenizer`` with one rebuilt from the JSON at ``path``.

        The file is read as UTF-8, matching ``save_tokenizer``.
        """
        with open(path, encoding='utf-8') as f:
            tokenizer_json = f.read()
        self.tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_json)

# Demo: run a small sample corpus through DataProcessor end to end.
texts = ["Hello, how are you?", "I am fine, thank you!", "What about you?"]

# Build a processor with its default settings and fit its tokenizer on the corpus.
data_processor = DataProcessor()
data_processor.fit_tokenizer(texts)

# Convert the texts to integer sequences, then pad them.
sequences = data_processor.text_to_sequences(texts)
padded_sequences = data_processor.pad_sequences(sequences)

print("Padded Sequences:", padded_sequences, sep="\n")

# Save the tokenizer (default path) for later use.
data_processor.save_tokenizer()

# Load the saved tokenizer into a fresh processor.
new_data_processor = DataProcessor()
new_data_processor.load_tokenizer()

# Check the word index of the reloaded tokenizer.
print("Word Index:", new_data_processor.get_word_index(), sep="\n")


Padded Sequences:
[[ 3  4  5  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]
 [ 6  7  8  9  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]
 [10 11  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]]
Word Index:
{'<OOV>': 1, 'you': 2, 'hello': 3, '