# Dictionary

In [None]:
# Maps the spoken (Vietnamese) name of each punctuation mark to its label string.
# Each label is one numeric id repeated once per word of the phrase, separated by
# spaces (a three-word phrase -> "1 1 1", a one-word phrase -> "11").
# Insertion order matters to order-sensitive consumers (regex alternation built
# from the keys, sequential str.replace over the items): keep a longer phrase
# ahead of any shorter key it begins with, e.g. "chấm phẩy" before "chấm".
replacement_dict_labels = {
  "mở ngoặc đơn": "1 1 1",
  "đóng ngoặc đơn": "2 2 2",
  "mở ngoặc nhọn": "3 3 3",
  "đóng ngoặc nhọn": "4 4 4",
  "mở ngoặc vuông": "5 5 5",
  "đóng ngoặc vuông": "6 6 6",
  "gạch ngang trên": "7 7 7",
  "gạch ngang dưới": "8 8 8",
  "hai chấm": "9 9",
  "chấm phẩy": "10 10",
  "phẩy": "11",
  "lớn hơn": "12 12",
  "bé hơn": "13 13",
  "chấm hỏi": "14 14",
  "chấm than": "15 15",
  "a còng": "16 16",
  "dấu thăng": "17 17",
  "phần trăm": "18 18",
  "ba chấm": "19 19",
  "chấm": "20",
  "bằng": "21",
  "xuyệt trái": "22 22",
  "xuyệt phải": "23 23"
}

# Maps each punctuation symbol to its spoken (Vietnamese) name.
# Every key is a single character except the three-dot ellipsis "...".
replacement_dict = {
  r"(": "mở ngoặc đơn",
  r")": "đóng ngoặc đơn",
  r"{": "mở ngoặc nhọn",
  r"}": "đóng ngoặc nhọn",
  r"[": "mở ngoặc vuông",
  r"]": "đóng ngoặc vuông",
  r"-": "gạch ngang trên",
  r"_": "gạch ngang dưới",
  r":": "hai chấm",
  r";": "chấm phẩy",
  r",": "phẩy",
  r">": "lớn hơn",
  r"<": "bé hơn",
  r"?": "chấm hỏi",
  r"!": "chấm than",
  r"@": "a còng",
  r"#": "dấu thăng",
  r"%": "phần trăm",
  r"...": "ba chấm",
  r".": "chấm",
  r"=": "bằng",
  r"/": "xuyệt trái",
  # One backslash character. A raw string cannot end in a backslash, and r"\\"
  # is TWO characters, which a single backslash in the text can never match,
  # so the key is written as a regular (escaped) string literal.
  "\\": "xuyệt phải"
}

# Data processing

## Convert text to numbers

In [None]:
import re
def replace_special_characters_regex(text):
  """Spell out every punctuation symbol of `text` using `replacement_dict`.

  Each symbol that is a key of `replacement_dict` is replaced by its spoken
  name surrounded by spaces; the result is then passed through `delete_space`.

  The pattern is built from the dictionary keys themselves, so the regex and
  the dictionary cannot drift apart, and each symbol is matched on its own:
  a run such as "?!" or ")," is converted symbol by symbol instead of being
  looked up as one (unknown) string and left untouched.

  Args:
    text: input string.

  Returns:
    The text with symbols spelled out and whitespace normalised by
    `delete_space`.
  """
  # Longest keys first so that "..." is tried before the single "." it contains.
  symbols = sorted(replacement_dict, key=len, reverse=True)
  regex = re.compile("|".join(re.escape(symbol) for symbol in symbols))
  # Every match is a dictionary key by construction, so plain indexing is safe.
  txt = regex.sub(lambda x: f' {replacement_dict[x.group()]} ', text)
  return delete_space(txt)

def delete_space(text):
    """Trim `text` and squeeze every run of spaces into a single space.

    Leading/trailing whitespace of any kind is removed by `str.strip`; inside
    the string only runs of the space character are collapsed (tabs and
    newlines in the middle are left as they are).
    """
    return re.sub(' +', ' ', text.strip())

def word_to_number(text):
  """Replace every spoken punctuation phrase in `text` with its numeric label.

  Phrases and labels come from `replacement_dict_labels`. Matching is plain
  substring matching (no word boundaries), left to right.

  Args:
    text: string containing spelled-out punctuation phrases.

  Returns:
    `text` with each phrase replaced by its label string.
  """
  # Longest phrases first so a phrase always wins over a shorter key it starts
  # with, whatever the dictionary's insertion order; keys are escaped so a
  # regex metacharacter in a future key cannot change the pattern's meaning.
  phrases = sorted(replacement_dict_labels, key=len, reverse=True)
  pattern = "(" + "|".join(re.escape(phrase) for phrase in phrases) + ")"
  return re.sub(pattern, lambda x: replacement_dict_labels[x.group()], text)

def lower_string(text):
    """Return a lower-cased copy of `text` (delegates to `text.lower()`)."""
    lowered = text.lower()
    return lowered

def text_to_0(sen):
  """Mask every non-numeric token of `sen` with '0'.

  `sen` is split on whitespace; tokens for which `str.isnumeric()` is true are
  kept, every other token becomes '0'.

  Args:
    sen: whitespace-separated string.

  Returns:
    The masked tokens joined by single spaces, with no leading or trailing
    space. Empty or whitespace-only input yields '' (it used to raise
    UnboundLocalError because the result was only assigned inside the loop).
  """
  return ' '.join(token if token.isnumeric() else '0' for token in sen.split())

def num_to_0(text):
  """Mask every numeric token of `text` with '0'.

  `text` is split on whitespace; tokens for which `str.isnumeric()` is true
  become '0', every other token is kept.

  Args:
    text: whitespace-separated string.

  Returns:
    The tokens joined by single spaces, with no leading or trailing space.
    Empty or whitespace-only input yields '' (it used to raise
    UnboundLocalError because the result was only assigned inside the loop).
  """
  return ' '.join('0' if token.isnumeric() else token for token in text.split())

def convert_text_to_num(text):
  """Run the text-to-label pipeline on `text`.

  Stages, in order: `replace_special_characters_regex` -> `lower_string` ->
  `num_to_0` -> `word_to_number` -> `text_to_0`.

  Returns:
    `text + txt1 + '\n'`, i.e. the unmodified input immediately followed by
    the output of the FIRST stage only, plus a newline.

  NOTE(review): txt2..txt5 are computed but never reach the return value, yet
  any exception they raise still propagates. Confirm whether the function is
  meant to return `txt5` (the numeric label sequence) or whether the later
  stages should be removed.
  """
  txt1 = replace_special_characters_regex(text)  # punctuation -> spoken names
  txt2 = lower_string(txt1)
  txt3 = num_to_0(txt2)        # numeric tokens -> '0'
  txt4 = word_to_number(txt3)  # spoken names -> numeric labels
  txt5 = text_to_0(txt4)       # remaining non-numeric tokens -> '0' (unused below)
  return  text + txt1 + '\n'

# Disabled usage example for convert_text_to_num, kept as a bare string literal
# instead of comments. NOTE(review): as the last expression of the cell it is
# echoed back as the cell's output rather than being executed.
'''text='Chây ì nộp phạt nguội 12/2.'
print(convert_text_to_num(text))'''

"text='Chây ì nộp phạt nguội 12/2.'\nprint(convert_text_to_num(text))"

## Convert numbers to text

In [None]:
def replace_lables_to_word(text):
  """Decode numeric labels in `text` back into spoken punctuation phrases.

  `text` is split on whitespace and decoded token by token against
  `replacement_dict_labels`: at each position the longest run of tokens that
  equals a label (e.g. "1 1 1") is replaced by its phrase. Tokens that match
  no label (such as the '0' placeholders) are kept unchanged.

  Working on whole tokens avoids the cross-token collisions of raw substring
  replacement, where "11 1 1 1" (label "11" followed by label "1 1 1") was
  turned into "1mở ngoặc đơn 1".

  Args:
    text: whitespace-separated label sequence.

  Returns:
    The decoded tokens joined by single spaces ('' for empty input).
  """
  # label as a tuple of tokens -> phrase, e.g. ("1", "1", "1") -> "mở ngoặc đơn"
  phrase_by_tokens = {
      tuple(label.split()): phrase
      for phrase, label in replacement_dict_labels.items()
  }
  longest = max((len(key) for key in phrase_by_tokens), default=0)

  tokens = text.split()
  decoded = []
  pos = 0
  while pos < len(tokens):
    # Try the widest window first so multi-token labels are consumed whole.
    for width in range(min(longest, len(tokens) - pos), 0, -1):
      phrase = phrase_by_tokens.get(tuple(tokens[pos:pos + width]))
      if phrase is not None:
        decoded.append(phrase)
        pos += width
        break
    else:
      # No label starts here: keep the token as it is.
      decoded.append(tokens[pos])
      pos += 1
  return ' '.join(decoded)

def form_0_to_text(num, text):
  """Fill the '0' placeholders of `num` with the tokens of `text`.

  Both strings are split on whitespace. Every token of `num` equal to '0' is
  replaced by the token of `text` at the SAME position; all other tokens of
  `num` are kept.

  NOTE(review): the two sequences are index-aligned (the original advanced its
  `text` counter on every token, not only on '0'). If `text` is meant to hold
  only the words for the '0' slots, a separate counter is needed — confirm
  against the caller.

  Args:
    num: whitespace-separated sequence containing '0' placeholders.
    text: whitespace-separated sequence supplying the replacement tokens.

  Returns:
    The merged tokens joined by single spaces. Empty `num` yields '' (it used
    to raise UnboundLocalError because the result was only assigned inside
    the loop).

  Raises:
    IndexError: if a '0' occurs at a position beyond the last token of `text`.
  """
  slots = num.split()
  words = text.split()
  return ' '.join(
      words[pos] if slot == '0' else slot
      for pos, slot in enumerate(slots)
  )

def back_to_original_regex(text):
  """Turn spoken punctuation names in `text` back into symbols and fix spacing.

  Step 1 replaces every value of `replacement_dict` found in `text` (plain
  substring match, alternatives tried in dictionary order) with the first key
  that maps to it. Step 2 applies the spacing rules below, in order.

  The result is not stripped, so it may start or end with a single space.
  """
  def _symbol_for(match):
    # First key whose spoken name equals the matched phrase.
    spoken = match.group()
    return next(symbol for symbol, name in replacement_dict.items() if name == spoken)

  alternatives = [re.escape(name) for name in replacement_dict.values()]
  restored = re.sub("(" + "|".join(alternatives) + ")", _symbol_for, text)

  # (pattern, replacement) pairs; the order is significant because later rules
  # re-space characters handled by earlier ones ('<' and '>' end up padded on
  # both sides by the fifth rule).
  spacing_rules = (
      (r'\s*([\(\[\{<])\s*', r' \1'),    # openers: one space before, none after
      (r'\s*([\)\]\}>])\s*', r'\1 '),    # closers: none before, one space after
      (r'\s*([:;,_?!%.])\s*', r'\1 '),   # attach to the left, one space after
      (r'\s*([@#/\\])\s*', r'\1'),       # no whitespace on either side
      (r'\s*([-<>=])\s*', r' \1 '),      # one space on both sides
      (' +', ' '),                       # collapse runs of spaces
  )
  for rule, replacement in spacing_rules:
    restored = re.sub(rule, replacement, restored)
  return restored

def num_to_text(text, num):
  """Rebuild a punctuated sentence from `text` and the label sequence `num`.

  Composition of three steps: `replace_lables_to_word(num)` decodes the
  labels, `form_0_to_text(..., text)` fills the '0' placeholders from `text`,
  and `back_to_original_regex` converts spoken names back to symbols.
  """
  return back_to_original_regex(form_0_to_text(replace_lables_to_word(num), text))

# Prepare data

In [None]:
def process_file(input_file, output_file):
    """Convert `input_file` line by line and write the result to `output_file`.

    Both files are opened as UTF-8. The input is read completely and every
    line is passed through `convert_text_to_num` before the output file is
    opened (and truncated) for writing.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        raw_lines = list(source)

    converted = []
    for raw_line in raw_lines:
        converted.append(convert_text_to_num(raw_line))

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.write(''.join(converted))

# Build 'title_num.txt' from 'demo-title.txt'; both paths are relative to the
# current working directory.
process_file('demo-title.txt', 'title_num.txt')

# Model BiLSTM

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

# --- Corpus: every line of the prepared file is one training sample ---
with open('title_num.txt', 'r', encoding='utf-8') as file:
    content = file.readlines()

# --- Tokenisation: fit the vocabulary on the corpus and encode each line ---
tokenizer = Tokenizer()
tokenizer.fit_on_texts(content)
sequences = tokenizer.texts_to_sequences(content)
# Keras word indices start at 1; index 0 is left for padding, hence the +1.
vocab_size = 1 + len(tokenizer.word_index)

# --- Inputs / targets: the target is the input shifted left by one token ---
X, y = [], []
for seq in sequences:
    if len(seq) <= 1:
        # A sequence of fewer than two tokens has no (input, next-token) pair.
        continue
    X.append(seq[:-1])
    y.append(seq[1:])

# --- Right-pad (with 0) to the longest input so the data forms a matrix ---
max_len = max(len(row) for row in X)
X = pad_sequences(X, maxlen=max_len, padding='post')
y = pad_sequences(y, maxlen=max_len, padding='post')

# --- One-hot encode the targets over the vocabulary ---
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

# --- Model: embedding -> bidirectional LSTM -> per-timestep softmax ---
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_len),
    Bidirectional(LSTM(150, return_sequences=True)),
    Dense(vocab_size, activation='softmax'),
])

# --- Training ---
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=30, verbose=1)

# --- Settings for text generation ---
# NOTE(review): not used anywhere in this cell; presumably consumed by a later cell.
seed_text = "Chây ì nộp phạt nguội."
next_words = 20

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30