In [1]:
## import the required libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from khmernltk import word_tokenize
import fitz




In [2]:
# Quick sanity check: segment one Khmer sentence into words and show the result.
text = "ការសិក្សាពិតជាមានសារៈសំខាន់ខ្លាំងណាស់"
sample_tokens = word_tokenize(text)
print(sample_tokens)

| 2024-01-21 15:20:17,074 | INFO | khmer-nltk | Loaded model from c:\Users\soyvi\AppData\Local\Programs\Python\Python311\Lib\site-packages\khmernltk\word_tokenize\sklearn_crf_ner_10000.sav |


['ការសិក្សា', 'ពិតជា', 'មាន', 'សារៈសំខាន់', 'ខ្លាំង', 'ណាស់']


### Load text from a document

In [None]:
def read_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        without any separator; an empty string when the document has no pages.
    """
    with fitz.open(file_path) as pdf_document:
        page_texts = (
            pdf_document[page_index].get_text()
            for page_index in range(pdf_document.page_count)
        )
        # Consume the generator while the document is still open.
        return ''.join(page_texts)

def read_txt(file_path):
    """Read a UTF-8 encoded text file and return its whole content.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content as one string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()


In [None]:
# NOTE(review): the path is relative to the notebook's working directory;
# adjust it if the notebook is run from somewhere else.
file_path = "../../Web Mining/kh-search-QNA.txt"
text_from_txt = read_txt(file_path)
# Display only a short preview so the cell output stays readable;
# the full text remains available in `text_from_txt`.
text_from_txt[:500]

### Step 1: Create a corpus

In [None]:
## Create the corpus: split the raw text into sentences on the Khmer full
## stop "។" and strip spaces / newlines from every sentence.
# Characters removed from the text; also reused later when encoding seed text.
ignore_word = [" ", "។", "\n"]

corpus = []
# str.split also yields the text after the final "។", so a last sentence
# without a terminator is kept instead of being silently dropped.
for raw_sentence in text_from_txt.split("។"):
    cleaned_sentence = "".join(ch for ch in raw_sentence if ch not in ignore_word)
    # Skip empty fragments (e.g. consecutive "។" or trailing whitespace).
    if cleaned_sentence:
        corpus.append(cleaned_sentence)
print(corpus)

### Step 2: Train the tokenizer and create word encoding dictionary

In [None]:
# Collect the distinct words of the whole corpus. Updating a set avoids the
# quadratic cost of repeatedly concatenating lists.
unique_words = set()
for line in corpus:
    unique_words.update(word_tokenize(line))

# Sort before shuffling: set iteration order varies between interpreter runs,
# so without this the word -> index mapping would not be reproducible.
tokenize_word = np.array(sorted(unique_words))

# Shuffle the vocabulary with a fixed seed so indices are stable across runs.
np.random.default_rng(42).shuffle(tokenize_word)

# Vocabulary size: number of distinct words + 1 spare class index
# (the original intent was to leave room for an <oov> token).
vocab_size = len(tokenize_word) + 1

print(tokenize_word)
print(vocab_size)

### Remove overly long words

In [None]:
# Words longer than this many characters are dropped from the vocabulary.
max_length = 20
filtered_words = list(filter(lambda word: len(word) <= max_length, tokenize_word))

def remove_longest_word(max_length, tokenize_word):
    """Drop every word whose length exceeds ``max_length``.

    Args:
        max_length: Maximum number of characters a word may have to be kept.
        tokenize_word: Iterable of words to filter.

    Returns:
        A NumPy array with the kept words, in their original order.
    """
    kept_words = []
    for candidate in tokenize_word:
        if len(candidate) > max_length:
            continue
        kept_words.append(candidate)
    return np.array(kept_words)
## Overwrite tokenize_word with the length-filtered vocabulary, then display it.
tokenize_word = remove_longest_word(max_length, tokenize_word)
tokenize_word

### Step 3: Create N-gram sequence

In [None]:
# Create n-gram (prefix) sequences for every sentence of the corpus.
# Word -> index lookup table, built once instead of calling
# list(tokenize_word).index(word) for every single word.
# NOTE(review): indices start at 0, which is also the value pad_sequences
# pads with by default — consider offsetting indices by 1.
word_to_index = {word: idx for idx, word in enumerate(tokenize_word)}

input_sequences = []
for line in corpus:
    # Encode only this line's tokens. The previous code replaced them with the
    # whole vocabulary, so every sentence produced the same sequences.
    # Membership in the already length-filtered vocabulary also discards the
    # over-long words.
    token_ids = [
        word_to_index[word]
        for word in word_tokenize(line)
        if word in word_to_index
    ]
    # One training sample per prefix of length >= 2: the last id is the label.
    for i in range(2, len(token_ids) + 1):
        input_sequences.append(token_ids[:i])

print(input_sequences)

In [None]:
## Pad every n-gram sequence at the front up to the length of the longest one.
sequence_lengths = [len(sequence) for sequence in input_sequences]
max_seq_len = max(sequence_lengths)
padded_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
input_seq_array = np.array(padded_sequences)

### Step 4: Extract features and labels

In [None]:
# The last column of each padded sequence is the word to predict (label);
# all preceding columns are the model input (features).
labels = input_seq_array[:, -1]
x = input_seq_array[:, :-1]
# One-hot encode the labels to obtain the training target y.
y = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

In [None]:
# Show the two sizes that determine the model's input and output dimensions.
print(vocab_size, max_seq_len)

In [None]:
# Stand-alone model holding only an Embedding layer, used to inspect the
# embedding weights independently of the full LSTM model.
model1 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64, input_length=max_seq_len - 1)
])
# Shape of the model's first weight tensor (presumably (vocab_size, 64) — TODO confirm).
model1.weights[0].shape

### Define the LSTM model

In [None]:
# Architecture: embedding -> bidirectional LSTM -> softmax over the vocabulary.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64, input_length=max_seq_len - 1))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
model.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))

# y is one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(x, y, epochs=240, verbose=1)

### Visualize metrics

In [None]:
import matplotlib.pyplot as plt

def plot_metric(history, metric):
    """Plot one training metric against the epoch number.

    Args:
        history: Object exposing a ``history`` mapping from metric name to
            its per-epoch values (e.g. the value returned by ``model.fit``).
        metric: Key of the metric to plot, also used as the y-axis label.
    """
    per_epoch_values = history.history[metric]
    plt.plot(per_epoch_values)
    plt.ylabel(metric)
    plt.xlabel("Epochs")
    plt.show()

In [None]:
# Plot the training accuracy recorded for each epoch.
plot_metric(history=history, metric='accuracy')

### Generate new text

In [None]:
seed_text = "ធ្វើដូចម្តេងទើងយើងស៊ីឯបនឆ្ងាញ់ពីសា"

seed_text = [seed_text]

# Word -> index lookup table for the trained vocabulary.
word_to_index = {word: idx for idx, word in enumerate(tokenize_word)}

test_sequence = []

for line in seed_text:
    # Ignore tokens the model has never seen; looking them up with
    # list.index() raised ValueError for any out-of-vocabulary word.
    token_ids = [
        word_to_index[word]
        for word in word_tokenize(line)
        if word in word_to_index
    ]
    # Same prefix construction as used for the training sequences.
    for i in range(2, len(token_ids) + 1):
        test_sequence.append(token_ids[:i])

if test_sequence:
    token_pad = pad_sequences(test_sequence, maxlen=max_seq_len - 1, padding='pre')
    predictd = model.predict(token_pad, verbose=0)
    # Highest probability overall and the predicted class index per prefix.
    print(np.max(predictd), np.argmax(predictd, axis=-1))
else:
    # Fewer than two known tokens: there is no prefix to feed to the model.
    print("Seed text has fewer than two in-vocabulary tokens; nothing to predict.")

In [None]:
def text_to_sequence(text, tokenize_word):
    """Encode ``text`` as a list of vocabulary indices.

    The text is segmented with ``word_tokenize``. Tokens listed in the
    module-level ``ignore_word`` are skipped. A token that is not in the
    vocabulary is retried glued to the token right before it
    (``previous + token``); if that is unknown too, the token is dropped.

    Args:
        text: Text to encode.
        tokenize_word: Sequence of vocabulary words; a word's position in
            this sequence is its index.

    Returns:
        List of integer indices for the whole text. The list is empty when
        no token can be encoded (previously a text with fewer than two
        tokens raised IndexError).
    """
    # Build the lookup once; setdefault keeps the first position of a word,
    # matching what list.index() returned.
    word_to_index = {}
    for idx, vocab_word in enumerate(tokenize_word):
        word_to_index.setdefault(vocab_word, idx)

    # Only the encoding of the full token list was ever returned, so encode
    # it directly instead of building every prefix and keeping the last one.
    encoded = []
    previous_word = ""
    for word in word_tokenize(text):
        if word not in ignore_word:
            if word in word_to_index:
                encoded.append(word_to_index[word])
            else:
                merged_word = previous_word + word
                if merged_word in word_to_index:
                    encoded.append(word_to_index[merged_word])
        # Remember every token, ignored ones included, for the merge retry.
        previous_word = word
    return encoded

In [None]:
seed_text = "ធ្វើដូចម្តេងទើងយើងស៊ីឯបនឆ្ងាញពីសា"


## add number of words you want to predict
next_words = 25

## Predict one word at a time and append it to the seed text, so each new
## prediction is conditioned on everything generated so far.

for _ in range(next_words):
    token_list = text_to_sequence(seed_text, tokenize_word)
    token_pad = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')
    # predict the class using the trained model
    predicted = model.predict(token_pad, verbose=0)
    higest_prediction = np.argmax(predicted)
    # Map the class index straight back to its word instead of scanning the
    # vocabulary with list.index() for every candidate. A class index with no
    # word behind it still yields an empty string, as before.
    if higest_prediction < len(tokenize_word):
        output_word = str(tokenize_word[higest_prediction])
    else:
        output_word = ""

    seed_text += " " + output_word
print(seed_text)