In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, os, string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense
import tensorflow.keras.backend as K

# Finish script for pipeline in slide 28

In [None]:
# Characters deleted from every line: ASCII punctuation, ASCII digits and newlines.
Punc2Remove = string.punctuation + string.digits + "\n"

# Built once instead of on every clean_text call; str.translate with this
# table deletes each character listed in Punc2Remove.
_PUNC_TRANSLATION_TABLE = str.maketrans('', '', Punc2Remove)

def clean_text(text):
    """Strip punctuation, digits and newlines from ``text`` and lowercase it.

    Parameters
    ----------
    text : str
        A raw line of text.

    Returns
    -------
    str
        The cleaned, lowercased line. Spaces are not touched, so a space
        that followed removed characters (e.g. after a line number) is kept.
    """
    return text.translate(_PUNC_TRANSLATION_TABLE).lower()

# Read the corpus; every element of `data` is one raw line, newline included.
with open('truyen_kieu_data.txt', mode='r', encoding='utf-8') as file:
    data = list(file)

# Sanity check: show the first raw line next to its cleaned version.
first_raw_line = data[0]
print(f"Data: {first_raw_line}")
exs = clean_text(first_raw_line)
print(f"Cleaned text: {exs}")

Data: 1..Trăm năm trong cõi người ta,

Cleaned text: trăm năm trong cõi người ta


In [None]:
# Clean every raw line, keeping the original line order.
cleaned_data = [clean_text(raw_line) for raw_line in data]

# Preview the first ten cleaned lines.
cleaned_data[:10]

['trăm năm trong cõi người ta',
 'chữ tài chữ mệnh khéo là ghét nhau',
 'trải qua một cuộc bể dâu',
 'những điều trông thấy mà đau đớn lòng',
 ' lạ gì bỉ sắc tư phong',
 'trời xanh quen thói má hồng đánh ghen',
 'cảo thơm lần giở trước đèn',
 'phong tình có lục còn truyền sử xanh',
 'rằng năm gia tĩnh triều minh',
 ' bốn phương phẳng lặng hai kinh vững vàng']

In [None]:

def get_centers_and_contexts(corpus, max_window_size=2):
    """Extract (center word, context string) pairs from whitespace-tokenised lines.

    A word is used as a center only when it has a full window on both sides,
    i.e. ``max_window_size`` words to its left and to its right; words closer
    to either end of a line are never centers, and lines with at most
    ``2 * max_window_size`` words contribute nothing.

    Parameters
    ----------
    corpus : iterable of str
        Lines of text; each is split on whitespace.
    max_window_size : int, default 2
        Number of context words taken on each side of the center
        (expected to be non-negative).

    Returns
    -------
    tuple[list[str], list[str]]
        ``centers[k]`` is a center word and ``contexts[k]`` its surrounding
        words joined by single spaces, left neighbours first.
    """
    span = max_window_size
    centers, contexts = [], []

    for sentence in corpus:
        tokens = sentence.split()
        # For lines that are too short this range is empty, so they are skipped.
        for pos in range(span, len(tokens) - span):
            neighbours = tokens[pos - span:pos] + tokens[pos + 1:pos + span + 1]
            centers.append(tokens[pos])
            contexts.append(" ".join(neighbours))

    return centers, contexts

In [None]:
# Build the (center, context) training pairs from the cleaned lines,
# using the function's default window of 2 words on each side.
centers, contexts = get_centers_and_contexts(cleaned_data)
# Number of training pairs produced.
len(centers)

9778

In [None]:
# Fixed length of every context sequence (passed as `maxlen` to pad_sequences);
# 4 corresponds to 2 context words on each side of the center word.
max_length = 4
# Size of each word vector (passed as `output_dim` to the Embedding layer).
embedding_size = 250

In [None]:
# Word-level tokenizer; '<OOV>' is the placeholder token for out-of-vocabulary words.
tokenizer = Tokenizer(oov_token='<OOV>')
# Build the vocabulary from all cleaned lines (not only the extracted contexts).
tokenizer.fit_on_texts(cleaned_data)

In [None]:
# Number of indexed tokens plus one; used as the Embedding input_dim and the output layer width.
# NOTE(review): the +1 presumably accounts for index 0, which Keras does not assign to any word
# and which pad_sequences uses as its default padding value — confirm against the Tokenizer docs.
vocab_size = len(tokenizer.index_word) + 1

In [None]:
# Convert each context string to a list of token ids, then force a fixed length of `max_length`.
train_seq = tokenizer.texts_to_sequences(contexts)
train_seq_pad = pad_sequences(train_seq, maxlen=max_length, truncating='post', padding='post')

# One-hot targets for the center words. A single vectorized to_categorical call
# replaces one call per label, and the width reuses `vocab_size` so it cannot
# drift from the model's output layer. The result is already a NumPy array of
# shape (len(centers), vocab_size).
center_ids = [tokenizer.word_index[label] for label in centers]
train_labels = to_categorical(center_ids, num_classes=vocab_size)

In [None]:
# CBOW model: embed the context words, average their vectors, predict the center word.
cbow = Sequential()
# input_length is tied to `max_length` (instead of a hard-coded 4) so it always
# matches the padded sequences passed to fit() and predict().
cbow.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_length))
# Average over the sequence axis: (batch, max_length, embedding_size) -> (batch, embedding_size).
cbow.add(Lambda(lambda x : K.mean(x, axis=1), output_shape=(embedding_size,)))
# Softmax over the whole vocabulary; targets are one-hot, hence categorical cross-entropy.
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

cbow.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 4, 250)            603000    
                                                                 
 lambda_10 (Lambda)          (None, 250)               0         
                                                                 
 dense_9 (Dense)             (None, 2412)              605412    
                                                                 
Total params: 1208412 (4.61 MB)
Trainable params: 1208412 (4.61 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train for 25 epochs; verbose=2 prints one summary line per epoch.
# Note: despite its name, `model` is the History object returned by fit(),
# not the network itself (the network is `cbow`).
model = cbow.fit(
    x=train_seq_pad,
    y=train_labels,
    epochs=25,
    verbose=2,
)
# Per-epoch loss/accuracy values recorded during training.
print(model.history)

Epoch 1/25
306/306 - 5s - loss: 7.4276 - accuracy: 0.0184 - 5s/epoch - 17ms/step
Epoch 2/25
306/306 - 5s - loss: 6.7092 - accuracy: 0.0209 - 5s/epoch - 17ms/step
Epoch 3/25
306/306 - 5s - loss: 6.5284 - accuracy: 0.0276 - 5s/epoch - 18ms/step
Epoch 4/25
306/306 - 4s - loss: 6.3082 - accuracy: 0.0419 - 4s/epoch - 15ms/step
Epoch 5/25
306/306 - 6s - loss: 5.9821 - accuracy: 0.0697 - 6s/epoch - 18ms/step
Epoch 6/25
306/306 - 5s - loss: 5.5498 - accuracy: 0.1039 - 5s/epoch - 16ms/step
Epoch 7/25
306/306 - 4s - loss: 5.0385 - accuracy: 0.1519 - 4s/epoch - 15ms/step
Epoch 8/25
306/306 - 6s - loss: 4.4835 - accuracy: 0.2065 - 6s/epoch - 19ms/step
Epoch 9/25
306/306 - 4s - loss: 3.9239 - accuracy: 0.2834 - 4s/epoch - 15ms/step
Epoch 10/25
306/306 - 4s - loss: 3.3867 - accuracy: 0.3628 - 4s/epoch - 14ms/step
Epoch 11/25
306/306 - 6s - loss: 2.8920 - accuracy: 0.4532 - 6s/epoch - 19ms/step
Epoch 12/25
306/306 - 5s - loss: 2.4519 - accuracy: 0.5354 - 5s/epoch - 15ms/step
Epoch 13/25
306/306 - 4s 

In [None]:
# Context of the hidden center word: two words to its left and two to its right,
# i.e. exactly `max_length` tokens. The previous five-word sample only worked
# because pad_sequences(truncating='post') silently dropped its last word; the
# padded input (and therefore the prediction) is unchanged.
sample_text = 'trăm năm cõi người'
sample_seq = tokenizer.texts_to_sequences([sample_text])
sample_seq_pad = pad_sequences(sample_seq, maxlen=max_length, truncating='post', padding="post")
# Predicted probability distribution over the vocabulary for the center word.
cbow.predict(sample_seq_pad)



array([[1.07512593e-11, 1.06291651e-11, 5.69267548e-04, ...,
        9.60149791e-08, 1.09128834e-11, 2.33700888e-07]], dtype=float32)

In [None]:
# Take the highest-probability vocabulary index and map it back to its word.
sample_probs = cbow.predict(sample_seq_pad)
predicted_index = np.argmax(sample_probs)
tokenizer.index_word[predicted_index]



'trong'