In [None]:
import json
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
from bs4 import BeautifulSoup
import string

# Words dropped from every review by the cleaning cell. Entries are written
# without apostrophes ("hed", "hes", "theyll", ...) because punctuation is
# stripped from the text before the stopword filter runs.
stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "hed", "hes", "her", "here", "heres", "hers", "herself", "him", "himself", "his", "how", "hows", "i", "id", "ill", "im", "ive", "if", "in", "into", "is", "it", "its", "itself", "lets", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "shed", "shell", "shes", "should", "so", "some", "such", "than", "that", "thats", "the", "their", "theirs", "them", "themselves", "then", "there", "theres", "these", "they", "theyd", "theyll", "theyre", "theyve", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "wed", "well", "were", "weve", "were", "what", "whats", "when", "whens", "where", "wheres", "which", "while", "who", "whos", "whom", "why", "whys", "with", "would", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself", "yourselves"]


# Translation table for str.translate that deletes every character in
# string.punctuation (no characters are remapped, only removed).
table = str.maketrans('', '', string.punctuation)

In [None]:
import tensorflow_datasets as tfds
import numpy as np


# Load the IMDB reviews corpus together with its metadata object.
dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
train_dataset = dataset['train']
test_dataset = dataset['test']

# Materialise the training split as plain Python lists: decoded review text
# in `sentences`, the matching label in `labels` (same index in both lists).
sentences = []
labels = []
for review_text, review_label in train_dataset:
    sentences.append(review_text.numpy().decode('utf8'))
    labels.append(review_label.numpy())

# Quick sanity report: corpus size, first example, and the label metadata.
print(f" จำนวนประโยคทั้งหมด: {len(sentences)} ประโยค")
print("ตัวอย่างประโยคแรก:", sentences[0])
print("Label ของประโยคนี้ (0=Neg, 1=Pos):", labels[0])
print(f"จำนวนคลาสทั้งหมด: {info.features['label'].num_classes}")
print(f"ชื่อของแต่ละคลาส: {info.features['label'].names}")

In [None]:
# Display the first two raw reviews (before any cleaning) as the cell output.
sentences[0:2]


In [None]:
# Normalise every raw review into a space-separated string of lowercase,
# punctuation-free, non-stopword tokens. Output order matches `sentences`.
cleaned_sentences = []

# Set membership is O(1) per word; the module-level `stopwords` list itself
# is left untouched for any other cell that reads it.
stopword_lookup = set(stopwords)

if len(sentences) > 0:
    for sentence in sentences:
        sentence = sentence.lower()

        # Strip HTML markup. separator=" " keeps the text on either side of a
        # tag apart; without it "end.<br /><br />start" collapses to
        # "end.start" and, once punctuation is removed, to the single bogus
        # token "endstart".
        soup = BeautifulSoup(sentence, "html.parser")
        sentence = soup.get_text(separator=" ")

        # Delete punctuation characters.
        sentence = sentence.translate(table)

        # Split into words and drop stopwords.
        words = sentence.split()
        filtered_sentence = [w for w in words if w not in stopword_lookup]

        cleaned_sentences.append(" ".join(filtered_sentence))

    print("Clean Data Completely!")
    print("ตัวอย่างประโยคก่อนคลีน:", sentences[0])
    print("ตัวอย่างประโยคหลังคลีน:", cleaned_sentences[0])
else:
    # Nothing was loaded; tell the user to re-check the data loading step.
    print("Error: ไม่มีข้อมูลในลิสต์ sentences กรุณากลับไปเช็คขั้นตอนการโหลดข้อมูล")

In [None]:
import matplotlib.pyplot as plt


# Token count of every cleaned review (whitespace-delimited).
sentence_lengths = []
for cleaned_review in cleaned_sentences:
    sentence_lengths.append(len(cleaned_review.split()))

# Histogram of review lengths, used to judge a sensible padding length.
plt.hist(sentence_lengths, bins=50)
plt.title('Distribution of Review Lengths')
plt.xlabel('Length of Sentence')
plt.ylabel('Number of Reviews')
plt.show()

# Summary statistics of the same distribution.
import numpy as np
print(f"Mean length: {np.mean(sentence_lengths)}")
print(f"Median length: {np.median(sentence_lengths)}")
print(f"90th percentile: {np.percentile(sentence_lengths, 90)}")

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hyperparameters shared by the tokenizer, padding, model and export cells.
vocab_size = 2000  # Tokenizer num_words and the Embedding layer's input size
embedding_dim = 7  # width of each embedding vector
max_length = 120  # maxlen handed to pad_sequences
trunc_type='post'  # pad_sequences `truncating` argument
padding_type='post'  # pad_sequences `padding` argument
oov_tok = "<OOV>"  # token the Tokenizer substitutes for out-of-vocabulary words
training_size = 24000  # the first N cleaned reviews train the model; the rest are held out


In [None]:
# Positional split: the first `training_size` examples train the model and
# everything after that index is held out. Sentences and labels are cut at
# the same index so they stay aligned.
training_sentences, testing_sentences = (
    cleaned_sentences[:training_size],
    cleaned_sentences[training_size:],
)
training_labels, testing_labels = (
    labels[:training_size],
    labels[training_size:],
)

In [None]:
# Fit the vocabulary on the training text only, so held-out words the
# tokenizer has never seen are encoded as the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index


def _encode_and_pad(texts):
    """Return (integer sequences, padded matrix) for a list of cleaned texts."""
    encoded = tokenizer.texts_to_sequences(texts)
    padded_matrix = pad_sequences(
        encoded,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return encoded, padded_matrix


training_sequences, training_padded = _encode_and_pad(training_sentences)
testing_sequences, testing_padded = _encode_and_pad(testing_sentences)

In [None]:
# Display the integer-encoded (not yet padded) form of the second training review.
training_sequences[1]

In [None]:
# Per-word occurrence counts gathered by the tokenizer during fit_on_texts.
# NOTE(review): this prints the entire mapping, which can be very long.
wc=tokenizer.word_counts
print(wc)

In [None]:
import matplotlib.pyplot as plt
wc = tokenizer.word_counts
from collections import OrderedDict

# Order the vocabulary from most to least frequent.
ranked_items = sorted(wc.items(), key=lambda pair: pair[1], reverse=True)
newlist = OrderedDict(ranked_items)
print(word_index)
# print(newlist)

# xs holds the 1-based frequency rank, ys the matching occurrence count.
xs = []
ys = []
for rank, count in enumerate(newlist.values(), start=1):
    xs.append(rank)
    ys.append(count)

print(ys)
plt.plot(xs, ys)
plt.title("Word Frequency Distribution of IMDB Reviews")
plt.xlabel("Word Rank")
plt.ylabel("Word Frequency")
plt.show()

# Occurrence counts at two ranks, to help judge where to cut the vocabulary.
print(ys[1000])
print(ys[2000])

In [None]:

import numpy as np
# Rebind the padded inputs and the label lists as NumPy arrays before they
# are handed to model.fit.
training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

In [None]:
# Binary sentiment classifier: token embeddings -> bidirectional LSTM ->
# L2-regularised dense layer -> single sigmoid unit.
embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)
recurrent_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100))
hidden_layer = tf.keras.layers.Dense(
    32,
    activation='relu',
    kernel_regularizer=tf.keras.regularizers.l2(0.1),
)
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

model = tf.keras.Sequential([
    embedding_layer,
    recurrent_layer,
    hidden_layer,
    output_layer,
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Upper bound on training epochs. This constant is the single source of truth
# for the cap passed to fit(); early stopping below usually ends the run sooner.
num_epochs = 150

# Stop once validation loss has not improved for 20 consecutive epochs and
# roll the model back to the weights from its best epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True
)

# NOTE(review): the held-out slice is used as validation data here, so it also
# steers early stopping; it is not an untouched test set.
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    callbacks=[early_stop],
    verbose=2
)


In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against epochs.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        string: metric name, e.g. ``"accuracy"``; the validation curve is
            read from the key ``'val_' + string``.
    """
    validation_key = 'val_' + string
    for series_key in (string, validation_key):
        plt.plot(history.history[series_key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, validation_key])
    plt.show()

# One figure per tracked metric: accuracy first, then loss.
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)

In [None]:
# Inverse of the tokenizer vocabulary: integer id -> word.
reverse_word_index = {token_id: word for word, token_id in word_index.items()}

def decode_sentence(text):
    """Turn a sequence of token ids back into a space-joined string.

    Ids that have no entry in ``reverse_word_index`` are rendered as ``'?'``.
    """
    decoded_tokens = []
    for token_id in text:
        decoded_tokens.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(decoded_tokens)

# Spot-check the encoding round trip on the third training example: the
# decoded padded ids, the cleaned text they came from, and its label.
# labels[2] is the same element as training_labels[2] because the training
# slice starts at index 0.
print(decode_sentence(training_padded[2]))
print(training_sentences[2])
print(labels[2])

In [None]:
# The first layer of the model is the Embedding layer; its first weight
# array is the embedding matrix.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)


In [None]:
# Spot-check: the word assigned token id 2 and the embedding row at index 2.
print(reverse_word_index[2])
print(weights[2])

In [None]:
import io

# Write two row-aligned TSV files: 'vecssen.tsv' holds one tab-separated
# embedding vector per line and 'metasen.tsv' holds the word for that line.
# Id 0 is skipped because the loop starts at 1.

# Stop at the last id present in reverse_word_index so a vocabulary smaller
# than vocab_size cannot raise KeyError (assumes ids are contiguous from 1,
# as Tokenizer.word_index assigns them).
last_word_num = min(vocab_size, len(reverse_word_index) + 1)

# The with-statement closes both files even if a lookup or write fails.
with io.open('vecssen.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metasen.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, last_word_num):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")


# Offer the two exported TSV files as downloads when running on Google Colab.
# Elsewhere the import fails and the cell finishes without doing anything.
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecssen.tsv')
  files.download('metasen.tsv')

In [None]:
# Hand-written reviews used to eyeball the trained model's predictions.
test_sentences = [
    "I absolutely loved this movie! The acting was superb and the story was very touching.",
    "This was the worst film I have ever seen. A complete waste of time and money.",
    "The cinematography was beautiful, but the plot was boring and the characters were flat.",
    "I thought it would be a good movie, but it turned out to be very disappointing and slow.",
    "An incredible masterpiece. I will definitely watch it again and recommend it to everyone!"
]


def _prepare_for_inference(text):
    """Apply the training-time normalisation steps to one raw review.

    Lowercases, strips HTML, deletes punctuation and removes stopwords. The
    tokenizer was fitted on text with stopwords removed, so feeding it raw
    text would turn every stopword into an out-of-vocabulary token.
    """
    text = BeautifulSoup(text.lower(), "html.parser").get_text(separator=" ")
    text = text.translate(table)
    return " ".join(w for w in text.split() if w not in stopwords)


cleaned_test_sentences = [_prepare_for_inference(s) for s in test_sentences]
sequences = tokenizer.texts_to_sequences(cleaned_test_sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
predictions = model.predict(padded)

# Show each score next to the original, human-readable sentence.
for s, p in zip(test_sentences, predictions):
    print(f"{p[0]:.3f} = {s}")
