In [5]:
import pandas as pd
import numpy as np
import pythainlp
import pickle

from keras.callbacks import EarlyStopping
from pythainlp.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
from keras.utils import to_categorical, pad_sequences
from keras.models import load_model

In [6]:
# Set the maximum number of words to keep based on word frequency
max_words = 10000

# Define a custom tokenizer function for Thai text
def thai_tokenizer(text):
    return word_tokenize(text, engine='newmm')

# Load the data from the CSV file
data = pd.read_csv('C:/LabPython/datasets/dataTH.csv')

# Split the data into text and labels
x = data['text'].values
y = data['sentiment'].values

# Convert the sentiment labels to one-hot encoded vectors
y = pd.get_dummies(y).values

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Define a custom tokenizer function for Thai text
def thai_tokenizer(texts):
    return [word_tokenize(text, engine='newmm') for text in texts]

# Tokenize the text using the custom tokenizer function
x_train = thai_tokenizer(x_train)
x_test = thai_tokenizer(x_test)

# Create the Tokenizer object without passing a custom tokenizer function
tokenizer = Tokenizer(num_words=max_words, filters='', lower=False, oov_token='<UNK>')
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

# The rest of the code remains the same
maxlen = max([len(x) for x in x_train])
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

In [7]:
# Create the model
model = Sequential()
model.add(Embedding(max_words, 64, input_length=maxlen))
model.add(Conv1D(64, 7, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 517, 64)           640000    
                                                                 
 conv1d_2 (Conv1D)           (None, 511, 64)           28736     
                                                                 
 global_max_pooling1d_2 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 3)                 195       
                                                                 
Total params: 668,931
Trainable params: 668,931
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
# Train the model
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
model.fit(x_train, y_train, epochs=10, batch_size=16, validation_data=(x_test, y_test), callbacks=[es])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: early stopping


<keras.callbacks.History at 0x21c70bf7350>

In [57]:
# Save the trained model and the tokenizer to disk
model.save('sentiment_analysis_model.h5')
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle)

In [None]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

model = load_model('sentiment_analysis_modelCNN.h5')

In [58]:
# Prepare the new data for prediction
new_text = ["แย่"]
new_text = tokenizer.texts_to_sequences(new_text)
new_text = pad_sequences(new_text, maxlen=maxlen)

# Make the prediction
prediction = model.predict(new_text)[0]

# Get the predicted sentiment and confidence level
sentiments = ['negative', 'neutral', 'positive']
sentiment = sentiments[np.argmax(prediction)]
confidence = np.max(prediction)

# Display the result
print(f'{sentiment} ({confidence * 100:.2f}%)')

negative (99.03%)
