In [12]:
import keras.utils
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
import pandas as pd
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.model_selection import train_test_split
from keras.utils import pad_sequences
import numpy as np
from sklearn.metrics import classification_report
from collections import Counter
import joblib
import tensorflow as tf

In [3]:
data = pd.read_csv('../ds/HuTrain/combined_data.csv')

# Split the data into features (X) and labels (y)
X = data['Nhận xét đánh giá'].values
y = data['Cảm xúc'].values
label_encoder = LabelEncoder()

# Encode the labels
y_encoded = label_encoder.fit_transform(y)


# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=132)



In [4]:
borderline_smote = BorderlineSMOTE(sampling_strategy='auto', random_state=132)

max_words = 5000
max_len = 1000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

X_train_seq = pad_sequences(X_train_seq, maxlen=max_len)
X_val_seq = pad_sequences(X_val_seq, maxlen=max_len)
X_train_resampled, y_train_resampled = borderline_smote.fit_resample(X_train_seq, y_train)

In [5]:
class_distribution_before_SMOTE = Counter(y_train)

class_distribution_after_SMOTE = Counter(y_train_resampled)

print("Class Distribution Before SMOTE:")
for label, count in class_distribution_before_SMOTE.items():
    print(f"Class {label}: {count} samples")

# Print class distribution after SMOTE
print("\nClass Distribution After SMOTE:")
for label, count in class_distribution_after_SMOTE.items():
    print(f"Class {label}: {count} samples")

Class Distribution Before SMOTE:
Class 4: 2993 samples
Class 3: 2607 samples
Class 5: 3587 samples
Class 0: 3938 samples
Class 1: 912 samples
Class 2: 1251 samples

Class Distribution After SMOTE:
Class 4: 3938 samples
Class 3: 3938 samples
Class 5: 3938 samples
Class 0: 3938 samples
Class 1: 3938 samples
Class 2: 3938 samples


In [6]:

# Build and compile your CNN model
embedding_dim = 120
num_filters = 128
kernel_size = 5
num_classes = len(np.unique(y_train_resampled))
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(Conv1D(num_filters, kernel_size, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(num_classes, activation='softmax'))

optimizer = keras.optimizers.Adam(learning_rate=0.0007)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [8]:
batch_size = 128
epochs = 10

model.fit(X_train_seq, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val_seq, y_val))

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f9e4bcc51e0>

In [9]:
y_pred = model.predict(X_val_seq)
# Convert numerical labels to original labels
y_pred_labels = np.argmax(y_pred, axis=1)
y_pred_labels = label_encoder.inverse_transform(y_pred_labels)

# Convert true labels to original labels
y_val_labels = label_encoder.inverse_transform(y_val)

# Generate classification report
report = classification_report(y_val_labels, y_pred_labels)
print(report)

                 precision    recall  f1-score   support

    bình thường       0.65      0.60      0.63       977
không liên quan       0.96      0.96      0.96       230
         rất tệ       0.76      0.76      0.76       325
        rất tốt       0.62      0.56      0.59       632
             tệ       0.87      0.85      0.86       789
            tốt       0.59      0.70      0.64       870

       accuracy                           0.70      3823
      macro avg       0.74      0.74      0.74      3823
   weighted avg       0.71      0.70      0.70      3823



In [18]:
model.save('../saved_model/sentiment_model')
model.save('../saved_model/sentiment_model.keras')

INFO:tensorflow:Assets written to: ../saved_model/sentiment_model/assets


INFO:tensorflow:Assets written to: ../saved_model/sentiment_model/assets


In [17]:
custom_text = "thay giao day rat tuyet"

# Tokenize and pad the custom text
custom_text_seq = tokenizer.texts_to_sequences([custom_text])
custom_text_seq = keras.utils.pad_sequences(custom_text_seq, maxlen=max_len)

# Make predictions on the custom text
custom_text_pred = model.predict(custom_text_seq)

# Convert numerical labels to original labels
custom_text_pred_label = np.argmax(custom_text_pred, axis=1)
custom_text_pred_label = label_encoder.inverse_transform(custom_text_pred_label)

# Print the predicted label
print(f"Predicted Sentiment: {custom_text_pred_label[0]}")



Predicted Sentiment: tệ


In [13]:
new_model = tf.keras.models.load_model('../saved_model/saved_model/sentiment_model')

# Check its architecture
new_model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 120)         600000    
                                                                 
 conv1d (Conv1D)             (None, 996, 128)          76928     
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 6)                 774       
                                                                 
Total params: 677702 (2.59 MB)
Trainable params: 677702 (2.59 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
