In [34]:
import keras.utils
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, Conv1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.utils import pad_sequences
from imblearn.over_sampling import ADASYN
import numpy as np
from sklearn.metrics import classification_report
from collections import Counter


In [41]:
# Load your dataset
data = pd.read_csv('combined_data.csv')

# Split the data into features (X) and labels (y)
X = data['Nhận xét đánh giá'].values
y = data['Cảm xúc'].values

# Encode the labels
label_encoder = LabelEncoder()
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [42]:
class_distribution = Counter(y_train)
print(class_distribution)

Counter({'bình thường': 3926, 'tốt': 3607, 'tệ': 2993, 'rất tốt': 2604, 'không liên quan': 847, 'rất tệ': 674, 'rất tệ ': 182})


In [43]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

class_distribution = Counter(y_train)

# Define a desired total number of samples (you can adjust this)
desired_total_samples = 4500  # Reduce this number

# Create the initial sampling strategy
sampling_strategy_dict = {
    label: int(desired_total_samples * (count / sum(class_distribution.values())))
    for label, count in class_distribution.items()
}

# Ensure that no class exceeds its original count
for label, count in class_distribution.items():
    if label in sampling_strategy_dict and sampling_strategy_dict[label] > count:
        sampling_strategy_dict[label] = count

# Create the ADASYN sampler with the adjusted sampling strategy
adasyn = ADASYN(sampling_strategy=sampling_strategy_dict, random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_tfidf, y_train)

unique_classes = np.unique(y_train_resampled)
print("Unique Classes in y_train_resampled:", unique_classes)
# Tokenization and Padding
max_words = 5000
max_len = 1000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

X_train_seq = pad_sequences(X_train_seq, maxlen=max_len)
X_val_seq = pad_sequences(X_val_seq, maxlen=max_len)

# Build and compile your CNN model
embedding_dim = 120
num_filters = 128
kernel_size = 5
num_classes = len(label_encoder.classes_)
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(Conv1D(num_filters, kernel_size, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

ValueError: With over-sampling methods, the number of samples in a class should be greater or equal to the original number of samples. Originally, there is 3926 samples and 1191 samples are asked.

In [None]:
# Train the model
batch_size = 128
epochs = 10

model.fit(X_train_seq, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val_seq, y_val))


In [None]:
loss, accuracy = model.evaluate(X_val_seq, y_val)
print(f"Validation loss: {loss}")
print(f"Validation accuracy: {accuracy}")

In [None]:
y_pred = model.predict(X_val_seq)
# Convert numerical labels to original labels
y_pred_labels = np.argmax(y_pred, axis=1)
y_pred_labels = label_encoder.inverse_transform(y_pred_labels)

# Convert true labels to original labels
y_val_labels = label_encoder.inverse_transform(y_val)

# Generate classification report
report = classification_report(y_val_labels, y_pred_labels)
print(report)