In [2]:
# install kaggle library
!pip install kaggle



Upload kaggle json file


In [3]:
# Copy the uploaded kaggle.json API token into ~/.kaggle (created if missing)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json

Importing Vietnamese Review Shopee Dataset

In [4]:
# Download the review dataset archive with the Kaggle CLI (-d names the dataset as owner/slug)
!kaggle datasets download -d linhlpv/vietnamese-sentiment-analyst

Dataset URL: https://www.kaggle.com/datasets/linhlpv/vietnamese-sentiment-analyst
License(s): unknown
Downloading vietnamese-sentiment-analyst.zip to /content
 93% 1.00M/1.08M [00:01<00:00, 1.03MB/s]
100% 1.08M/1.08M [00:01<00:00, 1.09MB/s]


In [5]:
# Extract the compressed dataset into the current working directory
from zipfile import ZipFile
dataset = '/content/vietnamese-sentiment-analyst.zip'

# Bind the archive to a name that does not shadow the built-in zip(); the
# previous name left `zip` pointing at a closed ZipFile for the rest of the session.
with ZipFile(dataset, 'r') as archive:
  archive.extractall()
  print('The dataset is extracted')

The dataset is extracted


Data processing

In [6]:
import numpy as np
import pandas as pd

In [7]:
# Load the extracted CSV file into a pandas DataFrame, decoding it as UTF-8
review_data = pd.read_csv('/content/data.csv',encoding='utf8')

In [8]:
# Show the (rows, columns) size of the loaded data
review_data.shape



(31460, 3)

In [9]:
# Preview the first rows of the data
review_data.head()

Unnamed: 0,content,label,start
0,Áo bao đẹp ạ!,POS,5
1,Tuyệt vời,POS,5
2,2day ao khong giong trong,NEG,1
3,"Mùi thơm,bôi lên da mềm da",POS,5
4,"Vải đẹp, dày dặn",POS,5


In [10]:
# Count the missing values in each column of the dataset
review_data.isnull().sum()

Unnamed: 0,0
content,24
label,0
start,0


In [11]:
# Keep only the rows whose review text is present
has_content = review_data['content'].notna()
review_data = review_data[has_content]

# Confirm that no missing values remain
print("\nMissing values after cleaning:")
print(review_data.isnull().sum())

review_data.shape



Missing values after cleaning:
content    0
label      0
start      0
dtype: int64


(31436, 3)

In [12]:
# Check how the reviews are distributed across the values of the target column
review_data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
POS,20078
NEG,6664
NEU,4694


**Encode the labels as integers: POS (positive) → 1, NEG (negative) → -1, NEU (neutral) → 0**

In [13]:
# Encode the string labels as integers with one explicit mapping and an
# explicit integer cast. This replaces three in-place replace() calls that
# depended on pandas silently downcasting the column once the last string was
# gone (the recorded output shows a warning on the final call).
LABEL_TO_ID = {'POS': 1, 'NEG': -1, 'NEU': 0}
review_data = review_data.assign(
    # Values that are already encoded pass through unchanged, so the cell can
    # be re-run; an unexpected label string fails loudly in astype().
    label=review_data['label'].map(lambda value: LABEL_TO_ID.get(value, value)).astype('int64')
)
review_data['label'].value_counts()

  review_data.replace({'label':{'NEU': 0}},inplace=True)


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,20078
-1,6664
0,4694


# Training Model

In [14]:
!pip install torch transformers scikit-learn




Check whether a GPU exists so that CUDA can be used to optimize run time

In [15]:
import torch

# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using GPU" if device.type == "cuda" else "Using CPU")


Using GPU


In [16]:
# Pull the review texts (X) and their sentiment labels (Y) out of the frame
# as NumPy arrays
X, Y = (review_data[column].values for column in ('content', 'label'))
print(X)
print(Y)

['Áo bao đẹp ạ!' 'Tuyệt vời' '2day ao khong giong trong' ...
 'Hàng đẹp đúng giá tiền' 'Chất vải khá ổn'
 'áo rất ok nhé , vải mịn , len cao cổ này phối form mùa đông thì quá chất m.ng nhé']
[ 1  1 -1 ...  1  1  1]


In [20]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


Splitting the data to training data and test data

In [18]:
# Hold out 20% of the reviews for testing. Stratifying on Y keeps the label
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [19]:
# Compare the number of samples in the full, training and test sets
print(X.shape, X_train.shape, X_test.shape)

(31436,) (25148,) (6288,)


# Using PhoBERT to tokenize and embed the data

In [27]:
# Load the pretrained PhoBERT tokenizer and encoder, then move the encoder to
# the GPU when one is available (CPU otherwise)
checkpoint = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
phobert = AutoModel.from_pretrained(checkpoint)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
phobert.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 768, padding_idx=1)
    (position_embeddings): Embedding(258, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

Split the data into batches and extract a PhoBERT embedding for each text

In [28]:
def extract_embedding_batch(texts, batch_size=16, max_length=256):
    """Embed texts with PhoBERT and return one [CLS] vector per text.

    Relies on the notebook-level ``tokenizer``, ``phobert`` and ``device``.

    Args:
        texts: A single string, a NumPy array, or any iterable of strings.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Token limit applied when truncating. Defaults to 256: the
            loaded encoder has 258 position embeddings with padding_idx=1, so
            position ids start at 2 and only 256 tokens are addressable. The
            previous limit of 512 let long reviews index past that table.

    Returns:
        A 2-D NumPy array with one row per input text and ``hidden_size``
        columns. Empty input yields an array of shape (0, hidden_size).
    """
    # Normalize the input to a plain list of strings
    if isinstance(texts, str):
        texts = [texts]  # treat a bare string as one text, not as characters
    elif isinstance(texts, np.ndarray):
        texts = texts.tolist()
    else:
        texts = list(texts)

    # np.vstack raises on an empty list, so handle "nothing to embed" up front
    if not texts:
        return np.empty((0, phobert.config.hidden_size), dtype=np.float32)

    embeddings = []
    # Inference only: no gradients are needed for feature extraction
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            # Tokenize the batch, padding to its longest member, and move the
            # tensors to the same device as the encoder
            inputs = tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            ).to(device)

            hidden_states = phobert(**inputs).last_hidden_state
            # The first token of every sequence is the [CLS] position
            embeddings.append(hidden_states[:, 0, :].cpu().numpy())

    return np.vstack(embeddings)  # stack the batches into one matrix

In [29]:
# Extract embedding from X_train and X_test
X_train = extract_embedding_batch(X_train)
X_test = extract_embedding_batch(X_test)
print(X_train)
print(X_test)

[[-0.20997487  0.21593407 -0.39167526 ...  0.04614848 -0.25975007
  -0.1346499 ]
 [-0.3142863   0.0191837   0.05191322 ... -0.12081489 -0.00966456
   0.040553  ]
 [-0.158544   -0.36982    -0.01957392 ...  0.23343354  0.65080714
   0.3236144 ]
 ...
 [-0.2223792   0.23921238 -0.22807093 ... -0.13190694 -0.6271619
  -0.37419793]
 [-0.17755565  0.55106276 -0.4437321  ... -0.09395918 -0.21038878
  -0.32422015]
 [-0.5226966  -0.23425797 -0.5627008  ... -0.04018904  0.45335796
  -0.17164071]]
[[-0.13199858 -0.11195054 -0.12387154 ... -0.08021745 -0.33873904
   0.2372906 ]
 [-0.0927427   0.0081035   0.04350041 ...  0.16269507 -0.15172748
   0.15305811]
 [-0.27274406  0.27560022 -0.4674139  ...  0.3199338   0.06668851
  -0.8588957 ]
 ...
 [ 0.0276375   0.27895972 -0.63918227 ...  0.16449009 -0.20499215
  -0.40617326]
 [-0.04348463  0.1196072   0.21430048 ...  0.23429768 -0.30138958
   0.34563205]
 [ 0.00497463  0.25683063 -0.3020059  ... -0.28216913  0.18487267
   0.00202517]]


# Using Logistic Regression Model

In [36]:
# Fit a logistic-regression classifier on the PhoBERT embeddings
model = LogisticRegression(max_iter=1000).fit(X_train, Y_train)

# Score the model on the same data it was trained on
Y_pred = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, Y_pred)
print('Accuracy score of the training data : ', training_data_accuracy)
print(
    "Classification Report:\n",
    classification_report(
        Y_train,
        Y_pred,
        target_names=["Negative", "Neutral", "Positive"],
    ),
)


Accuracy score of the training data :  0.7792269763003022
Classification Report:
               precision    recall  f1-score   support

    Negative       0.69      0.72      0.70      5331
     Neutral       0.49      0.25      0.33      3755
    Positive       0.84      0.92      0.88     16062

    accuracy                           0.78     25148
   macro avg       0.67      0.63      0.64     25148
weighted avg       0.76      0.78      0.76     25148



In [37]:
# Predict the sentiment of the held-out test embeddings
y_pred = model.predict(X_test)

# Evaluate the results
print("Accuracy:", accuracy_score(Y_test, y_pred))
print(
    "Classification Report:\n",
    classification_report(
        Y_test,
        y_pred,
        target_names=["Negative", "Neutral", "Positive"],
    ),
)


Accuracy: 0.7600190839694656
Classification Report:
               precision    recall  f1-score   support

    Negative       0.65      0.68      0.66      1333
     Neutral       0.41      0.21      0.28       939
    Positive       0.83      0.92      0.87      4016

    accuracy                           0.76      6288
   macro avg       0.63      0.60      0.60      6288
weighted avg       0.73      0.76      0.74      6288



Saving the trained model

In [38]:
import pickle
filename = 'trained_model.sav'
# Use a context manager so the file is flushed and closed even if pickling
# fails; the bare open() used before never closed its handle.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)


Using the saved model for predictions

In [39]:
# Load the saved model from the same relative path it was written to, so the
# cell does not depend on the working directory being /content.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)


In [52]:
# Embed a single review and classify it with the reloaded model
sample = extract_embedding_batch(["Tiền nào của nấy."])

prediction = loaded_model.predict(sample)
print(prediction)

# Map the numeric class to a readable label; anything other than 1 / -1 is neutral
print({1: 'Positive review', -1: 'Negative review'}.get(prediction[0], 'Neutral review'))

[0]
Neutral review


In [54]:
# Embed a single review and classify it with the reloaded model
sample = extract_embedding_batch(["Áo xấu lắm, mọi người không nên mua. Tôi dùng vài ngày là rách"])

prediction = loaded_model.predict(sample)
print(prediction)

# Map the numeric class to a readable label; anything other than 1 / -1 is neutral
print({1: 'Positive review', -1: 'Negative review'}.get(prediction[0], 'Neutral review'))

[-1]
Negative review


In [55]:
# Embed a single review and classify it with the reloaded model
sample = extract_embedding_batch(["Hàng đẹp giá cả phải chăng. Shop giao hàng đúng thời hạn."])

prediction = loaded_model.predict(sample)
print(prediction)

# Map the numeric class to a readable label; anything other than 1 / -1 is neutral
print({1: 'Positive review', -1: 'Negative review'}.get(prediction[0], 'Neutral review'))

[1]
Positive review
