In [1]:
# %pip install pandas scikit-learn sentence-transformers numpy

In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the review dataset: ';'-separated CSV, every column read as a string.
# Column labels are stripped of surrounding whitespace on the way in.
data = pd.read_csv(
    './test_English/dataset_en/Amazon_Product_Review_full_en_cleaned.csv',
    sep=';',
    dtype=str,
).rename(columns=str.strip)

# Build one text field per review: "<headline> <body>", with a missing
# headline or body treated as an empty string.
data['full_review'] = (
    data['review_headline']
    .fillna('')
    .str.cat(data['review_body'].fillna(''), sep=' ')
)

# Text preprocessing
def preprocess_text(text):
    """Lower-case ``text`` and delete every character outside an allow-list.

    Characters that are kept: ASCII letters, digits, whitespace, and any code
    point in the range ``À``-``ỹ`` (which covers accented Latin letters,
    including Vietnamese). Every other character is *deleted*, not replaced
    by a space, so ``"well-packaged"`` becomes ``"wellpackaged"``.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned, lower-cased string; ``''`` for a missing value.
    """
    # A missing value must not be stringified: str(None) / str(nan) would
    # inject the literal words "none" / "nan" into the review text.
    # (`text != text` is true only for NaN.)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9À-ỹ\s]', '', text)
    return text

# Clean the combined review text.
data['full_review'] = data['full_review'].apply(preprocess_text)

# 'full_review' was built from fillna('') pieces, so a review with neither a
# headline nor a body is a whitespace-only string, not NaN; dropna() alone
# never removes it. Filter those rows out explicitly, then drop rows that
# have no label. `.copy()` makes the filtered frame independent before a
# column is reassigned on it.
has_text = data['full_review'].str.strip().ne('')
data = data.loc[has_text].dropna(subset=['full_review', 'sentiment']).copy()
data['sentiment'] = data['sentiment'].astype(int)

X = data['full_review']
y = data['sentiment']

# Stratified 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Sentence-embedding model used as the feature extractor.
# ('paraphrase-multilingual-mpnet-base-v2' is an alternative if multilingual
# input is needed.)
embedder = SentenceTransformer('all-mpnet-base-v2')

# Encode the train split first, then the test split, each with a progress bar.
X_train_emb, X_test_emb = (
    embedder.encode(split.tolist(), show_progress_bar=True)
    for split in (X_train, X_test)
)

# Hyper-parameter search for Logistic Regression on the training embeddings:
# 5-fold stratified, shuffled CV with a fixed seed, scored by accuracy.
logreg = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
param_grid = dict(C=[1, 10], penalty=['l2'], solver=['lbfgs'])

grid = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train_emb, y_train)

# Report the winning configuration and its mean cross-validated accuracy.
print("Best parameters:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)

# Evaluate the tuned model on the held-out test split.
y_pred = grid.predict(X_test_emb)

# Compute every metric up front, then print each under its heading.
test_metrics = (
    ("Test Accuracy:", accuracy_score(y_test, y_pred)),
    ("Classification report:\n", classification_report(y_test, y_pred)),
    ("Confusion matrix:\n", confusion_matrix(y_test, y_pred)),
)
for heading, value in test_metrics:
    print(heading, value)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 254/254 [00:08<00:00, 30.91it/s]
Batches: 100%|██████████| 64/64 [00:02<00:00, 31.86it/s]


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV accuracy: 0.8492288710672424
Test Accuracy: 0.8707449432659102
Classification report:
               precision    recall  f1-score   support

           0       0.87      0.87      0.87      1014
           1       0.87      0.88      0.87      1013

    accuracy                           0.87      2027
   macro avg       0.87      0.87      0.87      2027
weighted avg       0.87      0.87      0.87      2027

Confusion matrix:
 [[878 136]
 [126 887]]


In [3]:

# 9. Sanity-check the model on hand-written, realistic review sentences.
#    The list is ordered in three groups of five; the expected outcome of
#    each group is given in the group comment.
test_sentences = [
    # Group 1: expected prediction is class 0 (negative)
    "I was extremely disappointed with this product. It stopped working after just two days and the customer service was unhelpful when I tried to get a replacement.",
    "This is by far the worst purchase I have ever made. The quality is terrible, and it did not match the description at all.",
    "After using it for a week, I noticed several issues. The battery drains quickly, and the device constantly overheats, making it unusable.",
    "I regret buying this item. It arrived late, the packaging was damaged, and the product itself was already broken.",
    "Despite the positive reviews, my experience was awful. The product malfunctioned, and I wasted my money on something that doesn't work.",

    # Group 2: expected prediction is class 1 (positive)
    "I am absolutely thrilled with this purchase. The product works perfectly, and the quality exceeded my expectations in every way.",
    "This is one of the best products I have ever bought online. It arrived quickly, was well-packaged, and performs exactly as advertised.",
    "I have been using this for a month now, and I am very satisfied. The battery life is impressive, and it is extremely easy to use.",
    "The customer service was fantastic. When I had a small issue, they responded immediately and sent me a replacement without any hassle.",
    "I highly recommend this product to anyone looking for quality and reliability. It has made my daily routine so much easier and more enjoyable.",

    # Group 3: generic statements (neutral, no clear sentiment) with no
    # expected class.
    "The product arrived on time and was as described.",
    "I have used the product for a few days. It works as expected.",
    "The packaging was standard and there was nothing special about it.",
    "I received the item yesterday and have not used it much yet.",
    "The product is okay, neither good nor bad."
]


# 1. Preprocess the test sentences with the same preprocess_text() that was
#    applied to the training data. The function is defined once, in the
#    training cell; this cell already needs that cell to have run (it uses
#    `embedder` and `grid`), so the function is deliberately not redefined
#    here, where a second copy could silently drift from the first.
test_sentences_processed = [preprocess_text(s) for s in test_sentences]

# 2. Encode the cleaned sentences with the same embedding model.
test_sentences_emb = embedder.encode(test_sentences_processed, show_progress_bar=False)

# 3. Predict with the tuned Logistic Regression held by `grid`.
y_test_pred = grid.predict(test_sentences_emb)

# 4. Show the results. The prediction is a sentiment class (0 = negative,
#    1 = positive), not a star rating, so it is printed as a class with its
#    name instead of "<n> sao" ("<n> stars").
sentiment_names = {0: 'tiêu cực', 1: 'tích cực'}
for text, pred in zip(test_sentences, y_test_pred):
    label = sentiment_names.get(int(pred), 'không rõ')
    print(f"Review: {text} -> Dự đoán: lớp {pred} ({label})")

Review: I was extremely disappointed with this product. It stopped working after just two days and the customer service was unhelpful when I tried to get a replacement. -> Dự đoán: 0 sao
Review: This is by far the worst purchase I have ever made. The quality is terrible, and it did not match the description at all. -> Dự đoán: 0 sao
Review: After using it for a week, I noticed several issues. The battery drains quickly, and the device constantly overheats, making it unusable. -> Dự đoán: 0 sao
Review: I regret buying this item. It arrived late, the packaging was damaged, and the product itself was already broken. -> Dự đoán: 0 sao
Review: Despite the positive reviews, my experience was awful. The product malfunctioned, and I wasted my money on something that doesn't work. -> Dự đoán: 0 sao
Review: I am absolutely thrilled with this purchase. The product works perfectly, and the quality exceeded my expectations in every way. -> Dự đoán: 1 sao
Review: This is one of the best products I h