In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)
from scipy import fft, stats

In [2]:
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/NLP/Amazon_Unlocked_Mobile.csv'
df = pd.read_csv(file_path, nrows=1000)

Mounted at /content/drive


In [3]:
df.head(5)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [4]:
print(df.columns)

Index(['Product Name', 'Brand Name', 'Price', 'Rating', 'Reviews',
       'Review Votes'],
      dtype='object')


In [5]:
!pip install -q transformers datasets


In [6]:
# 1. Kiểm tra số lượng NaN theo từng cột
nan_counts = df.isna().sum()
print("Số lượng giá trị NaN theo cột:")
print(nan_counts)

Số lượng giá trị NaN theo cột:
Product Name      0
Brand Name      464
Price             0
Rating            0
Reviews           0
Review Votes     24
dtype: int64


In [7]:
# Loại bỏ dòng thiếu giá trị
df.dropna(subset=['Reviews', 'Rating'], inplace=True)

In [8]:

# Gán nhãn cảm xúc: 2 (positive), 1 (neutral), 0 (negative)
def label_sentiment(r):
    if r >= 4:
        return 2
    elif r == 3:
        return 1
    else:
        return 0

df["label"] = df["Rating"].apply(label_sentiment)

# Đổi tên cột Reviews → text
df = df.rename(columns={"Reviews": "text"})
df = df[["text", "label"]]

In [9]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)


In [10]:
import transformers
print(transformers.__version__)


4.57.1


In [12]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)


In [13]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer

# Tách train/test
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['label'], test_size=0.2)

# Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)

# Chuyển về Dataset
train_dataset = Dataset.from_dict({**train_encodings, 'labels': list(train_labels)})
val_dataset = Dataset.from_dict({**val_encodings, 'labels': list(val_labels)})

# Tải mô hình
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import TrainingArguments, Trainer
import os

# Tắt hoàn toàn wandb
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to=None  # KHÔNG log lên wandb hay tensorboard
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.6753,0.37436
2,0.4318,0.412521
3,0.1669,0.398748


TrainOutput(global_step=300, training_loss=0.3788308036327362, metrics={'train_runtime': 128.1053, 'train_samples_per_second': 18.735, 'train_steps_per_second': 2.342, 'total_flos': 317927426457600.0, 'train_loss': 0.3788308036327362, 'epoch': 3.0})

In [15]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

predictions = trainer.predict(val_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

print(classification_report(
    y_true, y_pred,
    labels=[0, 1, 2],
    target_names=["Negative", "Neutral", "Positive"]
))

print(confusion_matrix(y_true, y_pred, labels=[0, 1, 2]))


              precision    recall  f1-score   support

    Negative       0.85      0.81      0.83        72
     Neutral       0.18      0.21      0.19        14
    Positive       0.94      0.95      0.94       114

    accuracy                           0.84       200
   macro avg       0.66      0.66      0.66       200
weighted avg       0.85      0.84      0.85       200

[[ 58  10   4]
 [  8   3   3]
 [  2   4 108]]


In [16]:
import torch

# Kiểm tra thiết bị model đang dùng
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

sample = "It's battery life is great. It's very responsive to touch. The only issue is that sometimes the screen goes black and you have to press the top button several times to get the screen to re-illuminate. "
inputs = tokenizer(sample, return_tensors="pt", truncation=True, padding=True).to(device)  # Đưa input lên GPU/CPU

with torch.no_grad():
    outputs = model(**inputs)
    pred = outputs.logits.argmax(dim=1).item()

print(["Negative", "Neutral", "Positive"][pred])


Neutral
