In [None]:
import os

In [None]:
# Text classification to solve real-world business problems
# The dataset is expected at /content/drive/MyDrive/amazon_review.csv (root directory of Google Drive)
from google.colab import drive
drive.mount('/content/drive')

dataset_path = "/content/drive/MyDrive/amazon_review.csv"
# Report whether the CSV is reachable through the mounted drive
print("√ Dataset file found." if os.path.exists(dataset_path) else "× Dataset file does not exist.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
√ Dataset file found.


In [None]:
# Ensure Colab is running with an Nvidia GPU runtime for GPU-accelerated inference later.
# When "Tesla T4" appears in the nvidia-smi output, print the CUDA, header and device rows;
# if any command in the && chain fails, print the hint to switch to a GPU runtime instead.
!nvidia-smi | grep -q "Tesla T4" && echo "√ Tesla T4 Found" && nvidia-smi | grep "CUDA" && nvidia-smi | grep "Name" && nvidia-smi | grep "Tesla" || echo "× Change to GPU runtime before proceed"

√ Tesla T4 Found
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |


In [None]:
# Thoughts:
# 1. Sentiment Classification
# 2. Review Quality & Junk Reviews Classification
# 3. Multi-label Issue Detection
#
# References:
# https://huggingface.co/docs/transformers/en/tasks/sequence_classification
#
# To use pretrained models from HuggingFace

In [None]:
# Install PyTorch with CUDA 12.4 & transformers
!pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu124
!pip3 install transformers datasets evaluate accelerate

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
from datasets import load_dataset
# Load the whole CSV file; the rows are read from the 'train' key of the loaded result.
# No train/test split is made here; the 80:20 split is done later with train_test_split.
dataset = load_dataset("csv", data_files=dataset_path)['train']
print(f"Example: {dataset[0]}")

Generating train split: 0 examples [00:00, ? examples/s]

Example: {'reviewerID': 'A3SBTW3WS4IQSN', 'asin': 'B007WTAJTO', 'reviewerName': None, 'helpful': '[0, 0]', 'reviewText': 'No issues.', 'overall': 4.0, 'summary': 'Four Stars', 'unixReviewTime': 1406073600, 'reviewTime': '2014-07-23', 'day_diff': 138, 'helpful_yes': 0, 'total_vote': 0}


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def tokenize(record):
  """Tokenize one review: its summary and review text joined by a newline.

  truncation=True caps the sequence at the model's maximum length
  (512 tokens for distilbert-base-uncased), so only the leading tokens
  end up in input_ids.
  """
  # NOTE: str() turns a missing (None) field into the literal text "None"
  summary_text = str(record["summary"])
  body_text = str(record["reviewText"])
  return tokenizer(f"{summary_text}\n{body_text}", truncation=True)

# Add the tokenizer outputs to every row of the dataset
dataset = dataset.map(tokenize)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/4915 [00:00<?, ? examples/s]

In [None]:
# Clean data: drop rows whose tokenized input has fewer than 5 token ids
def _is_too_short(example):
  """Return True when the row has no input_ids or fewer than 5 of them."""
  token_ids = example["input_ids"]
  return token_ids is None or len(token_ids) < 5

# Show the rows that are about to be removed
for dropped in dataset.filter(_is_too_short):
  print(f"Deleted: {dropped['summary']} - {dropped['reviewText']}")

# Keep only the complement of the rows printed above
dataset = dataset.filter(lambda example: not _is_too_short(example))

Filter:   0%|          | 0/4915 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4915 [00:00<?, ? examples/s]

In [None]:
# Verify tokenized results
# The preview text is assembled outside the f-string: a backslash inside an
# f-string expression is a SyntaxError before Python 3.12, and str() keeps the
# preview working when a field is not a string (tokenize() applies str() too).
# min() guards against a dataset with fewer than 5 rows.
for i in range(min(5, len(dataset))):
  preview = str(dataset[i]['summary']) + ' \\n ' + str(dataset[i]['reviewText'])
  print(f"Tokenized Example[{i}]: {preview}")
  print(f"len:{len(dataset[i]['input_ids'])}, {dataset[i]['input_ids']}\n")

Tokenized Example[0]: Four Stars \n No issues.
len:7, [101, 2176, 3340, 2053, 3314, 1012, 102]

Tokenized Example[1]: MOAR SPACE!!! \n Purchased this for my device, it worked as advertised. You can never have too much phone memory, since I download a lot of stuff this was a no brainer for me.
len:44, [101, 9587, 2906, 2686, 999, 999, 999, 4156, 2023, 2005, 2026, 5080, 1010, 2009, 2499, 2004, 17099, 1012, 2017, 2064, 2196, 2031, 2205, 2172, 3042, 3638, 1010, 2144, 1045, 8816, 1037, 2843, 1997, 4933, 2023, 2001, 1037, 2053, 4167, 2121, 2005, 2033, 1012, 102]

Tokenized Example[2]: nothing to really say.... \n it works as expected. I should have sprung for the higher capacity.  I think its made a bit cheesier than the earlier versions; the paint looks not as clean as before
len:46, [101, 2498, 2000, 2428, 2360, 1012, 1012, 1012, 1012, 2009, 2573, 2004, 3517, 1012, 1045, 2323, 2031, 22057, 2005, 1996, 3020, 3977, 1012, 1045, 2228, 2049, 2081, 1037, 2978, 18178, 2229, 3771, 2084, 1996, 3041

In [None]:
# Collator that dynamically pads the sequences of a batch to the longest one in that batch;
# it is handed to the Trainer below as data_collator.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Evaluation metric support: accuracy from the `evaluate` library
import evaluate
accuracy = evaluate.load("accuracy")

import numpy as np
def compute_metrics(eval_pred):
    """Return the accuracy for a (model outputs, labels) pair.

    The predicted class of each row is the arg-max of the outputs along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Overall rating -> sentiment class:
#  1-2: NEGATIVE/0
#  3:   NEUTRAL/1
#  4-5: POSITIVE/2
# 'overall' is stored as a float (e.g. 4.0); it still matches the int keys below
# because equal int and float values compare and hash alike in Python.
dataset2id = {1:0, 2:0, 3:1, 4:2, 5:2}
id2label = {0: "NEGATIVE", 1: "NEUTRAL", 2:"POSITIVE"}
label2id = {"NEGATIVE": 0, "NEUTRAL": 1,"POSITIVE": 2}

# Map ratings to dataset: adds a "labels" column holding the class id of each row
dataset = dataset.map(lambda example: {"labels": dataset2id[example["overall"]]})

# Check result
for i in range(5):
  print(f"Example[{i}]: overall:{dataset[i]['overall']}, rating:{dataset[i]['labels']}/{id2label[dataset[i]['labels']]}, {dataset[i]['summary']}")
  print(f"len:{len(dataset[i]['input_ids'])}, {dataset[i]['input_ids']}\n")

Map:   0%|          | 0/4915 [00:00<?, ? examples/s]

Example[0]: overall:4.0, rating:2/POSITIVE, Four Stars
len:7, [101, 2176, 3340, 2053, 3314, 1012, 102]

Example[1]: overall:5.0, rating:2/POSITIVE, MOAR SPACE!!!
len:44, [101, 9587, 2906, 2686, 999, 999, 999, 4156, 2023, 2005, 2026, 5080, 1010, 2009, 2499, 2004, 17099, 1012, 2017, 2064, 2196, 2031, 2205, 2172, 3042, 3638, 1010, 2144, 1045, 8816, 1037, 2843, 1997, 4933, 2023, 2001, 1037, 2053, 4167, 2121, 2005, 2033, 1012, 102]

Example[2]: overall:4.0, rating:2/POSITIVE, nothing to really say....
len:46, [101, 2498, 2000, 2428, 2360, 1012, 1012, 1012, 1012, 2009, 2573, 2004, 3517, 1012, 1045, 2323, 2031, 22057, 2005, 1996, 3020, 3977, 1012, 1045, 2228, 2049, 2081, 1037, 2978, 18178, 2229, 3771, 2084, 1996, 3041, 4617, 1025, 1996, 6773, 3504, 2025, 2004, 4550, 2004, 2077, 102]

Example[3]: overall:5.0, rating:2/POSITIVE, Great buy at this price!!!  *** UPDATE
len:110, [101, 2307, 4965, 2012, 2023, 3976, 999, 999, 999, 1008, 1008, 1008, 10651, 2023, 2228, 2038, 2499, 2041, 2307, 1012, 20

In [None]:
# Use distilbert-base-uncased model and get ready for training
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# num_labels=3 matches the three classes in id2label / label2id (NEGATIVE, NEUTRAL, POSITIVE)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Split dataset for training & testing (80:20).
# A fixed seed keeps the split identical across runs, so the reported metrics
# and the reviews flagged later in the notebook are reproducible.
SPLIT_SEED = 42
dataset_split = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Select the splits by name instead of relying on the order of .values()
dataset_train, dataset_test = dataset_split["train"], dataset_split["test"]

# Hyper-parameters for fine-tuning.
# Evaluation and checkpointing both happen once per epoch, which
# load_best_model_at_end=True needs in order to restore the best checkpoint.
training_args = TrainingArguments(
    output_dir="amazon_review",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Check device status before training: the cell displays training_args.device,
# which should be a CUDA device when the GPU runtime is active.
training_args.device

device(type='cuda', index=0)

In [None]:
# Train the model, then evaluate it on the eval dataset given to the Trainer (the test split).
# training_args sets load_best_model_at_end=True, so evaluate() is expected to score
# the best checkpoint rather than the weights of the last epoch.
trainer.train()
trainer.evaluate()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.161097,0.955239
2,No log,0.125715,0.963377
3,No log,0.141244,0.963377
4,No log,0.159096,0.955239
5,0.165400,0.163693,0.959308
6,0.165400,0.185661,0.957274
7,0.165400,0.187016,0.957274
8,0.165400,0.196945,0.959308
9,0.037000,0.19329,0.959308
10,0.037000,0.202474,0.959308


{'eval_loss': 0.1257147341966629,
 'eval_accuracy': 0.9633774160732451,
 'eval_runtime': 7.8627,
 'eval_samples_per_second': 125.021,
 'eval_steps_per_second': 3.943,
 'epoch': 10.0}

In [None]:
# Save the model and its tokenizer to Google Drive so they can be reloaded in a later session
save_directory="/content/drive/MyDrive/amazon_review_sentiment_model"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('/content/drive/MyDrive/amazon_review_sentiment_model/tokenizer_config.json',
 '/content/drive/MyDrive/amazon_review_sentiment_model/special_tokens_map.json',
 '/content/drive/MyDrive/amazon_review_sentiment_model/vocab.txt',
 '/content/drive/MyDrive/amazon_review_sentiment_model/added_tokens.json',
 '/content/drive/MyDrive/amazon_review_sentiment_model/tokenizer.json')

In [None]:
# (Optional) Continue & load the model if using a new Colab instance
# The imports are repeated here so that this cell also works on a fresh kernel
# where the earlier tokenizer/training cells have not been run.
# Google Drive must already be mounted for the path below to exist.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

load_directory="/content/drive/MyDrive/amazon_review_sentiment_model"
model = AutoModelForSequenceClassification.from_pretrained(load_directory)
tokenizer = AutoTokenizer.from_pretrained(load_directory)

In [None]:
from transformers import pipeline
# Build a 'sentiment-analysis' pipeline around the current model and tokenizer
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [None]:
# For testing: classify one review typed in by the user.
# truncation=True cuts over-long input down to the tokenizer's maximum length
# instead of letting the model fail on a sequence longer than it supports.
print(classifier(input("Review to be tested: "), truncation=True))

KeyboardInterrupt: Interrupted by user

# Finding potentially wrongly rated reviews

In [None]:
# Alias the held-out test split; the following cells scan it for ratings that
# disagree with the model's prediction.
test_reviews = dataset_test
# The classifier pipeline is already loaded in the previous cell.
print("Test dataset assigned to 'test_reviews' and classifier pipeline is ready.")

Test dataset assigned to 'test_reviews' and classifier pipeline is ready.


In [None]:
import torch

# Predict a sentiment label for every review in the test split.
# eval() switches off dropout and no_grad() skips autograd bookkeeping;
# neither is wanted for inference, and the latter keeps GPU memory flat.
model.eval()
predictions = []
with torch.no_grad():
  for review in test_reviews:
    combined_text = str(review["summary"]) + "\n" + str(review["reviewText"])
    # Explicitly tokenize and truncate (by tokens) to the model's maximum length
    inputs = tokenizer(combined_text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    logits = model(**inputs).logits
    # One text per forward pass: arg-max over the class dimension of the single row
    predicted_class_id = logits.argmax(dim=-1)[0].item()
    predictions.append(model.config.id2label[predicted_class_id])

print(f"Generated {len(predictions)} predictions.")

Generated 983 predictions.


In [None]:
# Flag reviews whose star-rating class disagrees with the model's prediction.
# Class ids: 0 = NEGATIVE, 1 = NEUTRAL, 2 = POSITIVE.
# A review is never flagged when the model predicts NEUTRAL (id 1).
original_labels = [dataset2id[review["overall"]] for review in test_reviews]
mismatched_reviews = [
    review
    for review, rated_id, predicted_name in zip(test_reviews, original_labels, predictions)
    if rated_id != label2id[predicted_name] and label2id[predicted_name] != 1
]

print(f"Number of potentially wrongly rated reviews: {len(mismatched_reviews)}")

Number of potentially wrongly rated reviews: 35


In [None]:
# Print every flagged review with its star rating and the model's sentiment.
for review in mismatched_reviews:
    original_rating = review["overall"]
    summary = review["summary"]
    review_text = review["reviewText"]

    # Same text layout as used for tokenization: summary, newline, review text
    combined_text = str(summary) + "\n" + str(review_text)
    # truncation=True truncates by tokens, the same way the text is truncated
    # when the predictions are generated. Slicing the string to model_max_length
    # characters cuts far earlier and can show a different label than the one
    # that got the review flagged.
    predicted_sentiment = classifier(combined_text, truncation=True)[0]['label']

    print(f"Original Rating: {original_rating}")
    print(f"Predicted Sentiment: {predicted_sentiment}")
    print(f"Summary: {summary}")
    print(f"Review Text: {review_text}")
    print("-" * 20)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Original Rating: 3.0
Predicted Sentiment: POSITIVE
Summary: Very good
Review Text: Have only transfered some pictures and music so far. My S3  Galazy had no problems recognizing the card...hear a lot of rumors about memory cards not working, having to format first, etc., but this one worked from me right out of the box.The included Adapter is also good when I want to use with my computer(s) for slide shows since MicroSDHC are not supported by many if any..just to small.
--------------------
Original Rating: 1.0
Predicted Sentiment: POSITIVE
Summary: I got a rock
Review Text: The 64GB card I purchased actually contains 4GB of flash memory and some kind of hack that causes it to appear as a 64GB drive in various drives and readers. Of course, this took me a while to figure out since I only just got around to copying more than 4GB of data to the card, but with some testing I have nonetheless discovered this to be the case.The card I bought was in Amazon Frustration Free packaging. I don't

# Demo to be done in GitHub Codespaces