In [1]:
from datasets import load_dataset
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)
print(dataset["full"][0])


{'rating': 5.0, 'title': 'Such a lovely scent but not overpowering.', 'text': "This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!", 'images': [], 'asin': 'B00YQ6X8EO', 'parent_asin': 'B00YQ6X8EO', 'user_id': 'AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'timestamp': 1588687728923, 'helpful_vote': 0, 'verified_purchase': True}


In [2]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.utils import resample

# Step 1: Load the dataset
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)

# Step 2: Convert to pandas DataFrame
df = pd.DataFrame(dataset["full"])

# Step 3: Select only required columns
df = df[['text', 'rating']]

# Step 4: Convert ratings into sentiment labels
def map_rating_to_label(rating):
    if rating <= 2:
        return 0  # Negative
    elif rating == 3:
        return 1  # Neutral
    else:
        return 2  # Positive

df["sentiment"] = df["rating"].apply(map_rating_to_label)

# Step 5: Drop rows with missing text
df = df.dropna(subset=["text"])

# Step 6: Print original distribution
print("Before balancing:\n", df["sentiment"].value_counts())

# Step 7: Separate classes
df_negative = df[df.sentiment == 0]
df_neutral = df[df.sentiment == 1]
df_positive = df[df.sentiment == 2]

# Target sample count (chosen as size of negative class)
target_count = len(df_negative)

# Upsample neutral (from 56k to ~145k)
df_neutral_upsampled = resample(df_neutral, replace=True, n_samples=target_count, random_state=42)

# Downsample positive (from 500k to ~145k)
df_positive_downsampled = resample(df_positive, replace=False, n_samples=target_count, random_state=42)

# Combine all
df_balanced = pd.concat([df_negative, df_neutral_upsampled, df_positive_downsampled])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 8: Print balanced distribution
print("After balancing:\n", df_balanced["sentiment"].value_counts())

Before balancing:
 sentiment
2    500107
0    145114
1     56307
Name: count, dtype: int64
After balancing:
 sentiment
2    145114
1    145114
0    145114
Name: count, dtype: int64


In [3]:
import torch
from datasets import Dataset
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split

# Set device to GPU
device = torch.device("cuda")

# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Split the dataset (80% train, 20% test)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_balanced["text"].tolist(), df_balanced["sentiment"].tolist(), test_size=0.2, random_state=42
)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
test_dataset = Dataset.from_dict({"text": test_texts, "label": test_labels})

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Convert to PyTorch format and move to GPU
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

train_dataset = train_dataset.with_format("torch", device=device)
test_dataset = test_dataset.with_format("torch", device=device)

# Print first training sample
print(train_dataset[0])



Map:   0%|          | 0/348273 [00:00<?, ? examples/s]

Map:   0%|          | 0/87069 [00:00<?, ? examples/s]

{'text': 'makes your skin look way better including shrinking lightly the pores in 30 to 40 days. I use it everyday and I love the results. I would like to purchase more products of The Ordinary! Quality products in affordable prices!', 'label': tensor(2, device='cuda:0'), 'input_ids': tensor([  101,  3084,  2115,  3096,  2298,  2126,  2488,  2164, 28375,  8217,
         1996, 18499,  2229,  1999,  2382,  2000,  2871,  2420,  1012,  1045,
         2224,  2009, 10126,  1998,  1045,  2293,  1996,  3463,  1012,  1045,
         2052,  2066,  2000,  5309,  2062,  3688,  1997,  1996,  6623,   999,
         3737,  3688,  1999, 15184,  7597,   999,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
           

In [4]:
from torch.utils.data import DataLoader

# Define batch size
BATCH_SIZE=8

# Create DataLoaders for training and testing
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=False)

test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=False)

print(f"Train DataLoader: {len(train_dataloader)} batches")
print(f"Test DataLoader: {len(test_dataloader)} batches")

Train DataLoader: 43535 batches
Test DataLoader: 10884 batches


In [5]:
# Fetch a single batch from the DataLoader
sample_batch = next(iter(test_dataloader))

# Check the device of each tensor in the batch
for k, v in sample_batch.items():
    if isinstance(v, torch.Tensor):
        print(f"{k}: {v.device}")  # Print the device


label: cuda:0
input_ids: cuda:0
attention_mask: cuda:0


In [7]:
from transformers import DistilBertForSequenceClassification

# Load DistilBERT model with a classification head (2 classes: Positive-2,neutral-1 & Negative- 0)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
model.to(device)  # Move model to GPU


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [8]:
from transformers import get_scheduler
from torch.optim import AdamW

import torch
from torch.nn import CrossEntropyLoss

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Learning rate scheduler
num_training_steps = len(train_dataloader) * 3  # Assume 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# # Loss function (CrossEntropyLoss for classification)
# loss_fn = torch.nn.CrossEntropyLoss()


# Step 1: Compute class weights from balanced dataset
class_counts = df_balanced['sentiment'].value_counts().sort_index()  # ensure 0, 1, 2 order
total_samples = sum(class_counts)
class_weights = [total_samples / c for c in class_counts]

# Normalize to sum to number of classes (optional but stable)
normed_weights = [w * len(class_counts) / sum(class_weights) for w in class_weights]

# Convert to tensor and send to device
weights_tensor = torch.tensor(normed_weights).to(device)

# Step 2: Initialize the weighted loss function
loss_fn = CrossEntropyLoss(weight=weights_tensor)



In [9]:
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm

scaler = GradScaler()

for batch in tqdm(train_dataloader):
    # Ensure labels are correctly assigned
     
    if 'label' in batch:  
        batch['labels'] = batch.pop('label')  

    batch = {k: v for k, v in batch.items() if k in ['input_ids', 'attention_mask', 'labels']}


    optimizer.zero_grad()  # Reset gradients before backward pass

    # with autocast():  # Enable mixed precision
    #     outputs = model(**batch)
    #     loss = outputs.loss
    with autocast():
        outputs = model(**batch)
        logits = outputs.logits
        loss = loss_fn(logits, batch['labels'])  # ✅ CUSTOM LOSS


       # Check for NaN/Inf before updating weights
    if not torch.isnan(loss) and not torch.isinf(loss):
        scaler.scale(loss).backward()  # Backpropagation with scaled loss
        scaler.step(optimizer)  # Update model parameters
        scaler.update()  # Update scaler for next iteration
print("Training complete! 🎉")

  scaler = GradScaler()
  with autocast():
100%|██████████████████████████████████████████████| 43535/43535 [58:58<00:00, 12.30it/s]

Training complete! 🎉





In [10]:
model.save_pretrained("model_v4")  # Saves model weights & config
tokenizer.save_pretrained("model_v4")  # Saves tokenizer 

('model_v4\\tokenizer_config.json',
 'model_v4\\special_tokens_map.json',
 'model_v4\\vocab.txt',
 'model_v4\\added_tokens.json',
 'model_v4\\tokenizer.json')

In [11]:
# from transformers import DistilBertForSequenceClassification, AutoTokenizer
# import torch
# device = torch.device("cuda")
# # Load the trained model
# model = DistilBertForSequenceClassification.from_pretrained("model_v4")
# model.to(device)  # Move to GPU if available
# model.eval()  # Set to evaluation mode

# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained("model_v4")


In [2]:
from transformers import DistilBertForSequenceClassification, AutoTokenizer, pipeline
import torch
import shap 
device = torch.device("cuda")

# Load model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained("model_v4")
tokenizer = AutoTokenizer.from_pretrained("model_v4")
model.to(device)
model.eval()

# Label mapping
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}

# Create HF pipeline for sentiment prediction
sentiment_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True, device=0)

# SHAP Explainer
explainer = shap.Explainer(sentiment_pipeline)

# Example input
sample_text = ["Packaging was okay, and delivery was on time."]

# Get SHAP values
shap_values = explainer(sample_text)

# Predict sentiment using the same pipeline
pred_scores = sentiment_pipeline(sample_text)[0]
predicted_class = max(pred_scores, key=lambda x: x['score'])['label']
predicted_index = int(predicted_class.split("_")[-1]) if "LABEL_" in predicted_class else int(predicted_class)
predicted_sentiment = label_map[predicted_index]

# Print predicted sentiment
print("Predicted sentiment:", predicted_sentiment)

# Visualize SHAP explanation
shap.plots.text(shap_values[0])


Device set to use cuda:0


Predicted sentiment: Neutral


In [12]:
# def predict(text):
#     # Tokenize input text
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    
#     # Move input tensors to the same device as the model
#     inputs = {k: v.to(device) for k, v in inputs.items()}

#     # Make prediction
#     with torch.no_grad():  # No gradient calculation needed
#         outputs = model(**inputs)
    
#     # Get class label
#     logits = outputs.logits
#     predicted_class = torch.argmax(logits, dim=1).item()
    
#     label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
#     return label_map[predicted_class]

# # Example usage
# text = "Packaging was okay, and delivery was on time."
# print("Predicted sentiment:", predict(text))


Predicted sentiment: Neutral


In [13]:
from torch.cuda.amp import autocast
from tqdm import tqdm

# Set model to evaluation mode
model.eval()

total_correct = 0
total_samples = 0
all_preds = []
all_labels = []

with torch.no_grad():  # Disable gradient computation
    for batch in tqdm(test_dataloader, desc="Testing Progress"):
        
        # Ensure labels are correctly assigned
        if 'label' in batch:  
            batch['labels'] = batch.pop('label')  
        
        batch = {k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask', 'labels']}
        labels = batch["labels"]

        with autocast():  # Enable mixed precision inference
            outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)

        # Store predictions and actual labels
        all_preds.extend(predictions.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

        # Calculate accuracy
        total_correct += (predictions == labels).sum().item()
        total_samples += labels.size(0)

# Compute final accuracy
test_accuracy = total_correct / total_samples
print(f"Test Accuracy: {test_accuracy:.4f}")


  with autocast():  # Enable mixed precision inference
Testing Progress: 100%|████████████████████████████| 10884/10884 [03:29<00:00, 52.07it/s]

Test Accuracy: 0.7984





In [14]:
print(total_correct,total_samples)

69520 87069


In [15]:
from sklearn.metrics import precision_recall_fscore_support, classification_report

# Compute precision, recall, and F1-score
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Alternatively, print a full classification report
print("\nClassification Report:\n", classification_report(all_labels, all_preds, digits=4))


Precision: 0.8032
Recall: 0.7984
F1-score: 0.7989

Classification Report:
               precision    recall  f1-score   support

           0     0.8348    0.7280    0.7777     28935
           1     0.6947    0.7699    0.7304     28985
           2     0.8798    0.8968    0.8882     29149

    accuracy                         0.7984     87069
   macro avg     0.8031    0.7982    0.7988     87069
weighted avg     0.8032    0.7984    0.7989     87069

