# Predicting Sentiment in Product Reviews

This project predicts whether a product review is **positive** or **negative** using a fine-tuned language model.

**Dataset**: [Amazon Reviews Dataset](https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews).

**Model**: fine-tuned [DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert).


### Imports

In [2]:
import numpy as np
import pandas as pd

from transformers import (
    DistilBertTokenizer, 
    DistilBertForSequenceClassification, 
    Trainer, 
    TrainingArguments
)

from datasets import Dataset
from sklearn.metrics import accuracy_score


### Read dataset

In [4]:
# Load Dataset
# Download and extract the archive into the 'dataset/' directory:
# "Amazon Reviews Dataset" - https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews
def read_df(path):
  df = None
  try:
      df = pd.read_csv(path, header=None)
      print("Dataset loaded successfully!")
  except FileNotFoundError:
      print("Error: The dataset file was not found. Please ensure it is in the 'dataset/' directory.")

  return df

df = read_df('dataset/train.csv')

df


Dataset loaded successfully!


Unnamed: 0,0,1,2
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
...,...,...,...
3599995,1,Don't do it!!,The high chair looks great when it first comes...
3599996,1,"Looks nice, low functionality",I have used this highchair for 2 kids now and ...
3599997,1,"compact, but hard to clean","We have a small house, and really wanted two o..."
3599998,1,what is it saying?,not sure what this book is supposed to be. It ...


### Prepare dataset

In [5]:
# Set column names for the dataframe using info from the dataset web page
# 'labels' is the target variable for Hugging Face Trainer, 
def prepare_dataframe(df):
  col_names = ['labels', 'title', 'text']
  df.columns = col_names

  # Map labels: 1 (negative) -> 0, 2 (positive) -> 1
  df['labels'] = df['labels'].map({1: 0, 2: 1})

  # Train only on the text data
  df = df.drop(['title'], axis=1)
  return df

df = prepare_dataframe(df)
df.sample(5)


Unnamed: 0,labels,text
1249535,0,Very poor:Basically a movie about a bunch of t...
2394083,1,Julie Garwood is a true master of writing. Eve...
729193,0,The VSI series has produced some terrific intr...
769249,0,"One of Australia's greatest anthropologists, W..."
155431,1,Very innovative for the time. The show was dra...


### Create Test and Validation datasets

In [6]:
# Due to the main dataset size, use only fraction of samples
# Set the number of samples for training and validation
train_samples = 150000  # Number of training samples per class
val_samples = 30000     # Number of validation samples per class
random_state = 2        # Random state for reproducibility

# Sample and split data into training and validation sets
samples_pos = df[df['labels'] == 1].sample(train_samples + val_samples, random_state=random_state).reset_index(drop=True)
samples_neg = df[df['labels'] == 0].sample(train_samples + val_samples, random_state=random_state).reset_index(drop=True)

train_data = pd.concat([samples_pos[:train_samples], samples_neg[:train_samples]], ignore_index=True)
train_data = train_data.sample(frac=1, random_state=random_state).reset_index(drop=True)

val_data = pd.concat([samples_pos[train_samples:], samples_neg[train_samples:]], ignore_index=True)
val_data = val_data.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Display class distributions
print(f"Training data values:\n{train_data['labels'].value_counts()}")
print(f"Validation data values:\n{val_data['labels'].value_counts()}")

# Convert the data into Hugging Face Datasets
hf_train_dataset = Dataset.from_pandas(train_data)
hf_eval_dataset = Dataset.from_pandas(val_data)


Training data values:
0    150000
1    150000
Name: labels, dtype: int64
Validation data values:
1    30000
0    30000
Name: labels, dtype: int64


### Prepare Model and Tokenizer

In [7]:
# Define label-to-ID and ID-to-label mappings for easier interpretation
id2label = {0: "NEGATIVE", 1: "POSITIVE"}  # Maps label ID to string labels
label2id = {"NEGATIVE": 0, "POSITIVE": 1}  # Maps string labels to label IDs

# Load the pre-trained DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load the pre-trained DistilBERT model with sequence classification head
# Using num_labels=2 because we have two classes: "POSITIVE" and "NEGATIVE"
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=2, 
    id2label=id2label, 
    label2id=label2id
)

# Test Tokenizer
text = "This is a great product!"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
print(f'Input text: {text}')
print(f'Tokenized output: {inputs}')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input text: This is a great product!
Tokenized output: {'input_ids': tensor([[ 101, 2023, 2003, 1037, 2307, 4031,  999,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


### Tokenize input data

In [8]:
def tok_func(x): return tokenizer(x["text"], truncation=True, padding=True, max_length=512)

train_dataset = hf_train_dataset.map(tok_func, batched=True)
eval_dataset = hf_eval_dataset.map(tok_func, batched=True)

# Print sample
print(train_dataset[0])

Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

{'labels': 0, 'text': 'Item purchased in Dec. 2009. Finish looked fine initially. The finish began showing spots soon after I received it and after about 15 months it is now pitted.', 'input_ids': [101, 8875, 4156, 1999, 11703, 1012, 2268, 1012, 3926, 2246, 2986, 3322, 1012, 1996, 3926, 2211, 4760, 7516, 2574, 2044, 1045, 2363, 2009, 1998, 2044, 2055, 2321, 2706, 2009, 2003, 2085, 25895, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

### Training

In [11]:
# Hyperparameters for training
bs = 16  # Batch size
epochs = 2  # Number of epochs
lr = 8e-5  # Learning rate

# Set training arguments
args = TrainingArguments(
    output_dir='outputs',  # Directory to save model checkpoints
    learning_rate=lr,  # Learning rate
    warmup_ratio=0.1,  # Ratio of warmup steps
    lr_scheduler_type='cosine',  # Type of learning rate scheduler
    fp16=True,  # Enable mixed precision
    eval_strategy="epoch",  # Evaluation strategy: evaluate at the end of each epoch
    per_device_train_batch_size=bs,  # Batch size per device during training
    per_device_eval_batch_size=bs,  # Batch size per device during evaluation
    num_train_epochs=epochs,  # Number of epochs to train
    weight_decay=0.01,  # Weight decay for regularization
    report_to='none'  # Disable logging to external services
)

# Function to compute evaluation metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}  # Return accuracy as a dictionary

# Initialize Trainer for model training
trainer = Trainer(
    model=model,  # Model to be trained
    args=args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=eval_dataset,  # Evaluation dataset
    processing_class=tokenizer,  # Tokenizer for text preprocessing
    compute_metrics=compute_metrics  # Function to compute evaluation metrics
)

In [12]:
# Train the model
trainer.train()

  0%|          | 0/37500 [00:00<?, ?it/s]

{'loss': 0.4252, 'grad_norm': 7.222513198852539, 'learning_rate': 1.0624e-05, 'epoch': 0.03}
{'loss': 0.2701, 'grad_norm': 2.50805401802063, 'learning_rate': 2.129066666666667e-05, 'epoch': 0.05}
{'loss': 0.2477, 'grad_norm': 8.326554298400879, 'learning_rate': 3.195733333333334e-05, 'epoch': 0.08}
{'loss': 0.2443, 'grad_norm': 8.237428665161133, 'learning_rate': 4.262400000000001e-05, 'epoch': 0.11}
{'loss': 0.2659, 'grad_norm': 3.3289244174957275, 'learning_rate': 5.3290666666666675e-05, 'epoch': 0.13}
{'loss': 0.239, 'grad_norm': 1.0574089288711548, 'learning_rate': 6.395733333333333e-05, 'epoch': 0.16}
{'loss': 0.2384, 'grad_norm': 11.564011573791504, 'learning_rate': 7.460266666666667e-05, 'epoch': 0.19}
{'loss': 0.2413, 'grad_norm': 7.556215763092041, 'learning_rate': 7.998942800469363e-05, 'epoch': 0.21}
{'loss': 0.2393, 'grad_norm': 2.0158541202545166, 'learning_rate': 7.99033396418314e-05, 'epoch': 0.24}
{'loss': 0.2312, 'grad_norm': 9.818373680114746, 'learning_rate': 7.97308

  0%|          | 0/3750 [00:00<?, ?it/s]

{'eval_loss': 0.21029998362064362, 'eval_accuracy': 0.9292333333333334, 'eval_runtime': 138.1795, 'eval_samples_per_second': 434.218, 'eval_steps_per_second': 27.139, 'epoch': 1.0}
{'loss': 0.1804, 'grad_norm': 10.898591995239258, 'learning_rate': 4.6064234419363564e-05, 'epoch': 1.01}
{'loss': 0.1723, 'grad_norm': 10.273165702819824, 'learning_rate': 4.4221869024504e-05, 'epoch': 1.04}
{'loss': 0.1576, 'grad_norm': 12.35455322265625, 'learning_rate': 4.236667969698144e-05, 'epoch': 1.07}
{'loss': 0.1628, 'grad_norm': 0.5494061708450317, 'learning_rate': 4.0506364668240464e-05, 'epoch': 1.09}
{'loss': 0.1514, 'grad_norm': 9.472646713256836, 'learning_rate': 3.864495296638388e-05, 'epoch': 1.12}
{'loss': 0.1548, 'grad_norm': 0.13523219525814056, 'learning_rate': 3.678647599466424e-05, 'epoch': 1.15}
{'loss': 0.1565, 'grad_norm': 3.466481924057007, 'learning_rate': 3.4934958800362105e-05, 'epoch': 1.17}
{'loss': 0.1473, 'grad_norm': 10.717649459838867, 'learning_rate': 3.3094411357429956

  0%|          | 0/3750 [00:00<?, ?it/s]

{'eval_loss': 0.18882055580615997, 'eval_accuracy': 0.94455, 'eval_runtime': 140.3408, 'eval_samples_per_second': 427.531, 'eval_steps_per_second': 26.721, 'epoch': 2.0}
{'train_runtime': 5731.7244, 'train_samples_per_second': 104.681, 'train_steps_per_second': 6.543, 'train_loss': 0.1877202207438151, 'epoch': 2.0}


TrainOutput(global_step=37500, training_loss=0.1877202207438151, metrics={'train_runtime': 5731.7244, 'train_samples_per_second': 104.681, 'train_steps_per_second': 6.543, 'total_flos': 5.165557103457062e+16, 'train_loss': 0.1877202207438151, 'epoch': 2.0})

### Evaluate Model

In [None]:
# Prepare the test dataset
test_df = read_df('dataset/test.csv')
test_df = prepare_dataframe(test_df)

samples_test = pd.concat([test_df[test_df['labels'] == 0].sample(25000, random_state=random_state), 
                          test_df[test_df['labels'] == 1].sample(25000, random_state=random_state)])

hf_test_dataset = Dataset.from_pandas(samples_test)

test_dataset = hf_test_dataset.map(tok_func, batched=True)

In [None]:
# Test the model using the test dataset
test_results = trainer.predict(test_dataset)
print(test_results.metrics)

  0%|          | 0/3125 [00:00<?, ?it/s]

### Inference

In [None]:
from transformers import pipeline
classifier = pipeline("sentiment-analysis", model="outputs/checkpoint-37500", device='cuda')

In [25]:
text = '''Great Product!'''
classifier(text)

[{'label': 'POSITIVE', 'score': 0.9973002076148987}]