# **Sentiment Analysis on Yelp Reviews using DistilBERT**

## Introduction:

This project fine-tunes a DistilBERT model for sentiment analysis on Yelp reviews. The goal is to classify each review as positive, negative, or neutral. We fine-tune DistilBERT on a random sample of 8,000 Yelp reviews (80% used for training, 20% held out for testing) and evaluate its performance.

## Install Necessary Libraries

In [None]:
# !pip install transformers
# !pip install torch scikit-learn pandas

## Load and Preprocess Data

In [None]:
import pandas as pd

# Parquet shards of the Yelp review dataset on the Hugging Face Hub, keyed by split name
splits = {
    'train': 'yelp_review_full/train-00000-of-00001.parquet',
    'test': 'yelp_review_full/test-00000-of-00001.parquet',
}

# Only the training shard is read here
df = pd.read_parquet(f"hf://datasets/Yelp/yelp_review_full/{splits['train']}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import pandas as pd

# The Yelp reviews were already loaded into `df` from the parquet shard in
# the previous cell, so nothing is re-loaded here.

# Display the first 10 rows of the dataset
df.head(10)

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...
5,4,Top notch doctor in a top notch practice. Can'...
6,4,Dr. Eric Goldberg is a fantastic doctor who ha...
7,0,I'm writing this review to give you a heads up...
8,1,Wing sauce is like water. Pretty much a lot of...
9,2,Decent range somewhat close to the city. The ...


In [None]:
# Keep the review text and its label, and give both columns descriptive names
column_names = {'text': 'Review Text', 'label': 'Rating'}
df = df[list(column_names)].rename(columns=column_names)

# Preview the renamed columns
df.head()

Unnamed: 0,Review Text,Rating
0,dr. goldberg offers everything i look for in a...,4
1,"Unfortunately, the frustration of being Dr. Go...",1
2,Been going to Dr. Goldberg for over 10 years. ...,3
3,Got a letter in the mail last week that said D...,3
4,I don't know what Dr. Goldberg was like before...,0


In [None]:
# Keep only the reviews that have both a text and a rating
required_columns = ['Review Text', 'Rating']
df = df.dropna(axis=0, how='any', subset=required_columns)

# Report (rows, columns) once incomplete rows are removed
df.shape


(650000, 2)

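To classify the reviews into positive, neutral, and negative sentiment, we map the star rating as follows (note that the dataset's `label` column is 0-indexed, i.e. `label = stars - 1`, so labels run from 0 to 4):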
* **Rating 1-2:** Negative sentiment
* **Rating 3**: Neutral sentiment
* **Rating 4-5:** Positive sentiment

In [None]:
# Map ratings to sentiment labels
def map_rating_to_sentiment(rating, zero_indexed=True):
    """Map a Yelp rating to a sentiment class id.

    The ``label`` column of the Yelp review dataset (renamed to 'Rating'
    in this notebook) is 0-indexed: label 0 is a 1-star review and label 4
    is a 5-star review. Comparing those labels directly against the star
    values 3, 4 and 5 shifts every class by one star (4-star reviews become
    neutral, 3-star reviews become negative), so the label is converted to
    a star count first.

    Args:
        rating: The rating to convert. A 0-4 dataset label when
            ``zero_indexed`` is true, otherwise a 1-5 star count.
        zero_indexed: Whether ``rating`` is a 0-indexed dataset label
            (the default) rather than a star count.

    Returns:
        0 for negative (1-2 stars), 1 for neutral (3 stars) and
        2 for positive (4-5 stars).

    Raises:
        ValueError: If the resulting star count is outside 1-5.
    """
    stars = rating + 1 if zero_indexed else rating

    # Fail loudly instead of silently filing an unexpected value as negative
    if not 1 <= stars <= 5:
        raise ValueError(f"rating {rating!r} is outside the expected range")

    if stars >= 4:
        return 2  # Positive
    if stars == 3:
        return 1  # Neutral
    return 0  # Negative

# Derive the sentiment class id of every review from its rating
sentiment_ids = df['Rating'].apply(map_rating_to_sentiment)
df['sentiment'] = sentiment_ids

# The raw rating is redundant once the sentiment column exists
df = df.drop('Rating', axis='columns')

# Preview the mapped data
df.head()


Unnamed: 0,Review Text,sentiment
0,dr. goldberg offers everything i look for in a...,2
1,"Unfortunately, the frustration of being Dr. Go...",0
2,Been going to Dr. Goldberg for over 10 years. ...,1
3,Got a letter in the mail last week that said D...,1
4,I don't know what Dr. Goldberg was like before...,0


In [None]:
# Take a random sample of 8,000 reviews; random_state fixes the seed so the
# same rows are drawn on every run
df = df.sample(8_000, random_state=42)

# Display the shape of the sampled dataset
df.shape

(8000, 2)

## Prepare the Dataset for DistilBERT

In [None]:
from sklearn.model_selection import train_test_split

# Pull the model inputs and targets out of the frame as plain Python lists
review_texts = df['Review Text'].tolist()
sentiment_labels = df['sentiment'].tolist()

# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible
train_texts, test_texts, train_labels, test_labels = train_test_split(
    review_texts, sentiment_labels, test_size=0.2, random_state=42
)

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

# Define the CustomDataset class for handling tokenization
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text at a time on access.

    Args:
        texts: Indexable sequence of raw texts.
        labels: Indexable sequence of labels aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=max_len, return_tensors="pt")``
            whose result provides ``input_ids`` and ``attention_mask``.
        max_len: Length each example is truncated or padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        # A mismatch would otherwise surface later as an IndexError, or
        # silently ignore the surplus labels
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return the model inputs.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` and ``labels``.
        """
        text = str(self.texts[idx])
        label = torch.tensor(self.labels[idx])

        encoding = self.tokenizer(text, truncation=True, padding="max_length",
                                  max_length=self.max_len, return_tensors="pt")

        # The tokenizer output is expected to carry a leading batch dimension
        # of size 1. Drop only that dimension: a bare squeeze() would also
        # collapse the sequence dimension whenever max_len == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': label
        }

# Tokenizer that belongs to the pretrained DistilBERT checkpoint
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Wrap each split in a dataset that tokenizes examples on access
train_dataset = CustomDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = CustomDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Load the DistilBERT Model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the DistilBERT model with a classification head for 3 classes.
# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3).to(device)

## Define Training Arguments and Trainer

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Define the training arguments: checkpoints go to ./output, batches of 16
# per device for both training and evaluation, learning rate 2e-5, 3 epochs,
# and one evaluation pass at the end of every epoch.
# NOTE(review): recent transformers releases rename `evaluation_strategy`
# to `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=3,
    evaluation_strategy='epoch'
)

# Define the metrics computation function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1, precision and recall.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=-1)

    # Scorers that share the same class-weighted averaging
    weighted_scorers = {
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }

    results = {'accuracy': accuracy_score(references, predicted)}
    for metric_name, scorer in weighted_scorers.items():
        results[metric_name] = scorer(references, predicted, average='weighted')
    return results

# Initialize the Trainer. The held-out test split doubles as the evaluation
# set, so the metrics from compute_metrics are computed on it.
# NOTE(review): recent transformers releases replace the `tokenizer`
# argument with `processing_class` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

## Model Training

In [None]:
# Fine-tune the model using the Trainer configured above
trainer.train()

# Save the fine-tuned model to the "distilbert_finetuned_yelp" directory
trainer.save_model("distilbert_finetuned_yelp")

## Model Evaluation

In [None]:
# Run one evaluation pass over the evaluation set given to the Trainer
metrics = trainer.evaluate()

# Report every metric with four decimals
for title, key in (
    ("Accuracy", "eval_accuracy"),
    ("F1 Score", "eval_f1"),
    ("Precision", "eval_precision"),
    ("Recall", "eval_recall"),
):
    print(f"{title}: {metrics[key]:.4f}")

## Visualization and Confusion Matrix

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Predict on the held-out test set. `predict` returns the true labels along
# with the raw model outputs, so no separate `evaluate()` pass is needed.
logits, labels, _ = trainer.predict(test_dataset)
predictions = np.argmax(logits, axis=1)

# Define the id2label mapping
id2label = {0: "Negative", 1: "Neutral", 2: "Positive"}

# Pass the class ids explicitly so the matrix is always 3x3 and lines up with
# the display labels, even if a class is missing from both the true and the
# predicted labels.
cm = confusion_matrix(labels, predictions, labels=list(id2label))

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(id2label.values()))
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix - DistilBERT Sentiment Analysis')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot the distribution of sentiment classes in the sampled data.
# The plotted column is `sentiment` (not the star rating), so the title and
# axis label refer to sentiment.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(x=df['sentiment'], palette='viridis', ax=ax)
ax.set_title('Count of Reviews by Sentiment')
ax.set_xlabel('Sentiment (0 = Negative, 1 = Neutral, 2 = Positive)')
ax.set_ylabel('Count')
plt.show()