# Loading Dataset

In [1]:
import numpy as np
import pandas as pd

## Requirements

In [None]:
%pip install transformers evaluate langchain datasets

## Config

In [16]:
# Run-time switches used by the cells below:
#   local       - read/write under ./data instead of Google Drive (Colab)
#   drop_zeros  - remove reviews whose rating is 0 during preprocessing
#   fine_tuning - shift the remaining ratings down by one during preprocessing
local = True
drop_zeros = True
fine_tuning = True

from enum import Enum
Task = Enum('Task', 'RATING_REGRESSION CATEG_CLASSIFICATION')
task = Task.RATING_REGRESSION

# Column the models are trained to predict. The rating task must target the
# 'rating' column (the preprocessing and label count are built around it);
# the category task targets 'category'.
if task == Task.RATING_REGRESSION:
    target = 'rating'
else:
    target = 'category'
# from huggingface_hub import notebook_login

# notebook_login()
import os
# Turn off Weights & Biases logging.
os.environ["WANDB_DISABLED"] = "true"

## Load Data

In [None]:
# Choose where dataset.json is read from: the local ./data folder, or Google
# Drive (mounted first) when not running locally.
if not local:
    from google.colab import files, drive
    drive.mount('/content/drive/')
    path = '/content/drive/My Drive/' + 'dataset.json'
else:
    path = './data/' + 'dataset.json'

# Load the JSON file into a DataFrame and take a first look at it.
data = pd.read_json(path)
print(data.head())
print(data['category'].unique())

# Preprocessing

In [None]:
# Summary statistics of the dataset.
print(data.describe())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
data.info()

In [24]:
if task == Task.RATING_REGRESSION:
    if drop_zeros:
        # Keep the zero-rated rows aside before removing them from `data`.
        zeros = data[data['rating'] == 0]
        # .copy() makes the filtered frame independent of the original, so the
        # column assignments below do not raise SettingWithCopyWarning.
        data = data[data['rating'] != 0].copy()  # dropping 0 ratings
        if fine_tuning:
            # Shift the remaining ratings down by one so they start at 0
            # (presumably so they can be used as class indices when fine-tuning).
            data['rating'] = data['rating'] - 1
else:
    # No task-specific preprocessing for category classification yet.
    print('TBD')
# Lower-case every review, whatever the task.
data['review_text'] = data['review_text'].str.lower()

In [None]:
# Preview the first rows of the data after preprocessing.
data.head()

In [None]:
# Print the DataFrame summary after preprocessing.
data.info()

# Zero-Shot

In [None]:
#from langchain_huggingface.llms import HuggingFacePipeline

import json
import random
from transformers import pipeline

def rate_reviews(data, n_blocks=8, block_size=10000, sample_size=100, output_file='review_ratings.json'):
    """Rate a sample of reviews with a zero-shot classifier and save them as JSON.

    The reviews are walked in consecutive blocks of ``block_size`` rows. The
    first ``sample_size`` reviews of each block are classified against six
    star labels, and the first (top-ranked) label returned by the pipeline is
    stored with the review text.

    Args:
        data: Table with a ``'review_text'`` column of strings (the notebook
            passes the preprocessed DataFrame).
        n_blocks: Maximum number of blocks to visit.
        block_size: Number of consecutive reviews in each block.
        sample_size: How many reviews to take from the start of each block.
        output_file: Path of the JSON file to write. It receives a list of
            ``{"review": ..., "rating": ...}`` objects.
    """
    # Load the zero-shot classification model.
    pipe = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

    # Labels the classifier chooses from.
    candidate_labels = ["0 stars", "1 stars", "2 stars", "3 stars", "4 stars", "5 stars"]

    # The instruction is identical for every review, so it is built once.
    prompt = "Your objective is to read user reviews for books and determine the final rating given by the user on a scale of 0 to 5 stars. Being 0 the lowest score and 5 the highest."

    reviews = data['review_text']
    n_reviews = len(reviews)

    # Accumulates one entry per classified review.
    results = []

    for block_num in range(n_blocks):
        start_idx = block_num * block_size
        end_idx = min(start_idx + block_size, n_reviews)

        # Blocks that start past the end of the data are empty.
        if end_idx <= start_idx:
            continue

        # Take the first `sample_size` reviews of the block (no random sampling).
        selected_reviews = reviews[start_idx:end_idx][:sample_size]

        for review_text in selected_reviews:
            # Put the review on its own line so the last word of the
            # instruction does not run into the first word of the review.
            output = pipe(f"{prompt}\n{review_text}", candidate_labels)

            # Keep the review together with the top-ranked label.
            results.append({
                "review": review_text,
                "rating": output['labels'][0],
            })

    # Save the results to a JSON file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4)

# Run the function over 8 blocks of 10,000 reviews, taking the first `sample_size`
# review(s) of each block (sample_size=1 in this call).
rate_reviews(data, n_blocks=8, block_size=10000, sample_size=1, output_file='review_ratings.json')


In [None]:
# Google Drive is only needed on Colab. Skip the mount when running locally,
# where the google.colab package is normally not installed.
if not local:
    from google.colab import drive
    drive.mount('/content/drive')

# Few Shot

In [None]:

from transformers import pipeline

model_id = "facebook/bart-large-mnli"
pipe = pipeline("zero-shot-classification", model=model_id)

# Labels the classifier chooses from.
candidate_labels = ["0 Stars", "1 Stars", "2 Stars", "3 Stars", "4 Stars", "5 Stars"]

# Rows are picked by position (.iloc). After rows have been dropped during
# preprocessing the index labels are no longer contiguous, so a label lookup
# such as data['review_text'][17] can raise KeyError or point at a different
# row than the positional data.iloc[...] view used to inspect the query.
examples_text = data['review_text'].iloc[17:20].tolist()
examples_ratings = data['rating'].iloc[17:20].tolist()

# The review to classify and its stored rating.
query = data['review_text'].iloc[10]
label = data['rating'].iloc[10]

# The scale is stated as 0 to 5 throughout, matching the candidate labels.
prompt = """Your objective is to read user reviews for books and determine the final rating given by the user on a scale of 0 to 5 stars. Being 0 the lowest score and 5 the highest. First you'll get 3 examples of
Text: {review text}
Rating: {rating on a scale of 0 to 5}

Examples:
"""

# Few-shot prompt: instruction, then each example as "Text:/Rating:" lines,
# then the query text.
fo = (
    prompt
    + "".join(
        f"\n Text:{text}\n Rating:{rating}"
        for text, rating in zip(examples_text, examples_ratings)
    )
    + "\n Text:"
    + query
)

output2 = pipe(fo, candidate_labels)

print(output2)


In [None]:
# Show the query review, the rating stored for it, and the row at position 10 of `data`.
print(query)
print(label)
data.iloc[10:11]


# Fine-tuning Approach

## Sample Dataset

In [None]:
# Sample & Split Dataset
print(len(data)) # 80k

# Keep only the review text and the target column. rename() returns a new
# frame here; renaming in place on a column selection raises
# SettingWithCopyWarning.
to_sample = data[['review_text', target]].rename(columns={"review_text": 'text'})

#sample = data.sample(frac=0.0625) # gives us 5k samples
# Draw 12.5% of the rows at random; this is the part used for train/test.
sample = to_sample.sample(frac=0.125)

print('Sample:')
print(sample.head())
print(len(sample))

# Everything that was not sampled.
remainder = to_sample[~to_sample.index.isin(sample.index)]
print('Remainder:')
print(remainder.head())
print(len(remainder))

## Split Dataset

In [None]:
from sklearn.model_selection import train_test_split


# Split the sampled texts and targets into train and test parts, holding out 30% for testing.
x_train, x_test, y_train, y_test = train_test_split(sample['text'], sample[target], test_size=0.3)

from datasets import Dataset

def to_dataset(x_train, x_test, y_train, y_test):
    """Wrap the train/test splits and the unsampled remainder in Dataset objects.

    Each dataset has a ``'text'`` and a ``'label'`` column. The remainder
    dataset is built from the module-level ``remainder`` frame and ``target``
    column name, not from the arguments.

    Returns:
        Tuple ``(train_set, test_set, remainder_set)``.
    """
    def _build(texts, labels):
        # One dataset with the two columns every split shares.
        return Dataset.from_dict({'text': texts, 'label': labels})

    return (
        _build(x_train, y_train),
        _build(x_test, y_test),
        _build(remainder['text'], remainder[target]),
    )

train_set, test_set, remainder_set = to_dataset(x_train, x_test, y_train, y_test)

# Folder the datasets are written to.
if local:
    path = './data/'
else:
    # Google Drive is mounted under /content/drive, so the output folder must
    # carry that prefix; '/drive/MyDrive/' is not inside the mount.
    path = '/content/drive/MyDrive/'

# File-name prefix that encodes the task (and whether zero ratings were dropped).
if task == Task.RATING_REGRESSION:
    path = path + 'rating_regression'
    if drop_zeros:
        path = path + '_no0'
else:
    path = path + 'categ_classification'

train_set.save_to_disk(path + '_train.hf')
test_set.save_to_disk(path + '_test.hf')
remainder_set.save_to_disk(path + '_remainder.hf')

## Preprocess

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorWithPadding
# Checkpoint to fine-tune and the tokenizer that belongs to it.
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Number of output labels: 6, or 5 when `drop_zeros` is set.
num_labels = 5 if drop_zeros else 6

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level tokenizer.

    Truncation is enabled; no padding is requested here.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Tokenize the train and test splits batch-wise.
token_train_data, token_test_data = [
    split.map(preprocess_function, batched=True) for split in (train_set, test_set)
]

# Collator built from the tokenizer; it is handed to the Trainer to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Sequence-classification model with `num_labels` outputs.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

print(model)

### Prepare evaluation

In [None]:
import evaluate

# Load the 'mse' and 'accuracy' metrics from the evaluate library.
mse_metric = evaluate.load('mse')
accuracy_metric = evaluate.load('accuracy')

# Define a custom compute_metrics function to calculate both metrics
def compute_metrics(eval_pred):
    """Compute accuracy and mean squared error for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"mse"``.
    """
    logits, labels = eval_pred
    # Index of the highest logit along the last axis is the predicted class.
    predicted = np.argmax(logits, axis=-1)

    # Both metrics are computed on the same class predictions; each result is
    # a dict keyed by the metric's own name.
    scores = {}
    for name, metric in (("accuracy", accuracy_metric), ("mse", mse_metric)):
        scores[name] = metric.compute(predictions=predicted, references=labels)[name]
    return scores



## Train Models

In [None]:
# Training configuration.
training_args = TrainingArguments(
    output_dir="dbert-review-classification-no-0",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch, and reload the best checkpoint at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Publish to the Hugging Face Hub.
    push_to_hub=True,
)

# NOTE(review): with num_train_epochs=2 and one evaluation per epoch there are
# only two evaluations, so a patience of 3 can never stop training early.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_threshold=0.001,  # Minimum improvement threshold
    early_stopping_patience=3,       # Stop after 3 evaluations with no improvement
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=token_train_data,
    eval_dataset=token_test_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()