# Initial Setup

In [1]:
import numpy as np
import pandas as pd

## Requirements

In [None]:
%pip install transformers evaluate langchain datasets

## Config

In [None]:
from enum import Enum

# Run-mode switches read by the cells below.
local = False          # True: read data from ./data/ instead of Google Drive
drop_zeros = True      # rating task only: discard reviews rated 0
fine_tuning = True     # rating task only: shift the kept ratings down by one
generate_dset = False  # True: build and push the splits; False: load them from the Hub

# The two supervised tasks this notebook can run, and the one currently selected.
Task = Enum('Task', ['RATING_REGRESSION', 'CATEG_CLASSIFICATION'])
task = Task.CATEG_CLASSIFICATION

# Pick the folder that holds the input data.
if not local:
    # Colab run: the data sits in the user's Google Drive, which must be mounted.
    from google.colab import files, drive
    base_path = '/content/drive/My Drive/'
    drive.mount('/content/drive/')
else:
    # Local run: the data sits next to the notebook.
    base_path = './data/'

# Task-dependent settings: the target column and the prefix of the Hub dataset names.
if task != Task.RATING_REGRESSION:
    target = 'category'
    # The spelling below is intentional: it is part of the published dataset names.
    dset_prefix = 'categ_classfication'
    labels = [
        'history_biography',
        'romance',
        'fantasy_paranormal',
        'young_adult',
        'poetry',
        'comics_graphic',
        'mystery_thriller_crime',
        'children',
    ]
    # Two-way mapping between category names and integer class ids.
    id2label = dict(enumerate(labels))
    label2id = {name: idx for idx, name in id2label.items()}
else:
    target = 'rating'
    # A separate dataset is used when the 0-star reviews are dropped.
    dset_prefix = 'rating_regression' + ('_no0' if drop_zeros else '')

# Checkpoint used for both the tokenizer and the sequence classifier below.
model_name = "distilbert/distilbert-base-uncased"

from huggingface_hub import notebook_login
# Interactive Hugging Face login; later cells push datasets and the model to the Hub.
notebook_login()

import os
# Sets WANDB_DISABLED before training; presumably keeps the Trainer from
# logging to Weights & Biases -- confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

## Load Data

In [None]:
# Read the review dataset from the selected folder and preview the first rows.
path = f"{base_path}dataset.json"
data = pd.read_json(path)
print(data.head())

# Preprocessing

In [None]:
# Summary statistics of the numeric columns.
print(data.describe())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the report.
data.info()

In [None]:
# Prepare the target column for the selected task, then normalise the text.
if task == Task.RATING_REGRESSION:
    if drop_zeros:
        # Keep the 0-rated reviews around for inspection.
        zeros = data[data['rating'] == 0]
        # Drop the 0 ratings. The explicit copy makes `data` an independent
        # frame, so the column assignments below do not raise pandas'
        # SettingWithCopyWarning on a filtered slice.
        data = data[data['rating'] != 0].copy()
        if fine_tuning:
            # Shift the remaining ratings down by one so that class ids start at 0.
            data['rating'] = data['rating'] - 1
else:
    # Replace category names with their integer class ids.
    data['category'] = data['category'].map(label2id)

# Lower-case the review text.
data['review_text'] = data['review_text'].str.lower()
data.head()

In [None]:
# Preview the first rows of the preprocessed data.
data.head()

In [None]:
# Print the column summary of the preprocessed data.
data.info()

# Zero-Shot

In [None]:
#from langchain_huggingface.llms import HuggingFacePipeline

import json
import random
from transformers import pipeline

def rate_reviews(data, n_blocks=8, block_size=10000, sample_size=100, output_file='review_ratings.json'):
    """Zero-shot rate the leading reviews of each block and save them as JSON.

    ``data['review_text']`` is walked in up to ``n_blocks`` consecutive blocks
    of ``block_size`` entries. From each block the first ``sample_size``
    reviews (not a random sample) are classified with the
    ``facebook/bart-large-mnli`` zero-shot pipeline, and the top label is
    stored together with the review text.

    Args:
        data: Object whose ``'review_text'`` entry supports ``len()`` and
            slicing and yields review strings.
        n_blocks: Maximum number of blocks to visit.
        block_size: Number of entries per block.
        sample_size: Number of reviews taken from the start of each block.
        output_file: Path of the JSON file the results are written to.

    Returns:
        None. The results are written to ``output_file`` as a list of
        ``{"review": ..., "rating": ...}`` objects.
    """
    # Load the zero-shot classification model.
    pipe = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

    # Labels the classifier chooses between.
    candidate_labels = ["0 stars", "1 stars", "2 stars", "3 stars", "4 stars", "5 stars"]

    # Instruction prepended to every review; it never changes, so build it once.
    prompt = "Your objective is to read user reviews for books and determine the final rating given by the user on a scale of 0 to 5 stars. Being 0 the lowest score and 5 the highest."

    reviews = data['review_text']
    results = []

    for block_num in range(n_blocks):
        start_idx = block_num * block_size
        end_idx = min(start_idx + block_size, len(reviews))

        # Past the end of the data: every later block would be empty as well.
        if end_idx <= start_idx:
            break

        # Take the first `sample_size` reviews of the current block.
        selected_reviews = reviews[start_idx:end_idx][:sample_size]

        for review_text in selected_reviews:
            # The separating space keeps the first word of the review from
            # being glued onto the last word of the instruction.
            output = pipe(prompt + " " + review_text, candidate_labels)

            # Keep the first (top-ranked) label next to the review it belongs to.
            results.append({
                "review": review_text,
                "rating": output['labels'][0]
            })

    # Save the results to a JSON file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4)

# Run the function on 8 blocks of 10,000 entries, rating the first review (sample_size=1) of each block.
rate_reviews(data, n_blocks=8, block_size=10000, sample_size=1, output_file='review_ratings.json')


In [None]:
# Google Drive is only available on Colab; skip the mount on local runs, where
# the `google.colab` package is not installed and the import would fail.
if not local:
    from google.colab import drive
    drive.mount('/content/drive')

# Few Shot

In [None]:

from transformers import pipeline

model_id = "facebook/bart-large-mnli"
pipe = pipeline("zero-shot-classification", model=model_id)

candidate_labels = ["0 Stars", "1 Stars", "2 Stars", "3 Stars", "4 Stars", "5 Stars"]

# Rows are picked by position (.iloc), not by index label: after rows have been
# filtered out of `data` the labels 10 and 17-19 may no longer exist, and the
# inspection cell that follows also looks the query up by position.
example_positions = [17, 18, 19]
query_position = 10

# The three worked examples shown to the model.
examples_text = [data['review_text'].iloc[pos] for pos in example_positions]
examples_ratings = [data['rating'].iloc[pos] for pos in example_positions]

# The review to classify and its ground-truth rating.
query = data['review_text'].iloc[query_position]
label = data['rating'].iloc[query_position]

# The scale is stated as 0 to 5 throughout, matching the candidate labels.
prompt = """Your objective is to read user reviews for books and determine the final rating given by the user on a scale of 0 to 5 stars. Being 0 the lowest score and 5 the highest. First you'll get 3 examples of
Text: {review text}
Rating: {rating on a scale of 0 to 5}

Examples:
"""

# One "Text / Rating" pair per example, followed by the unrated query text.
shots = "".join(
    "\n Text:" + text + "\n Rating:" + str(rating)
    for text, rating in zip(examples_text, examples_ratings)
)
fo = prompt + shots + "\n Text:" + query

output2 = pipe(fo, candidate_labels)

print(output2)


In [None]:
# Show the few-shot query text and its ground-truth rating, then display the
# DataFrame row at position 10 for reference.
print(query)
print(label)
data.iloc[10:11]


# Fine-tuning Approach

## Generate or Load Dataset

In [None]:
# Hub repository prefix shared by the train / test / remainder datasets.
path = 'pppereira3/' + dset_prefix

if generate_dset:
    from datasets import Dataset
    from sklearn.model_selection import train_test_split

    # Sample & split the dataset.
    print(len(data))  # 80k

    # Select and rename in a single expression. Renaming in place on a column
    # selection of `data` can raise pandas' SettingWithCopyWarning.
    to_sample = data[['review_text', target]].rename(columns={"review_text": 'text'})

    # frac=0.0625 would give about 5k samples instead.
    sample = to_sample.sample(frac=0.125)

    print('Sample:')
    print(sample.head())
    print(len(sample))

    # Everything that was not sampled.
    remainder = to_sample[~to_sample.index.isin(sample.index)]
    print('Remainder:')
    print(remainder.head())
    print(len(remainder))

    # 70 / 30 train / test split of the sample.
    x_train, x_test, y_train, y_test = train_test_split(sample['text'], sample[target], test_size=0.3)

    def to_dataset(x_train, x_test, y_train, y_test, remainder):
        """Wrap the splits and the unsampled remainder as Datasets.

        Each returned Dataset has a 'text' and a 'label' column.

        Returns:
            Tuple ``(train_set, test_set, remainder_set)``.
        """
        train_set = Dataset.from_dict({
            'text': x_train,
            'label': y_train,
        })
        test_set = Dataset.from_dict({
            'text': x_test,
            'label': y_test,
        })
        remainder_set = Dataset.from_dict({
            'text': remainder['text'],
            'label': remainder[target],
        })
        return train_set, test_set, remainder_set

    train_set, test_set, remainder_set = to_dataset(x_train, x_test, y_train, y_test, remainder)

    # Publish the three datasets on the Hub.
    train_set.push_to_hub(path + '_train.hf')
    test_set.push_to_hub(path + '_test.hf')
    remainder_set.push_to_hub(path + '_remainder.hf')
else:
    from datasets import load_dataset

    # Reuse the previously published datasets; each one is stored under a single 'train' split.
    train_set = load_dataset(path + '_train.hf')['train']
    test_set = load_dataset(path + '_test.hf')['train']


In [None]:
# Sanity check: print the first example of the test set.
print(test_set.take(1)[0])

## Preprocess

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Number of output classes: one per category, or one per rating value
# (six ratings, five when the 0 ratings are dropped).
if task != Task.RATING_REGRESSION:
    num_labels = len(labels)
else:
    num_labels = 5 if drop_zeros else 6


def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of examples with truncation enabled."""
    batch_text = examples["text"]
    return tokenizer(batch_text, truncation=True)


# Tokenize both splits in batches.
token_train_data = train_set.map(preprocess_function, batched=True)
token_test_data = test_set.map(preprocess_function, batched=True)

# Collator that pads each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# The category task additionally passes its label mappings to the model.
model_kwargs = {"num_labels": num_labels}
if task != Task.RATING_REGRESSION:
    model_kwargs["label2id"] = label2id
    model_kwargs["id2label"] = id2label

model = AutoModelForSequenceClassification.from_pretrained(model_name, **model_kwargs)

print(model)

### Prepare evaluation

In [19]:
import evaluate

mse_metric = evaluate.load('mse')
accuracy_metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with ``"accuracy"``, plus ``"mse"`` when the rating task is selected.
    """
    scores, references = eval_pred
    # Predicted class = index of the largest logit.
    predicted = np.argmax(scores, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predicted, references=references)
    metrics = {"accuracy": accuracy["accuracy"]}

    # MSE is only reported for the rating task.
    if task == Task.RATING_REGRESSION:
        mse = mse_metric.compute(predictions=predicted, references=references)
        metrics["mse"] = mse["mse"]

    return metrics



## Train Models

In [None]:
training_args = TrainingArguments(
    output_dir="dbert-categ-classification",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, keeping the best model at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# Stop after 3 evaluations without an improvement of at least 0.001.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_threshold=0.001,
    early_stopping_patience=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=token_train_data,
    eval_dataset=token_test_data,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()