<a href="https://colab.research.google.com/github/TheHackerLlama/charlas/blob/main/riiaa_2021/parte_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Procesamiento de datos con datasets

In [None]:
%%capture
# Install the Hugging Face libraries used by the rest of the notebook; the
# cell output is captured to keep the notebook free of installer logs.
# NOTE(review): no versions are pinned, so a fresh run installs whatever is
# current -- confirm that `list_datasets`, `load_metric` and
# `push_to_hub_model_id` (used in later cells) exist in those versions.
!pip install transformers datasets
!pip install -U huggingface_hub
# NOTE(review): git-lfs is presumably needed to push model weights to the Hub -- confirm.
!apt-get install git-lfs

In [None]:
# Log in to the Hugging Face Hub from the command line; later cells push the
# trained model to the Hub (`push_to_hub=True`, `trainer.push_to_hub()`).
!huggingface-cli login

In [None]:
# Set the global git identity for this environment.
# NOTE(review): these are the original author's details -- replace them with
# your own e-mail and name before running.
!git config --global user.email "osanseviero@gmail.com"
!git config --global user.name "Omar Sanseviero"

In [None]:
import random
import pandas as pd
from datasets import ClassLabel
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render a random sample of distinct rows from ``dataset`` as an HTML table.

    Taken from https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb

    Args:
        dataset: Object that supports ``len()``, indexing with a list of
            integer positions, and exposes a ``features`` mapping from column
            name to feature type.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
        ValueError: If ``num_examples`` is negative (raised by ``random.sample``).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # random.sample draws without replacement, so the indices are guaranteed
    # to be distinct without a retry loop and a linear membership check.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Show the class name instead of its integer id.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [None]:
from datasets import list_datasets

# Retrieve the list of datasets available on the Hub and report how many there are.
# NOTE(review): this variable shares its name with the `datasets` package; a
# later cell relies on it (`datasets.index(...)`), so it is left unchanged.
datasets = list_datasets()
print(f"Hay {len(datasets)} datasets disponibles en el Hub.")
print(f"Los primeros 10 son: {datasets[:10]}")

In [None]:
# Look up the detailed entry for "amazon_reviews_multi" by reusing its position
# in the plain `datasets` list.
# NOTE(review): this assumes `list_datasets(with_details=True)` returns entries
# in the same order as the earlier `list_datasets()` call -- TODO confirm.
metadata = list_datasets(with_details=True)[datasets.index("amazon_reviews_multi")]

print("Description:", metadata.description, "\n")

# Show first 8 lines of the citation string
print("Citation:", "\n".join(metadata.citation.split("\n")[:8]))

In [None]:
from datasets import load_dataset

# Load the "es" configuration of the "amazon_reviews_multi" dataset and display
# the returned object as the cell output.
dataset = load_dataset("amazon_reviews_multi", "es")
dataset

In [None]:
# Display a random sample of rows from the training split (10 by default).
show_random_elements(dataset["train"])

In [None]:
# Switch the dataset's output format to pandas, take the whole training split
# as `df`, and preview its first rows.
dataset.set_format("pandas")
df = dataset["train"][:]
df.head()

In [None]:
# Count how many training reviews fall into each product category.
df["product_category"].value_counts()

In [None]:
# Count how many training reviews there are for each star rating.
df["stars"].value_counts()

In [None]:
# Undo the earlier `set_format("pandas")` so later cells get the default output format.
dataset.reset_format()

## Crear un label

In [None]:
# Keep only reviews whose rating is not exactly 3 stars; the remaining ratings
# are later collapsed into two labels.
dataset = dataset.filter(lambda review: review["stars"] != 3)

In [None]:
def merge_star_ratings(examples):
    """Collapse a star rating into a binary label.

    Ratings of 2 stars or fewer map to 0; any higher rating maps to 1.

    Args:
        examples: Mapping with a "stars" entry comparable to an integer.

    Returns:
        dict: ``{"labels": 0}`` or ``{"labels": 1}``.
    """
    is_low_rating = examples["stars"] <= 2
    return {"labels": 0 if is_low_rating else 1}

In [None]:
# Apply `merge_star_ratings` to every example, adding the "labels" column it returns.
dataset = dataset.map(merge_star_ratings)

In [None]:
# Display another random sample of training rows, now including the "labels" column.
show_random_elements(dataset["train"])

## Tokenización

In [None]:
from transformers import AutoTokenizer

# Hub identifier of the base checkpoint; the same value is reused later to load the model.
model_checkpoint = "BSC-TeMU/roberta-base-bne"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Show the size of the tokenizer's vocabulary.
tokenizer.vocab_size

In [None]:
# Encode a sample sentence into token ids.
text = "¡hola, me llamo omar!"
tokenized_text = tokenizer.encode(text)

# Print each token id next to the text it decodes to on its own.
for token in tokenized_text:
    print(token, tokenizer.decode([token]))

In [None]:
# Tokenize the same sentence by calling the tokenizer directly, requesting
# PyTorch tensors ("pt"); the result is later passed to the model.
encoded_text = tokenizer(text, return_tensors="pt")
encoded_text

In [None]:
def tokenize_reviews(examples):
    """Tokenize the ``review_body`` entry of ``examples`` with truncation enabled.

    Uses the notebook-level ``tokenizer`` and returns its output unchanged.
    """
    review_texts = examples["review_body"]
    encoded = tokenizer(review_texts, truncation=True)
    return encoded

In [None]:
# Collect every column of the training split except "labels": those columns are
# removed once the tokenizer output has been added.
# NOTE(review): `remove` mutates the list returned by `column_names` and raises
# ValueError if "labels" is absent (i.e. the labelling cell was not run) --
# presumably `column_names` returns a fresh list on each access; confirm.
columns = dataset["train"].column_names
columns.remove("labels")
# Tokenize in batches and show the resulting dataset.
encoded_dataset = dataset.map(tokenize_reviews, batched=True, remove_columns=columns)
encoded_dataset

In [None]:
# Inspect the first encoded training example.
encoded_dataset["train"][0]

## Cargar un modelo pre-entrenado

In [None]:
from transformers import AutoModelForSequenceClassification

# Two output labels, matching the 0/1 values produced by `merge_star_ratings`.
num_labels = 2
# Load the base checkpoint as a sequence-classification model with that many labels.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

In [None]:
# Re-display the tokenized sample sentence that is fed to the model next.
encoded_text

In [None]:
# Run the model on the tokenized sample sentence and display what it returns.
# NOTE(review): this runs before any fine-tuning, so the result is presumably
# not a meaningful prediction yet -- confirm.
outputs = model(**encoded_text)
outputs

## Especificar una métrica

In [None]:
from datasets import load_metric 

# Load the "accuracy" metric; `compute_metrics` uses this notebook-level object.
metric = load_metric("accuracy")
metric

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the notebook-level ``metric``.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one row
            of per-class scores per example.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class of each row
        compared against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

## Fine-tuning

In [None]:
from transformers import TrainingArguments

# Name of the base checkpoint without its organisation prefix; used to build
# the Hub repository name below.
model_name = model_checkpoint.split("/")[-1]

batch_size = 16
num_train_epochs = 1  # Increase to 2
num_train_samples = 2000  # Increase to 20000
# Train on a fixed-size subset, shuffled with a fixed seed so the selection is reproducible.
train_dataset = encoded_dataset["train"].shuffle(seed=42).select(range(num_train_samples))
# The integer division yields 0 when the subset has fewer than
# 2 * batch_size * num_train_epochs rows; clamp to 1 so the logging interval
# is always a valid positive number of steps.
logging_steps = max(1, len(train_dataset) // (2 * batch_size * num_train_epochs))

# NOTE(review): `evaluation_strategy` and `push_to_hub_model_id` depend on the
# installed transformers version (which is not pinned) -- confirm they are
# still accepted before upgrading.
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=num_train_epochs,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and save once per epoch, then reload the checkpoint that scored
    # best on "accuracy" -- presumably the key returned by `compute_metrics`; confirm.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=logging_steps,
    push_to_hub=True,
    push_to_hub_model_id=f"{model_name}-finetuned-amazon_reviews_multi"
)

In [None]:
from transformers import Trainer

# Wire the model, training configuration, metric function and data together.
# Training uses the sampled `train_dataset`; evaluation uses the full encoded
# validation split.
# NOTE(review): the tokenizer is presumably passed so that it is used for batch
# padding and saved alongside the model -- confirm.
trainer = Trainer(
    model=model, 
    args=training_args, 
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer
)

In [None]:
# Run fine-tuning with the configuration defined in `training_args`.
trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub (requires the earlier login).
trainer.push_to_hub()

## Utilizando el modelo recién entrenado

In [None]:
from transformers import pipeline

# NOTE(review): this rebinds `model_checkpoint` (previously the base checkpoint)
# to a hard-coded Hub repository -- presumably the one created by
# `trainer.push_to_hub()` under the author's account; point it at your own
# namespace if you trained your own model.
model_checkpoint = "hackertec/roberta-base-bne-finetuned-amazon_reviews_multi"
# Build a sentiment-analysis pipeline backed by that fine-tuned checkpoint.
pipe = pipeline("sentiment-analysis", model=model_checkpoint)

In [None]:
# Classify a sample sentence with the fine-tuned model.
pipe("¡me encanta el ipad!")