In [None]:
from datasets import load_dataset

# Carregando um dataset local

O legal aqui seria mostrar como carregar diferentes tipos de arquivos: jsons e csvs.

In [None]:
# Load a single local JSON file with the generic "json" loader.
# field="data" tells the loader which key of the file holds the records.
squad_it_train_dataset = load_dataset(
    "json", data_files="data/SQuAD_it-train.json", field="data"
)
# Bare expression so the notebook displays the loaded object.
squad_it_train_dataset


In [None]:
# Load a local tab-separated file with the "csv" loader; the tab delimiter is
# passed explicitly because the file is a .tsv, not comma-separated.
drugs_com_train = load_dataset(
    "csv", data_files="data/drugsComTrain_raw.tsv", delimiter="\t"
)
# Bare expression so the notebook displays the loaded object.
drugs_com_train


In [None]:
# Load several JSON files in one call: the dict maps a split name
# ("train", "test") to the file that backs it.
data_files = {"train": "data/SQuAD_it-train.json", "test": "data/SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
# Bare expression so the notebook displays the loaded object.
squad_it_dataset


# Acessando os dados do dataset

In [None]:
# Index the "train" split and then the "paragraphs" column to get every
# value stored in that column.
squad_it_train_dataset["train"]["paragraphs"]

In [None]:
# Select a single column ("drugName") of the "train" split; this returns the
# whole column, not one row.
drugs_com_train["train"]["drugName"]

# Interações com o Dataset

## Amostragem

Utilizando `Dataset.shuffle()` e `Dataset.select()` em combinação, nós podemos coletar uma amostra dos dados.

In [None]:
# Shuffle the "train" split with a fixed seed (reproducible), then keep the
# rows at positions 0..999 of the shuffled result as a sample.
drug_sample = drugs_com_train["train"].shuffle(seed=42).select(range(1000))
drug_sample

## Valores únicos

Utilizando o `Dataset.unique()` nós podemos selecionar os valores únicos de alguma chave que temos interesse. Muito semelhante ao que o `Series.unique()` do pandas faria.

In [None]:
# List the distinct values found in the "condition" column of the "train" split.
drugs_com_train["train"].unique("condition")


## Renomeando colunas

### Dataset.rename_column()

In [None]:
# Rename the "Unnamed: 0" column to "patient_id".
# NOTE(review): drugs_review_dataset is not defined in the visible cells (only
# drugs_com_train is) — confirm which cell loads it; later cells index both
# its "train" and "test" splits.
new_drugs_review_dataset = drugs_review_dataset.rename_column(original_column_name="Unnamed: 0", new_column_name="patient_id")
new_drugs_review_dataset

## O método `map` e o método `filter`

### Dataset.map() and Dataset.filter()

Normally used to apply a simple function across all the rows of the dataset, for example to:
- Normalize
- Preprocess
- Transform
...


In [None]:
# Drop rows whose "condition" is None first; calling .lower() on None in the
# lowercasing step of this cell would raise.
new_drugs_review_dataset = new_drugs_review_dataset.filter(lambda x: x["condition"] is not None)

def lowercase_condition(example):
    """Return a one-key dict holding the example's "condition" text in lowercase."""
    lowered = example["condition"].lower()
    return {"condition": lowered}

# Run lowercase_condition over the dataset and keep the mapped result under
# the same name.
new_drugs_review_dataset = new_drugs_review_dataset.map(lowercase_condition)

In [None]:
# Peek at the first 10 rows of the "train" split to check the result.
new_drugs_review_dataset["train"][:10]

## Map para criar novas colunas

### Utilizando o método `Dataset.map()` para criar novas colunas


In [None]:
def compute_review_length(example) -> dict:
    """Return {"review_length": n}, n being the number of whitespace-separated words in the review."""
    words = example["review"].split()
    return {"review_length": len(words)}

# Run compute_review_length over the dataset; the "review_length" key it
# returns is used by the sorting and filtering cells below.
new_drugs_review_dataset = new_drugs_review_dataset.map(compute_review_length)

## Ordenando o Dataset

### Utilizando o método `Dataset.sort()` para ordenar o dataset por alguma coluna

In [None]:
# Sort the "train" split by "review_length" and show the first 3 rows of the
# sorted result.
new_drugs_review_dataset["train"].sort("review_length")[:3]

In [None]:
# Show the last 3 rows of the "train" split sorted by "review_length".
# The slice has to be [-3:]: [-3:-1] stops before the final row, so it would
# return only two rows and leave out the last (longest) review.
new_drugs_review_dataset["train"].sort("review_length")[-3:]

# Utilizando o map para obter mais performance

## O parâmetro `batched`

Now, instead of receiving a single dictionary-like example, the function passed to `map` receives a whole batch: a dictionary whose values are lists of examples to preprocess.

In [None]:
# Keep only reviews whose "review_length" is greater than 30 (reviews with 30
# words or fewer are dropped).
new_drugs_review_dataset = new_drugs_review_dataset.filter(lambda x: x["review_length"] > 30)
# Display how many rows remain after filtering.
new_drugs_review_dataset.num_rows

In [None]:
# Quick demonstration of html.unescape on a string containing an HTML
# character reference (&#039;), before applying it to the reviews.
import html

text = "I&#039;m a transformer called BERT"
html.unescape(text)

In [None]:
# Unescape HTML character references in every review.
# .map() returns a new dataset instead of modifying the existing one, so the
# result is assigned back; without the assignment the cleaned text would be
# thrown away and the later tokenization/saving cells would still see the
# escaped reviews.
new_drugs_review_dataset = new_drugs_review_dataset.map(
    lambda x: {"review": html.unescape(x["review"])}
)
# Bare expression so the notebook still displays the mapped dataset.
new_drugs_review_dataset

In [None]:
from dotenv import load_dotenv

# Presumably loads environment variables from a local .env file before the
# cells below run — confirm which variables this notebook relies on.
load_dotenv()

In [None]:
# Batched variant of the unescape step: with batched=True the function gets a
# list of reviews under "review" and must return a list of the same length.
# The result is only displayed, not assigned.
new_drugs_review_dataset.map(
    lambda x: {"review": list(map(html.unescape, x["review"]))},
    batched=True,
)


In [None]:
from transformers import AutoTokenizer, BatchEncoding

# Load the pretrained tokenizer for the "bert-base-cased" checkpoint; it is
# used as a notebook-level global by tokenize_function.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the "review" field of `examples` with the global tokenizer, truncation enabled."""
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)


In [None]:
# Tokenize the reviews in batches (batched=True), spreading the work over
# 10 worker processes (num_proc=10).
tokenized_dataset = new_drugs_review_dataset.map(tokenize_function, batched=True, num_proc=10)
tokenized_dataset

# Do formato Dataset para o `DataFrame` e vice-versa

## Utilizando o `Dataset.set_format()`

In [None]:
# Switch the output format to pandas so that indexing the dataset returns
# pandas objects instead of plain Python ones.
new_drugs_review_dataset.set_format("pandas")

A forma como o `.set_format()` funciona é varrendo o Dataset e retornando o `dataframe` para a varredura específica; por padrão, ele apenas altera o método `__getitem__()` da classe do `Dataset`.

In [None]:
# Slice the first 3 rows of "train" to see the pandas-formatted output.
new_drugs_review_dataset["train"][:3]

In [None]:
# Select every row with [:] to pull the whole "train" split into train_df.
train_df = new_drugs_review_dataset["train"][:]
train_df

In [None]:
# Count how many rows mention each condition, as a two-column frame
# ("condition", "frequency").
# The columns are named explicitly with rename_axis + reset_index(name=...):
# renaming the "index"/"condition" labels only works on pandas < 2.0, because
# pandas >= 2.0 makes value_counts() produce "condition"/"count" instead,
# which left the condition column mislabelled as "frequency".
(
    train_df["condition"]
    .value_counts()
    .rename_axis("condition")
    .reset_index(name="frequency")
)

O `.reset_format()` retorna o `__getitem__()` para o seu formato default.

In [None]:
# Undo the set_format("pandas") call made earlier in the notebook.
new_drugs_review_dataset.reset_format()

# Index a single row to check what indexing returns after the reset.
new_drugs_review_dataset["train"][0]

# `Dataset.train_test_split()` - Exatamente igual ao scikit-learn

The following code can create a validation split for us

In [None]:
# Split the original "train" data 80/20 with a fixed seed for reproducibility.
new_drugs_review_clean = new_drugs_review_dataset["train"].train_test_split(train_size=0.8, seed=42)
# train_test_split names the held-out part "test"; move it under "validation".
new_drugs_review_clean["validation"] = new_drugs_review_clean.pop("test")
# Reuse the original "test" split as the final test set.
new_drugs_review_clean["test"] = new_drugs_review_dataset["test"]

new_drugs_review_clean


# Dataset.save_to_disk(): Salvando localmente o `Dataset`

In [None]:
# Export options to remember: save_to_disk(), to_csv(), to_json(), ...
# Here the train/validation/test splits built above are written together
# with save_to_disk().

new_drugs_review_clean.save_to_disk("./data/drug-reviews")
