In [1]:
from transformers import pipeline

# Identifier of the pretrained checkpoint passed to `from_pretrained` below.
model_name = "camembert-base"

In [2]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:

from transformers import AutoTokenizer, AutoModelForMaskedLM

# NOTE(review): `num_labels` is not referenced anywhere else in the visible
# notebook; kept only so that existing kernel state is unchanged.
num_labels = 4

# Load the tokenizer and the masked-LM model for the same checkpoint, then
# move the model onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)
model = model.to(device)


- To showcase domain adaptation, we’ll use a dataset of Amazon reviews. By fine-tuning CamemBERT on this corpus, we expect the language model to adapt its vocabulary from the OSCAR dataset it was pretrained on to the more subjective language of product reviews.

In [4]:
from datasets import load_dataset

# Load the "fr" configuration of the "amazon_reviews_multi" dataset.
# NOTE(review): confirm this dataset id is still available from the Hub.
amazon_reviews = load_dataset("amazon_reviews_multi", "fr")

# Display the available splits and their columns.
amazon_reviews

Found cached dataset amazon_reviews_multi (C:/Users/radio/.cache/huggingface/datasets/amazon_reviews_multi/fr/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
})

### Preprocessing the Data

For masked language modeling, a common preprocessing step is to concatenate all the examples and then split the whole corpus into chunks of equal size. This is quite different from our usual approach, where we simply tokenize individual examples. Why concatenate everything together? The reason is that individual examples might get truncated if they’re too long, and that would result in losing information that might be useful for the language modeling task!

In [5]:
def tokenize_function(examples):
    """Tokenize a batch of reviews, attaching word ids when the tokenizer is fast.

    Args:
        examples: A batch (mapping of column name to list of values) that
            contains a "review_body" column.

    Returns:
        The tokenizer output for the review bodies. When ``tokenizer.is_fast``
        is true, a "word_ids" entry is added holding ``word_ids(i)`` for each
        encoded review ``i``.
    """
    encoded = tokenizer(examples["review_body"])
    if not tokenizer.is_fast:
        return encoded

    # One word-id list per encoded review, in the same order as "input_ids".
    word_ids_per_review = []
    for position, _ in enumerate(encoded["input_ids"]):
        word_ids_per_review.append(encoded.word_ids(position))
    encoded["word_ids"] = word_ids_per_review
    return encoded

# Tokenize every split in batches and drop all of the original columns so that
# only the tokenizer outputs remain. The column list is read from the dataset
# itself rather than hard-coded, so it cannot drift from the actual schema.
tokenized_datasets = amazon_reviews.map(
    tokenize_function,
    batched=True,
    remove_columns=amazon_reviews["train"].column_names,
)

tokenized_datasets

Loading cached processed dataset at C:\Users\radio\.cache\huggingface\datasets\amazon_reviews_multi\fr\1.0.0\724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609\cache-141d92bd3fd7cf58.arrow
Loading cached processed dataset at C:\Users\radio\.cache\huggingface\datasets\amazon_reviews_multi\fr\1.0.0\724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609\cache-dae3fa0af4d9af7e.arrow
Loading cached processed dataset at C:\Users\radio\.cache\huggingface\datasets\amazon_reviews_multi\fr\1.0.0\724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609\cache-2b42ffd2aa69d53c.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 5000
    })
})

In [6]:
# Slice out the first 15 tokenized training reviews (a dict of column -> list).
tokenized_samples = tokenized_datasets["train"][:15]

# Show how many tokens each individual review has.
for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

# Flatten every column into one long list, the same first step group_texts
# performs on a whole batch.
concatenated_examples = {
    column: [item for review in tokenized_samples[column] for item in review]
    for column in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")



'>>> Review 0 length: 31'
'>>> Review 1 length: 42'
'>>> Review 2 length: 33'
'>>> Review 3 length: 95'
'>>> Review 4 length: 108'
'>>> Review 5 length: 41'
'>>> Review 6 length: 33'
'>>> Review 7 length: 40'
'>>> Review 8 length: 17'
'>>> Review 9 length: 36'
'>>> Review 10 length: 118'
'>>> Review 11 length: 34'
'>>> Review 12 length: 52'
'>>> Review 13 length: 13'
'>>> Review 14 length: 96'
'>>> Concatenated reviews length: 789'


Let's get a feel for what kind of text we are dealing with:

In [7]:
# Shuffle the training split with a fixed seed and keep the first 10 reviews.
shuffled_train = amazon_reviews["train"].shuffle(seed=42)
sample = shuffled_train.select(range(10))

# Print each sampled review body followed by its title.
for row in sample:
    print(f"\n'>>> Review: {row['review_body']}'")
    print(f"'>>> Review title: {row['review_title']}'")

Loading cached shuffled indices for dataset at C:\Users\radio\.cache\huggingface\datasets\amazon_reviews_multi\fr\1.0.0\724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609\cache-52b1a943b852eb69.arrow



'>>> Review: Solide lampe, tiens bien sur le support caméra mais je trouve que la batterie ne tiens pas longtemps, j'ai dû en racheter une 2ème au cas où l'autre me lâche. Sinon pas mal ;-)'
'>>> Review title: Batterie moyen autonomie'

'>>> Review: Flotte... donc pas pratique pour infuser'
'>>> Review title: Flotte... donc pas pratique pour infuser'

'>>> Review: Ce produit correspond à tou tes mes attentes. Livraison rapide. Je recommande ce vendeur. Je ferais d'autres achats si besoin'
'>>> Review title: parfait'

'>>> Review: Bonjour alor pour le prix pas mal du tout facile a utiliser dommage notice en anglais mais sinon très correct'
'>>> Review title: Merci'

'>>> Review: Toujours imité : jamais égalé! Le top'
'>>> Review title: Nickel'

'>>> Review: Bon petit kit pour démarrer mais la qualité du métal laisse à désirer. Les bijoux s'oxydent rapidement et le cuivre finit par tâcher la peau avec son vert de gris...'
'>>> Review title: Bien mais pas top'

'>>> Review: achat reçu da

In [8]:
# Maximum sequence length reported by the tokenizer; the chunk size chosen
# below (128) stays well under it.
tokenizer.model_max_length

512

In [9]:
chunk_size = 128  # number of tokens in every chunk produced by group_texts


def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        examples: Mapping of column name to a list of per-example lists
            (e.g. "input_ids", "attention_mask", "word_ids"). It must contain
            an "input_ids" column, and all columns are expected to have the
            same total length once flattened.

    Returns:
        A dict with the same keys, where each value is a list of chunks of
        exactly ``chunk_size`` items, plus a "labels" key holding an
        independent copy of the "input_ids" chunks. The trailing remainder
        that does not fill a whole chunk is dropped.
    """
    # Flatten each column in a single pass. `sum(lists, [])` would rebuild the
    # accumulated list on every addition, which is quadratic in batch size.
    concatenated_examples = {
        k: [item for seq in examples[k] for item in seq] for k in examples.keys()
    }
    # Length of the flattened batch, measured on the first column.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the last chunk if it is smaller than chunk_size.
    total_length = (total_length // chunk_size) * chunk_size
    # Split every column into consecutive chunk_size-long pieces.
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Copy each chunk so that masking "input_ids" in place later on can never
    # alter the labels (a shallow copy of the outer list would share chunks).
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

In [10]:
# Slice the concatenated sample built earlier into chunk_size-long pieces.
# Unlike group_texts, the tail is kept here, so the last chunk may be shorter.
chunk_starts = range(0, total_length, chunk_size)
chunks = {
    k: [t[start : start + chunk_size] for start in chunk_starts]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 21'


In [11]:
# Apply group_texts batch-wise to every split: rows are no longer individual
# reviews but fixed-size chunks with a "labels" column added.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Loading cached processed dataset at C:\Users\radio\.cache\huggingface\datasets\amazon_reviews_multi\fr\1.0.0\724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609\cache-0bf200e513830e00.arrow
Loading cached processed dataset at C:\Users\radio\.cache\huggingface\datasets\amazon_reviews_multi\fr\1.0.0\724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609\cache-18b860f711d675cb.arrow
Loading cached processed dataset at C:\Users\radio\.cache\huggingface\datasets\amazon_reviews_multi\fr\1.0.0\724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609\cache-2bc87195af8b26e6.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59248
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1438
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1481
    })
})

With our dataset ready, it's time to fine-tune CamemBERT using a data collator from the transformers library.

In [12]:
from transformers import DataCollatorForLanguageModeling

# Masked-language-modeling collator built on our tokenizer, with
# mlm_probability set to 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

#### Showing how the random masking works

In [13]:
# Fetch the first two training chunks and drop their "word_ids" entry before
# handing them to the collator.
samples = []
for i in range(2):
    sample = lm_datasets["train"][i]
    _ = sample.pop("word_ids")
    samples.append(sample)

# Collate once (this applies the random masking) and decode each masked chunk.
masked_batch = data_collator(samples)
for chunk in masked_batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> <s> A déconseiller - Apport n'a fonctionné qu'une fois - Je ne recommande pas du tout ce produit - Je l<mask>ai<mask>...</s><s> Si<mask> voulez<mask> déçu achetez le produit! Au bout de 3 utilisation ne fonctionne plus.. Je ne<mask><mask> du tout sauf si vous voulez acheter un thermomètre quelques jours<mask> avoir acheté celui<mask> participer</s><s> Écran<mask> mauvaise qualité<mask> car il s'use<mask> peu de temps et croche. Dommage j'aimais<mask><mask> peu de couper de<mask>.</s><s> navigue engin ne sert à rien<mask> son<mask> sont pourris les songs sont simplistes vous n'

'>>> 'apprendrez sanctuaire à jouer de la batterie avec<mask> bou<mask> pareille. En fait c'est juste un jouet destiné aux enfants et rien d'autre. Si<mask> voulez vraiment quelque chose de bien et d'utile passez votre chemin et gardez votre<mask>rie rez'<mask> voulu essayer et j'ai été très mais alors très<mask>. Résultat<mask> poubelle.</s><s> Très beau produit mais la grue n'a pas fonctionné très longte

### Whole word masking

In [14]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2  # probability with which each whole word is masked


def _group_token_indices_by_word(word_ids):
    """Map consecutive word numbers (0, 1, 2, ...) to the token positions of each word."""
    mapping = collections.defaultdict(list)
    current_word_index = -1
    current_word = None
    for idx, word_id in enumerate(word_ids):
        if word_id is None:
            # A position without a word id closes the current word. Resetting
            # here keeps two neighbouring reviews apart even when the last
            # word of one and the first word of the next carry the same id
            # (word ids restart for every review inside a chunk).
            current_word = None
            continue
        if word_id != current_word:
            current_word = word_id
            current_word_index += 1
        mapping[current_word_index].append(idx)
    return mapping


def whole_word_masking_data_collator(features):
    """Collate features after masking randomly chosen whole words.

    Each word is selected independently with probability ``wwm_probability``.
    Every token of a selected word is replaced by the tokenizer's mask token
    id in "input_ids" and keeps its original id in "labels"; all other label
    positions are set to -100.

    Args:
        features: List of mappings, each with "input_ids", "labels" and
            "word_ids" entries of equal length. The inputs are not modified,
            so the same samples can be collated repeatedly.

    Returns:
        The result of ``default_data_collator`` applied to the masked
        features (without the "word_ids" entry).
    """
    masked_features = []
    for feature in features:
        # Work on a shallow copy so the caller's dict keeps its "word_ids".
        feature = dict(feature)
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = _group_token_indices_by_word(word_ids)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        # Copy the ids so masking does not write into the caller's list.
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            for idx in mapping[word_id.item()]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["input_ids"] = input_ids
        feature["labels"] = new_labels
        masked_features.append(feature)

    return default_data_collator(masked_features)

In [15]:
# Fetch the first two training chunks (this time keeping "word_ids", which the
# whole-word collator needs) and collate them into one masked batch.
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

# Decode each masked chunk to inspect which words were replaced.
for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> <s> A déconseiller - Article n'a<mask><mask><mask><mask> fois - Je ne recommande<mask><mask> tout ce produit - Je l'ai<mask>...</s><s> Si vous voulez<mask><mask> achetez le produit! Au<mask> de 3 utilisation ne<mask> plus..<mask> ne recommande<mask> du<mask> sauf<mask> vous voulez acheter un thermomètre quelques jours<mask> avoir acheté celui ci!</s><s> Écran de mauvaise<mask><mask> car il s'use<mask> peu de temps et croche. Dommage j'aimais bien car<mask> de traces de doigts.</s><s><mask> engin ne sert à rien les sons sont pourris les songs sont simplistes vous n'

'>>> 'apprendrez<mask> à jouer de la<mask> avec<mask> bouze pareille. En fait<mask><mask><mask> juste<mask> jouet destiné aux enfants et rien<mask><mask><mask><mask> Si vous<mask> vraiment quelque chose de bien<mask> d'utile passez<mask><mask> et gardez<mask> fric<mask> j'ai voulu essayer et<mask><mask><mask> été très mais alors<mask> déçu. Résultat direction poubelle.</s><s> Très<mask> produit mais la grue n'a pas<ma

Let's downsample the dataset a little in order to save our GPU.

In [16]:
# Number of chunks kept for training; the test split is 10% of that size.
train_size = 1000
test_size = int(0.1 * train_size)

# Both splits are drawn from the "train" chunks, with a fixed seed so the
# selection is reproducible.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

Loading cached split indices for dataset at C:\Users\radio\.cache\huggingface\datasets\amazon_reviews_multi\fr\1.0.0\724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609\cache-f819f868abee1173.arrow and C:\Users\radio\.cache\huggingface\datasets\amazon_reviews_multi\fr\1.0.0\724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609\cache-858df5d0a66f150c.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 100
    })
})

### Setting up the training

In [17]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch. Clamp to at least 1 so that a
# training set smaller than one batch does not yield logging_steps=0, which
# step-based logging rejects.
logging_steps = max(1, len(downsampled_dataset["train"]) // batch_size)
model_name = "camembert-base"

training_args = TrainingArguments(
    output_dir=f"./{model_name}-finetuned-amazon-reviews",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=logging_steps,
    # Keep "word_ids": the whole-word-masking collator pops it from each feature.
    remove_unused_columns=False,
)

In [None]:
from transformers import Trainer
import math


# Wire the model, the downsampled splits and the whole-word-masking collator
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=whole_word_masking_data_collator,
    tokenizer=tokenizer,
)

# Evaluate once before training to get a baseline, then fine-tune.
baseline_metrics = trainer.evaluate()
print(baseline_metrics)
trainer.train()

# Persist the fine-tuned model next to the training output directory.
final_model_dir = f"./{model_name}-finetuned-amazon-reviews-model"
trainer.save_model(final_model_dir)



# A different training approach

In [64]:
def insert_random_mask(batch):
    """Run ``data_collator`` once over a batch and return its outputs as "masked_*" columns.

    Args:
        batch: Mapping of column name to a list of per-example values, all of
            the same length.

    Returns:
        A dict that maps ``"masked_" + key`` to the NumPy conversion of each
        tensor returned by ``data_collator``.
    """
    # Turn the column-oriented batch into a list of per-example dicts.
    column_names = list(batch)
    features = []
    for row_values in zip(*batch.values()):
        features.append(dict(zip(column_names, row_values)))

    masked_inputs = data_collator(features)

    # Create a new "masked" column for each column returned by the collator.
    masked_columns = {}
    for name, tensor in masked_inputs.items():
        masked_columns["masked_" + name] = tensor.numpy()
    return masked_columns

In [65]:
# Drop "word_ids" before using `data_collator`. The guard makes the cell safe
# to re-run: without it, a second execution fails because the column has
# already been removed.
if "word_ids" in downsampled_dataset["test"].column_names:
    downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])

# Apply the random masking to the test split a single time and keep only the
# masked columns.
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
# Rename the masked columns back to the names the model expects.
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

Loading cached processed dataset at C:\Users\radio\.cache\huggingface\datasets\amazon_reviews_multi\fr\1.0.0\724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609\cache-2d20f095c5955673.arrow


In [66]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
# Training batches are shuffled and collated with `data_collator`, so the
# masking is applied when each batch is built.
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# The evaluation set was already masked by insert_random_mask, so it only
# needs the default collator.
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

#### Defining optimizer

In [67]:
from torch.optim import AdamW

# NOTE(review): `model` is the same object that was handed to the Trainer
# earlier in this notebook; if that cell was run, optimisation here continues
# from those already fine-tuned weights rather than from the base checkpoint.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [68]:
from accelerate import Accelerator

accelerator = Accelerator()
# Hand the model, optimizer and both dataloaders to the accelerator and use
# the prepared objects it returns from here on.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

#### Specifying the learning rate scheduler

In [69]:
from transformers import get_scheduler

num_train_epochs = 3
# NOTE(review): this is the same path as the Trainer's `output_dir` above, so
# both training approaches write into one directory.
output_dir = f"./{model_name}-finetuned-amazon-reviews"
# One optimizer update per batch of the prepared training dataloader.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup steps over the whole run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

## Defining the main training and evaluation loop

In [70]:
from tqdm.auto import tqdm
import torch
import math

# One progress-bar tick per optimizer update across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward().
        accelerator.backward(loss)

        # Update the weights, advance the learning-rate schedule, then clear
        # the gradients for the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # Repeat the batch loss batch_size times and gather it, so that the
        # concatenated losses can be truncated to the dataset length below.
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    # Keep one loss entry per evaluation example.
    losses = losses[: len(eval_dataset)]
    # Perplexity is exp(mean loss); report infinity if exp overflows.
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save the model and tokenizer locally after every epoch (nothing is
    # uploaded here); the same output_dir is overwritten each time.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process writes the tokenizer files.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
   


  0%|          | 0/471 [29:17<?, ?it/s]


KeyboardInterrupt: 