# 1. Activate GPU and Install Dependencies

In [1]:
# Install required libraries
!pip install datasets transformers huggingface_hub
!apt-get install git-lfs
!pip install datasets
!pip install transformers[torch]

In [2]:
import torch
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
# Report whether PyTorch can see a CUDA device (the cell displays True or False).
torch.cuda.is_available()

True

# 2. Preprocess data

In [3]:
# Read the review CSV with pandas, then wrap the frame as a Hugging Face Dataset.
train_dataset = Dataset.from_pandas(
    pd.read_csv('../data/movie_reviews/movie_reviews.csv'),
)

In [4]:
# Load the tokenizer published with the DistilBERT checkpoint
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
)

In [5]:
# Turn the raw review text into model inputs
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Run the tokenizer over the training set, one batch of examples per call
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [6]:
# Use data_collator to convert our samples to PyTorch tensors and batch them with the correct amount of padding
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 3. Training the model

In [7]:
# Base model: pretrained DistilBERT with a sequence-classification head for 2 labels
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Define the evaluation metrics
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 from a ``(logits, labels)`` pair.

    The metrics are computed directly with NumPy, so nothing is loaded or
    downloaded on each evaluation call.

    Args:
        eval_pred: A 2-item sequence ``(logits, labels)``. ``logits`` holds
            per-class scores along the last axis; ``labels`` holds the
            reference class ids.

    Returns:
        dict with ``"accuracy"`` (fraction of matching predictions) and
        ``"f1"`` (F1 score treating class ``1`` as the positive class).
        Both values are ``0.0`` when they would otherwise be undefined
        (no examples, or no positive predictions/labels at all).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Guard the empty case: np.mean of an empty array would yield NaN.
    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

    # Confusion counts with class 1 as the positive class.
    true_pos = int(np.sum((predictions == 1) & (labels == 1)))
    false_pos = int(np.sum((predictions == 1) & (labels != 1)))
    false_neg = int(np.sum((predictions != 1) & (labels == 1)))

    # F1 = 2TP / (2TP + FP + FN); defined as 0.0 when the denominator is zero.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0

    return {"accuracy": accuracy, "f1": f1}

In [14]:
# Log in to your Hugging Face account
# Get your API token here: https://huggingface.co/settings/tokens
from huggingface_hub import notebook_login

# Displays an interactive login widget in the cell output
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
from transformers import TrainingArguments, Trainer

# Run configuration
num_samples = 40000
batch_size = 8
num_train_epochs = 3

training_args = TrainingArguments(
    # num_samples is interpolated so the run/repo name stays in sync with the setting above
    output_dir=f"finetuning-sentiment-model-{num_samples}-movie-reviews-samples",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    # Write a checkpoint at the end of every epoch
    save_strategy="epoch",
    # Mixed precision is only enabled when a CUDA device is present,
    # so the notebook still runs (in full precision) on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

# NOTE(review): compute_metrics is wired in, but no eval_dataset is passed here,
# so no evaluation metrics will be produced unless one is added.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [12]:
# Train the model: runs the fine-tuning loop configured on `trainer`
# and displays the resulting TrainOutput (step count, training loss, runtime metrics)
trainer.train()

  0%|          | 0/15000 [00:00<?, ?it/s]

TrainOutput(global_step=15000, training_loss=0.18035380350748698, metrics={'train_runtime': 5228.3863, 'train_samples_per_second': 22.952, 'train_steps_per_second': 2.869, 'train_loss': 0.18035380350748698, 'epoch': 3.0})

In [16]:
# Upload the model to the Hub
# (done as an explicit call; the training arguments were created with push_to_hub=False)
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/tiennn/finetuning-sentiment-model-40000-movie-reviews-sample/commit/e913116d8cb0d7ec350bde7107cf429b13e6b458', commit_message='End of training', commit_description='', oid='e913116d8cb0d7ec350bde7107cf429b13e6b458', pr_url=None, pr_revision=None, pr_num=None)