In [1]:
import torch
print(torch.__version__)

2.3.0


In [2]:
# Is MPS even available? macOS 12.3+
print(torch.backends.mps.is_available())

# Was the current version of PyTorch built with MPS activated?
print(torch.backends.mps.is_built())

True
True


In [3]:
# Example for using Hugging Face pipeline built-in models
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")
data = ['I would love to work here!', "I think I'll get the job.", 'I hate this.']
sentiment_pipeline(data)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9997377991676331},
 {'label': 'POSITIVE', 'score': 0.9980835914611816},
 {'label': 'NEGATIVE', 'score': 0.9996209144592285}]

In [4]:
# Using a specific model
sentiment_pipeline = pipeline(model='finiteautomata/bertweet-base-sentiment-analysis')
sentiment_pipeline(data)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


[{'label': 'POS', 'score': 0.9915524125099182},
 {'label': 'POS', 'score': 0.955975353717804},
 {'label': 'NEG', 'score': 0.9820212125778198}]

In [5]:
hebrew_pipeline = pipeline(
    "sentiment-analysis",
    model="avichr/heBERT_sentiment_analysis",
    tokenizer="avichr/heBERT_sentiment_analysis",
    return_all_scores = True
)
hebrew_pipeline('אני לא אוהב את זה')



[[{'label': 'neutral', 'score': 0.00019010114192496985},
  {'label': 'positive', 'score': 0.00018210208509117365},
  {'label': 'negative', 'score': 0.9996278285980225}]]

In [6]:
hebrew_pipeline('! זה יהיה ממש כיף לעבוד פה')

[[{'label': 'neutral', 'score': 0.00015818208339624107},
  {'label': 'positive', 'score': 0.9997316002845764},
  {'label': 'negative', 'score': 0.00011027839354937896}]]

## Fine-tuning a model

### 1. Preprocess data

In [7]:
from datasets import load_dataset
imdb = load_dataset("imdb")

In [8]:
small_train_dataset = imdb["train"].shuffle(seed=42).select([i for i in list(range(3000))])
small_test_dataset = imdb["test"].shuffle(seed=42).select([i for i in list(range(300))])

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

In [10]:
def preprocess_function(examples):
   return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [11]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### 2. Train

In [12]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
import numpy as np
from datasets import load_metric
 
def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}

In [14]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
from transformers import TrainingArguments, Trainer
 
repo_name = "finetuning-sentiment-analysis"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)
 
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


In [16]:
trainer.train()

  0%|          | 0/376 [00:00<?, ?it/s]

In [None]:
trainer.evaluate()

  load_accuracy = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.3576453626155853,
 'eval_accuracy': 0.85,
 'eval_f1': 0.8553054662379421,
 'eval_runtime': 15.7834,
 'eval_samples_per_second': 19.007,
 'eval_steps_per_second': 1.204,
 'epoch': 2.0}

In [None]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/TamirG765/finetuning-sentiment-analysis/commit/0979c1bf1f82c4b8b47b1003653778194e9ed4e1', commit_message='End of training', commit_description='', oid='0979c1bf1f82c4b8b47b1003653778194e9ed4e1', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import pipeline

# LABEL_0 is negative, LABEL_1 is positive
sentiment_model = pipeline(model="TamirG765/finetuning-sentiment-analysis")
sentiment_model(["I love this movie", "This movie sucks!"])



config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[{'label': 'LABEL_1', 'score': 0.9752447605133057},
 {'label': 'LABEL_0', 'score': 0.9349680542945862}]