In [1]:
import torch
torch.cuda.is_available()

True

In [2]:
!pip install datasets transformers huggingface_hub



In [3]:
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.


In [4]:
from datasets import load_dataset
import pandas as pd

imdb = load_dataset("imdb")

In [5]:
small_train_dataset = imdb["train"].shuffle(seed=42).select([i for i in list(range(3000))])
small_test_dataset = imdb["test"].shuffle(seed=42).select([i for i in list(range(300))])

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [7]:
def preprocess_function(examples):
   return tokenizer(examples["text"], truncation=True)

tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [8]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")
   load_precision = load_metric("precision")
   load_recall = load_metric("recall")

   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   precision = load_precision.compute(predictions=predictions, references=labels)["precision"]
   recall = load_recall.compute(predictions=predictions, references=labels)["recall"]
   return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}

In [12]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
!pip install accelerate>=0.20.1 transformers

In [14]:
from transformers import TrainingArguments, Trainer

repo_name = "finetuning-sentiment-model-3000-samples"

training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/pprabu/finetuning-sentiment-model-3000-samples into local empty directory.


Download file pytorch_model.bin:   0%|          | 15.4k/255M [00:00<?, ?B/s]

Download file runs/Aug10_00-15-54_49bd4f7a2262/events.out.tfevents.1691626588.49bd4f7a2262.4285.0: 100%|######…

Download file training_args.bin: 100%|##########| 3.93k/3.93k [00:00<?, ?B/s]

Clean file runs/Aug10_00-15-54_49bd4f7a2262/events.out.tfevents.1691626588.49bd4f7a2262.4285.0:  24%|##3      …

Clean file training_args.bin:  25%|##5       | 1.00k/3.93k [00:00<?, ?B/s]

Download file runs/Aug10_00-15-54_49bd4f7a2262/events.out.tfevents.1691627393.49bd4f7a2262.4285.1: 100%|######…

Clean file runs/Aug10_00-15-54_49bd4f7a2262/events.out.tfevents.1691627393.49bd4f7a2262.4285.1: 100%|#########…

Clean file pytorch_model.bin:   0%|          | 1.00k/255M [00:00<?, ?B/s]

In [15]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=376, training_loss=0.2824876257713805, metrics={'train_runtime': 336.7422, 'train_samples_per_second': 17.818, 'train_steps_per_second': 1.117, 'total_flos': 785643443397696.0, 'train_loss': 0.2824876257713805, 'epoch': 2.0})

In [16]:
trainer.evaluate()

  load_accuracy = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

{'eval_loss': 0.3324328064918518,
 'eval_accuracy': 0.8633333333333333,
 'eval_f1': 0.8664495114006515,
 'eval_precision': 0.8471337579617835,
 'eval_recall': 0.8866666666666667,
 'eval_runtime': 8.6241,
 'eval_samples_per_second': 34.786,
 'eval_steps_per_second': 2.203,
 'epoch': 2.0}

In [19]:
trainer.push_to_hub()

To https://huggingface.co/pprabu/finetuning-sentiment-model-3000-samples
   c1ee5b5..56ba68c  main -> main

   c1ee5b5..56ba68c  main -> main

To https://huggingface.co/pprabu/finetuning-sentiment-model-3000-samples
   56ba68c..6ad146d  main -> main

   56ba68c..6ad146d  main -> main



'https://huggingface.co/pprabu/finetuning-sentiment-model-3000-samples/commit/56ba68c7a4c7110a766d90b8c609ff124df87765'

In [20]:
!pip install xformers

In [21]:
from transformers import pipeline

sentiment_model = pipeline(model="pprabu/finetuning-sentiment-model-3000-samples")
sentiment_model(["This movie was average", "This movie is worst movie ever created!!!", "It's a shitty movie and waste of my money"])


Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[{'label': 'LABEL_0', 'score': 0.7617270350456238},
 {'label': 'LABEL_0', 'score': 0.9619356393814087},
 {'label': 'LABEL_0', 'score': 0.9669193029403687}]