# Fine tuning LLMs

In [1]:
from datasets import load_dataset

dataset = load_dataset("imdb")
dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [2]:
dataset["train"][100]


{'text': "Terrible movie. Nuff Said.<br /><br />These Lines are Just Filler. The movie was bad. Why I have to expand on that I don't know. This is already a waste of my time. I just wanted to warn others. Avoid this movie. The acting sucks and the writing is just moronic. Bad in every way. The only nice thing about the movie are Deniz Akkaya's breasts. Even that was ruined though by a terrible and unneeded rape scene. The movie is a poorly contrived and totally unbelievable piece of garbage.<br /><br />OK now I am just going to rag on IMDb for this stupid rule of 10 lines of text minimum. First I waste my time watching this offal. Then feeling compelled to warn others I create an account with IMDb only to discover that I have to write a friggen essay on the film just to express how bad I think it is. Totally unnecessary.",
 'label': 0}

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding = "max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
tokenized_datasets['train'][100]['input_ids']

[101,
 12008,
 27788,
 2523,
 119,
 151,
 9435,
 11455,
 119,
 133,
 9304,
 120,
 135,
 133,
 9304,
 120,
 135,
 1636,
 12058,
 1132,
 2066,
 17355,
 9860,
 119,
 1109,
 2523,
 1108,
 2213,
 119,
 2009,
 146,
 1138,
 1106,
 7380,
 1113,
 1115,
 146,
 1274,
 112,
 189,
 1221,
 119,
 1188,
 1110,
 1640,
 170,
 5671,
 1104,
 1139,
 1159,
 119,
 146,
 1198,
 1458,
 1106,
 11857,
 1639,
 119,
 138,
 6005,
 2386,
 1142,
 2523,
 119,
 1109,
 3176,
 22797,
 1105,
 1103,
 2269,
 1110,
 1198,
 182,
 14824,
 7770,
 119,
 6304,
 1107,
 1451,
 1236,
 119,
 1109,
 1178,
 3505,
 1645,
 1164,
 1103,
 2523,
 1132,
 14760,
 9368,
 138,
 19610,
 2315,
 112,
 188,
 13016,
 119,
 2431,
 1115,
 1108,
 9832,
 1463,
 1118,
 170,
 6434,
 1105,
 8362,
 23063,
 4902,
 9372,
 2741,
 119,
 1109,
 2523,
 1110,
 170,
 9874,
 14255,
 19091,
 5790,
 1105,
 5733,
 8362,
 26438,
 2727,
 1104,
 14946,
 119,
 133,
 9304,
 120,
 135,
 133,
 9304,
 120,
 135,
 10899,
 1208,
 146,
 1821,
 1198,
 1280,
 1106,
 26133,
 1113,
 

In [5]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(200))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(200))

In [6]:
import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [9]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", num_train_epochs = 2, evaluation_strategy="epoch")

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [11]:
trainer.train()

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.6720085144042969, 'eval_accuracy': 0.58, 'eval_runtime': 609.7916, 'eval_samples_per_second': 0.328, 'eval_steps_per_second': 0.041, 'epoch': 1.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.5366445183753967, 'eval_accuracy': 0.82, 'eval_runtime': 524.186, 'eval_samples_per_second': 0.382, 'eval_steps_per_second': 0.048, 'epoch': 2.0}
{'train_runtime': 3886.3326, 'train_samples_per_second': 0.103, 'train_steps_per_second': 0.013, 'train_loss': 0.6406719207763671, 'epoch': 2.0}


TrainOutput(global_step=50, training_loss=0.6406719207763671, metrics={'train_runtime': 3886.3326, 'train_samples_per_second': 0.103, 'train_steps_per_second': 0.013, 'train_loss': 0.6406719207763671, 'epoch': 2.0})

In [12]:
trainer.save_model('models/sentiment-classifier')

In [13]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
trainer.push_to_hub("valen/sentiment-classifier")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

'https://huggingface.co/vaalto/test_trainer/tree/main/'

In [21]:
from transformers import BertConfig, BertModel

model = AutoModelForSequenceClassification.from_pretrained('models/sentiment-classifier')

In [27]:
inputs = tokenizer("I cannot stand it anymore!", return_tensors="pt")

outputs = model(**inputs)

In [29]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.6467, -0.0041]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [28]:
import tensorflow as tf


predictions = tf.math.softmax(outputs.logits.detach(), axis=-1)
print(predictions)

tf.Tensor([[0.6571879  0.34281212]], shape=(1, 2), dtype=float32)


In [26]:
from transformers import pipeline

classifier = pipeline(task ='sentiment-analysis', model = model, tokenizer = tokenizer)
classifier('I cannot believe you did it again!')

[{'label': 'LABEL_0', 'score': 0.6545460224151611}]

## AutoTrain

In [10]:
#reducing the rows of the dataset to leverage the free autotrain

import pandas as pd

small_dataset = dataset["train"].shuffle(seed=42).select(range(3000))
small_dataset = small_dataset.to_pandas()

small_dataset['label'] = small_dataset['label'].replace({0: 'Negative', 1: 'Positive'})

small_dataset.head()

small_dataset.to_csv('imdb_small.csv')


In [16]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer


import os
from dotenv import load_dotenv

load_dotenv()

#os.environ["HUGGINGFACEHUB_API_TOKEN"]
huggingfacehub_api_token = os.environ['HUGGINGFACEHUB_API_TOKEN']

model = AutoModelForSequenceClassification.from_pretrained("vaalto/autotrain-imdb-sentiment-analysis-89579143987", token = huggingfacehub_api_token)

tokenizer = AutoTokenizer.from_pretrained("vaalto/autotrain-imdb-sentiment-analysis-89579143987", token = huggingfacehub_api_token)

inputs = tokenizer("This movie put me in a very nostalgic mood", return_tensors="pt")

outputs = model(**inputs)


Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [45]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.8675,  1.3277]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [38]:
import tensorflow as tf


predictions = tf.math.softmax(outputs.logits.detach(), axis=-1)
print(predictions)

tf.Tensor([[0.10018114 0.89981884]], shape=(1, 2), dtype=float32)


In [39]:
model.config.id2label

{0: 'Negative', 1: 'Positive'}

In [44]:
from transformers import pipeline

classifier = pipeline(task ='sentiment-analysis', model = model, tokenizer = tokenizer)
classifier('I cannot believe you did it again!')

[{'label': 'Negative', 'score': 0.6326483488082886}]