In [15]:
!pip install transformers datasets huggingface_hub tensorboard==2.11
!sudo apt-get install git-lfs --yes

In [45]:
import os
from getpass import getpass

import torch
from datasets import load_dataset
from huggingface_hub import HfFolder, login, notebook_login
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    Trainer,
    TrainingArguments,
)

In [59]:
# Authenticate against the Hugging Face Hub with a *write* token.
# SECURITY: never paste the token into the notebook. Anything saved or
# committed here is readable by everyone with access to the file, and a token
# exposed that way must be revoked at https://huggingface.co/settings/tokens.
# The token is taken from the HF_TOKEN environment variable when set,
# otherwise it is requested interactively without being echoed.
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

login(token=HUGGINGFACE_TOKEN)

In [71]:
# Hub identifiers of the dataset and of the base checkpoint to fine-tune.
dataset_id = "ag_news"
model_id = "roberta-base"
# Hub repository that receives the fine-tuned model. Replace the value with
# your own repository, e.g. <hugging-face-user>/<model-name>.
repository_id = "Pradeep18103/roberta-base_ag_news"

In [6]:
# Load dataset
# Loads every split of the dataset named by `dataset_id` via `load_dataset`.
dataset = load_dataset(dataset_id)

README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [7]:
# Display the loaded dataset object to inspect its splits, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [8]:
# The whole original train split is used for fine-tuning.
train_dataset = dataset["train"]

# The original test split is cut into two shards:
# shard 0 is kept for testing, shard 1 serves as the validation set.
test_dataset, val_dataset = (
    dataset["test"].shard(num_shards=2, index=shard_idx) for shard_idx in range(2)
)

In [9]:
# Show the features and size of each split, one per line.
print(train_dataset, test_dataset, val_dataset, sep="\n")

Dataset({
    features: ['text', 'label'],
    num_rows: 120000
})
Dataset({
    features: ['text', 'label'],
    num_rows: 3800
})
Dataset({
    features: ['text', 'label'],
    num_rows: 3800
})


In [10]:
# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

# Number of tokens every example is padded or truncated to.
MAX_LENGTH = 256


def tokenize(batch):
    """Tokenize a batch of examples with the RoBERTa tokenizer.

    Args:
        batch: Mapping whose "text" entry holds the raw strings of the batch.

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to exactly MAX_LENGTH tokens.
    """
    return tokenizer(
        batch["text"], padding="max_length", truncation=True, max_length=MAX_LENGTH
    )


# padding="max_length" makes the sequence length independent of which examples
# share a batch, so the default map batch size can be used instead of loading
# an entire split into memory as one batch just to get a uniform length.
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3800 [00:00<?, ? examples/s]

Map:   0%|          | 0/3800 [00:00<?, ? examples/s]

In [11]:
# Show each split again to confirm the tokenizer columns were added.
print(train_dataset, test_dataset, val_dataset, sep="\n")

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 120000
})
Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3800
})
Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3800
})


In [12]:
# Return torch tensors and expose only the model inputs plus the label
# for every split (applied in place on each dataset).
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [13]:
# Store the class names in the model configuration so that predictions can be
# reported with readable labels (e.g. through a pipeline) without a manual
# mapping step afterwards.
label_feature = dataset["train"].features["label"]
num_labels = label_feature.num_classes
class_names = label_feature.names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Build both directions of the mapping. Setting only id2label would leave the
# default label2id (generic LABEL_* names) in the saved config, inconsistent
# with id2label.
id2label = dict(enumerate(class_names))
label2id = {label: idx for idx, label in id2label.items()}

# Load the base configuration with the classification head sized for this
# dataset and both label mappings applied together.
config = AutoConfig.from_pretrained(
    model_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [16]:
# Model: base checkpoint with a sequence-classification head built from `config`.
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written locally.
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Log every 10 steps to TensorBoard.
    logging_strategy="steps",
    logging_steps=10,
    report_to="tensorboard",
    # Hub upload settings.
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Trainer: trains on the train split and evaluates on the validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Fine-tune the model
# Runs the training loop of `trainer`; the value returned by train() is shown
# as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1205,0.330861
2,0.1735,0.28446


TrainOutput(global_step=30000, training_loss=0.3514421947369973, metrics={'train_runtime': 6561.0559, 'train_samples_per_second': 36.579, 'train_steps_per_second': 4.572, 'total_flos': 3.157389361152e+16, 'train_loss': 0.3514421947369973, 'epoch': 2.0})

In [63]:
# Evaluate the model
# No dataset is passed here, so this uses the `eval_dataset` given to the
# Trainer (`val_dataset`), not the held-out test shard.
trainer.evaluate()

{'eval_loss': 0.28446003794670105,
 'eval_runtime': 26.6967,
 'eval_samples_per_second': 142.34,
 'eval_steps_per_second': 17.792,
 'epoch': 2.0}

In [None]:
# Save our tokenizer and create a model card
# The tokenizer is written into the local `repository_id` directory, which is
# also the Trainer's output_dir, so that it sits next to the trained model files.
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

In [73]:
# Load model directly
# Reload the fine-tuned tokenizer and model from the Hub repository named by
# `repository_id`, rather than repeating the repository name as a literal.
# Note that this rebinds `tokenizer` and `model` to the reloaded objects.
tokenizer = AutoTokenizer.from_pretrained(repository_id)
model = AutoModelForSequenceClassification.from_pretrained(repository_id)

In [74]:
# Display the reloaded tokenizer to check its class and settings.
tokenizer

RobertaTokenizerFast(name_or_path='Pradeep18103/roberta-base_ag_news', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}
)

In [75]:
# Display the reloaded model's module structure.
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [76]:
# Load test dataset again
# The earlier `test_dataset` was sharded and tokenized for the training run,
# so a fresh, untokenized copy of the test split is loaded here.
# `dataset_id` and `load_dataset` come from the configuration and import cells.
# NOTE(review): this is the full test split, which includes the shard used as
# `val_dataset` during training -- confirm that this overlap is intended.
dataset = load_dataset(dataset_id)
test_dataset = dataset["test"]

# Tokenize test dataset
def tokenize_function(examples, max_length=256):
    """Tokenize a batch of test examples to a fixed sequence length.

    Args:
        examples: Mapping whose "text" entry holds the raw strings of the batch.
        max_length: Number of tokens each sequence is padded or truncated to.
            Defaults to 256, the truncation length used when the training
            data was tokenized, so inference sees inputs shaped like training.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [77]:
# Apply `tokenize_function` to the test split in batches; the result keeps the
# original columns and adds the tokenizer outputs.
test_encodings = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [78]:
# Materialise the model inputs and the true labels as PyTorch tensors.
test_input_ids, test_attention_mask, test_labels = (
    torch.tensor(test_encodings[column])
    for column in ("input_ids", "attention_mask", "label")
)

In [None]:
# Run inference
# The test set is processed in mini-batches: pushing every example through the
# model in one forward pass needs far more memory than is practical.
INFERENCE_BATCH_SIZE = 64

model.eval()
# Use whichever device the model already lives on; the model is not moved.
device = next(model.parameters()).device

batch_logits = []
with torch.no_grad():
    for start in range(0, test_input_ids.size(0), INFERENCE_BATCH_SIZE):
        end = start + INFERENCE_BATCH_SIZE
        outputs = model(
            input_ids=test_input_ids[start:end].to(device),
            attention_mask=test_attention_mask[start:end].to(device),
        )
        # Keep only the small logits tensor, on CPU, to free device memory.
        batch_logits.append(outputs.logits.cpu())

# One row of logits per test example; the predicted class is the arg-max.
logits = torch.cat(batch_logits, dim=0)
predictions = torch.argmax(logits, dim=1)
