In [1]:
!pip install accelerate
!pip install -q transformers[torch]



In [2]:

!pip install transformers datasets huggingface_hub tensorboard==2.11
!sudo apt-get install git-lfs --yes


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.


In [3]:
!pip install datasets



In [4]:
import datasets
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login

In [17]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
model_id = "roberta-base"
# dataset_id = "sandeep12345/roberta_finetune"
# relace the value with your model: ex <hugging-face-user>/<model-name>
repository_id = "sandeep12345/roberta_finetune_model"

In [7]:
# model_id = "roberta-base"
# dataset_id = "ag_news"
# # relace the value with your model: ex <hugging-face-user>/<model-name>
# repository_id = "sandeep12345/roberta_finetune_model"

In [8]:
# Load dataset
dataset = datasets.load_dataset('data_loader.py', "AbstractText")

# Training and testing datasets
train_dataset = dataset['train']
test_dataset = dataset["test"].shard(num_shards=2, index=0)

# Validation dataset
val_dataset = dataset['test'].shard(num_shards=2, index=1)

# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

# This function tokenizes the input text using the RoBERTa tokenizer.
# It applies padding and truncation to ensure that all sequences have the same length (256 tokens).
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

Map:   0%|          | 0/684 [00:00<?, ? examples/s]

In [9]:
# Set dataset format
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [10]:
# Print dataset features
print(dataset['train'].features)

# Extract unique labels
unique_labels = set(row['label'] for row in dataset['train'])
num_labels = len(unique_labels)

print(f"Number of unique labels: {num_labels}")
print(f"Unique labels: {unique_labels}")


{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['name', 'description', 'authors'], id=None)}
Number of unique labels: 3
Unique labels: {0, 1, 2}


In [11]:
dataset['train'].features['label']

ClassLabel(names=['name', 'description', 'authors'], id=None)

In [12]:
# We will need this to directly output the class names when using the pipeline without mapping the labels later.
# Extract the number of classes and their names
num_labels = dataset['train'].features['label'].num_classes
class_names = dataset["train"].features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
id2label = {i: label for i, label in enumerate(class_names)}

# Update the model's configuration with the id2label mapping
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})

number of labels: 3
the labels: ['name', 'description', 'authors']


In [13]:
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)




Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from transformers import Trainer, TrainingArguments

# TrainingArguments
training_args = TrainingArguments(
    output_dir=repository_id,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

In [20]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [21]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0006,0.032625
2,0.1065,0.032415
3,0.0001,0.028224
4,0.0,0.030897
5,0.0,0.031695


TrainOutput(global_step=3600, training_loss=0.03456908141890987, metrics={'train_runtime': 2766.7341, 'train_samples_per_second': 10.406, 'train_steps_per_second': 1.301, 'total_flos': 7575035296389120.0, 'train_loss': 0.03456908141890987, 'epoch': 5.0})

In [22]:
# Evaluate the model
trainer.evaluate()

{'eval_loss': 0.028224343433976173,
 'eval_runtime': 18.3461,
 'eval_samples_per_second': 37.283,
 'eval_steps_per_second': 4.688,
 'epoch': 5.0}

In [23]:
# Save our tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

'https://huggingface.co/sandeep12345/roberta_finetune_model/tree/main/'

In [26]:
# TEST MODEL

from transformers import pipeline

classifier = pipeline('text-classification',repository_id)

text = "This research leverages advanced biomedical instruments to precisely control parameters influencing biofilm development, including temperature, nutrient flow, and surface characteristics. Bioreactors equipped with real-time monitoring capabilities enable the observation of biofilm growth kinetics and the dynamic interactions between microbial communities. Furthermore, the incorporation of imaging modalities, such as confocal microscopy and advanced sensors, provides detailed insights into biofilm architecture and its responses to therapeutic interventions."
result = classifier(text)

predicted_label = result[0]["label"]
print(f"Predicted label: {predicted_label}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Predicted label: description
