In [15]:
! pip install transformers datasets



In [16]:
# Log in to the Hugging Face Hub through the interactive notebook widget.
# Later cells push the fine-tuned model to the Hub (push_to_hub), which presumably relies on this login.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Fine-Tuning a model on Text Classification task

We use the GLUE (General Language Understanding Evaluation) benchmark for evaluation. GLUE is a widely adopted suite of tasks designed to evaluate the performance of natural language understanding models.
The GLUE benchmark is a group of nine classification tasks on sentences or pairs of sentences:
* CoLA (Corpus of Linguistic Acceptability): Determine if a sentence is grammatically correct or not.
* MNLI (Multi-Genre Natural Language Inference): Classify the relation between a sentence pair across genres.
* MRPC (Microsoft Research Paraphrase Corpus): Determine if two sentences are paraphrases of one another or not.
* QNLI (Question-answering Natural Language Inference): Determine if the answer to a question is in the second sentence or not.
* QQP (Quora Question Pairs2): Determine if two questions are semantically equivalent or not.
* RTE (Recognizing Textual Entailment): Determine if a sentence entails a given hypothesis or not.
* SST-2 (Stanford Sentiment Treebank): Determine if the sentence has a positive or negative sentiment.
* STS-B (Semantic Textual Similarity Benchmark): Determine the similarity of two sentences with a score from 1 to 5.
* WNLI (Winograd Natural Language Inference): Determine if a sentence with an anonymous pronoun and a sentence with this pronoun replaced are entailed or not.

In [17]:
# Task names this notebook can be pointed at ("mnli-mm" is listed alongside "mnli").
GLUE_TASKS = [
    "cola",
    "mnli",
    "mnli-mm",
    "mrpc",
    "qnli",
    "qqp",
    "rte",
    "sst2",
    "stsb",
    "wnli",
]

In [18]:
# Run configuration read by the cells below.
task = "cola"  # which GLUE task to fine-tune on
model_checkpoint = "distilbert-base-uncased"  # pretrained checkpoint to start from
batch_size = 16  # per-device batch size for training and evaluation

In [19]:
# Load the GLUE dataset and the matching GLUE metric for the selected task.
from transformers import AutoTokenizer
from datasets import load_dataset
import evaluate

# "mnli-mm" is loaded with the "mnli" configuration for both the dataset and the metric.
if task == "mnli-mm":
    actual_task = "mnli"
else:
    actual_task = task

dataset = load_dataset("glue", actual_task)
metric = evaluate.load("glue", actual_task)

In [20]:
# Display the loaded metric object to inspect its expected inputs and usage notes.
metric

EvaluationModule(name: "glue", module_type: "metric", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = evaluate.load('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=ref

In [21]:
# Sanity-check the metric on random predictions and labels before real evaluation.
import numpy as np

# A seeded generator keeps this check reproducible across kernel restarts.
rng = np.random.default_rng(42)
fake_preds = rng.integers(0, 2, size=(64,))
fake_labels = rng.integers(0, 2, size=(64,))
metric.compute(predictions=fake_preds, references=fake_labels)

{'matthews_correlation': np.float64(-0.26035958758213296)}

In [22]:
# The raw texts have to be preprocessed before the model can consume them; the tokenizer does that.
from transformers import AutoTokenizer

# use_fast=True -> load the fast tokenizer implemented in Rust instead of the slow one implemented in Python.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

In [23]:
# Example: tokenize one short string and look at the fields that come back.
#   input_ids      -> numerical representation of the tokens; each token is mapped to its
#                     integer ID in the model's vocabulary.
#   attention_mask -> binary mask telling the model which tokens to attend to and which to ignore.
tokenizer("Hello Beautiful")

{'input_ids': [101, 7592, 3376, 102], 'attention_mask': [1, 1, 1, 1]}

In [24]:
# Dataset column names holding the text for each task, as (first_column, second_column).
# The second entry is None for tasks that have a single sentence per example.
task_to_keys = {
    task_name: (first_column, second_column)
    for task_name, first_column, second_column in (
        ("cola", "sentence", None),
        ("mnli", "premise", "hypothesis"),
        ("mnli-mm", "premise", "hypothesis"),
        ("mrpc", "sentence1", "sentence2"),
        ("qnli", "question", "sentence"),
        ("qqp", "question1", "question2"),
        ("rte", "sentence1", "sentence2"),
        ("sst2", "sentence", None),
        ("stsb", "sentence1", "sentence2"),
        ("wnli", "sentence1", "sentence2"),
    )
}

In [26]:
# Quick check: resolve the text columns for the current task and print the first training example.
sentence1_key, sentence2_key = task_to_keys[task]
first_example = dataset["train"][0]
if sentence2_key is not None:
    print(f"Sentence 1: {first_example[sentence1_key]}")
    print(f"Sentence 2: {first_example[sentence2_key]}")
else:
    print(f"Sentence: {first_example[sentence1_key]}")

Sentence: Our friends won't buy this analysis, let alone the next one we propose.


In [27]:
# Preprocessing step: feed the text column(s) of a batch to the tokenizer.
def preprocess_function(examples):
    """Tokenize the text column(s) of ``examples`` for the current task.

    Reads the module-level ``sentence1_key`` / ``sentence2_key`` to decide which
    columns to pass: only the first column when ``sentence2_key`` is None,
    otherwise both columns as a pair.

    Args:
        examples: mapping from column name to the values to tokenize.

    Returns:
        Whatever ``tokenizer`` returns for those inputs.
    """
    text_columns = [examples[sentence1_key]]
    if sentence2_key is not None:
        text_columns.append(examples[sentence2_key])
    # truncation=True -> inputs longer than allowed are cut down to fit.
    return tokenizer(*text_columns, truncation=True)

In [28]:
# Try the preprocessing on the first five training examples and display the result.
preprocess_function(dataset['train'][:5])

{'input_ids': [[101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], [101, 2028, 2062, 18404, 2236, 3989, 1998, 1045, 1005, 1049, 3228, 2039, 1012, 102], [101, 2028, 2062, 18404, 2236, 3989, 2030, 1045, 1005, 1049, 3228, 2039, 1012, 102], [101, 1996, 2062, 2057, 2817, 16025, 1010, 1996, 13675, 16103, 2121, 2027, 2131, 1012, 102], [101, 2154, 2011, 2154, 1996, 8866, 2024, 2893, 14163, 8024, 3771, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [29]:
# Run the preprocessing over the whole dataset.
# batched=True -> map() hands preprocess_function a batch of examples per call
# instead of one example at a time.
encoded_dataset = dataset.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

# Fine-Tuning the model
Now that our data is ready, we download the pretrained model and fine-tune it. We use the `AutoModelForSequenceClassification` class for this.

In [30]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# AutoModelForSequenceClassification -> model with a task-specific head, configured for
#   sequence classification tasks (e.g. sentiment analysis, topic classification)
# TrainingArguments -> configuration class
# Trainer -> high-level training interface

# Number of output labels depends on the task name:
#   mnli* -> 3 (multi-class classification)
#   stsb  -> 1 (regression)
#   other -> 2
if task.startswith("mnli"):
    num_labels = 3
elif task == "stsb":
    num_labels = 1
else:
    num_labels = 2

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Build the TrainingArguments for this run.
# Metric used to pick the best checkpoint: "pearson" for stsb,
# "matthews_correlation" for cola, "accuracy" for everything else.
if task == "stsb":
    metric_name = "pearson"
elif task == "cola":
    metric_name = "matthews_correlation"
else:
    metric_name = "accuracy"

# Keep only the part after the last "/" of the checkpoint name.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    eval_strategy="epoch",  # evaluate at the end of each epoch
    save_strategy="epoch",  # save a checkpoint at the end of each epoch
    learning_rate=2e-5,  # standard fine-tuning learning rate
    per_device_train_batch_size=batch_size,  # training batch size per device
    per_device_eval_batch_size=batch_size,  # evaluation batch size per device
    num_train_epochs=5,  # five full passes over the training set
    weight_decay=0.01,  # helps prevent overfitting by penalizing large weights
    load_best_model_at_end=True,  # after training, reload the checkpoint with the best evaluation metric
    metric_for_best_model=metric_name,  # metric used when choosing the best model
    push_to_hub=True,  # push the model to the Hugging Face Hub
)

In [32]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into the metric for the current task.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` is indexed as a
            2-D array (one row per example).

    Returns:
        The result of ``metric.compute`` for the derived predictions and the labels.
    """
    raw_outputs, references = eval_pred
    is_regression = task == "stsb"
    # Regression (stsb) keeps the first output column; every other task takes the arg-max class.
    predictions = raw_outputs[:, 0] if is_regression else np.argmax(raw_outputs, axis=1)
    return metric.compute(predictions=predictions, references=references)

In [33]:
# Pick the evaluation split: "mnli-mm" uses the mismatched validation set, "mnli" the
# matched one, and every other task the plain "validation" split.
validation_key = "validation_mismatched" if task == "mnli-mm" else "validation_matched" if task == "mnli" else "validation"
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    # `tokenizer=` is deprecated on Trainer and emits a warning; `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [34]:
# Run fine-tuning with the Trainer configured above.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mombhandwalkar38126[0m ([33mombhandwalkar38126-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.5221,0.471926,0.446997
2,0.3464,0.4661,0.516314
3,0.2379,0.623722,0.51581
4,0.1808,0.758952,0.517114
5,0.1311,0.850893,0.530131


TrainOutput(global_step=2675, training_loss=0.27362346898729556, metrics={'train_runtime': 6524.3132, 'train_samples_per_second': 6.553, 'train_steps_per_second': 0.41, 'total_flos': 229000686898068.0, 'train_loss': 0.27362346898729556, 'epoch': 5.0})

In [35]:
# Evaluate on the eval_dataset given to the Trainer and display the resulting metrics.
trainer.evaluate()

{'eval_loss': 0.850893497467041,
 'eval_matthews_correlation': 0.5301312348234369,
 'eval_runtime': 33.2271,
 'eval_samples_per_second': 31.39,
 'eval_steps_per_second': 1.986,
 'epoch': 5.0}

In [36]:
# Upload the training results to the Hugging Face Hub.
trainer.push_to_hub()

events.out.tfevents.1746087379.a0b1a215c43d.183.1:   0%|          | 0.00/423 [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1746080800.a0b1a215c43d.183.0:   0%|          | 0.00/8.02k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/OmBhandwalkar/distilbert-base-uncased-finetuned-cola/commit/69ff7521859c77780e187f8496f498ad07508c1a', commit_message='End of training', commit_description='', oid='69ff7521859c77780e187f8496f498ad07508c1a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/OmBhandwalkar/distilbert-base-uncased-finetuned-cola', endpoint='https://huggingface.co', repo_type='model', repo_id='OmBhandwalkar/distilbert-base-uncased-finetuned-cola'), pr_revision=None, pr_num=None)

# Hyperparameter search

In [37]:
 ! pip install optuna
 ! pip install ray[tune]

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0
Collecting ray[tune]
  Downloading ray-2.45.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (19 kB)
Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardX-2.6.2.2-py2.py3

In [38]:
def model_init():
    """Return a newly loaded sequence-classification model.

    Loads ``model_checkpoint`` with ``num_labels`` output labels on every call.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_labels,
    )
    return fresh_model

In [None]:
# hyperparameter_search needs a Trainer constructed with `model_init` so that it can
# build a new model for every trial; a Trainer built around an already-instantiated
# model raises an error instead. Build a dedicated Trainer for the search.
hp_search_trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# direction="maximize": a higher objective value is treated as better.
best_run = hp_search_trainer.hyperparameter_search(n_trials=10, direction="maximize")

In [None]:
# Display the result returned by the hyperparameter search.
best_run