In [19]:
# Import libraries
import pandas as pd
import numpy as np
import os

from datasets import load_dataset, concatenate_datasets

import transformers, torch, accelerate, evaluate
from huggingface_hub import notebook_login

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer


In [2]:
# Load dataset
ds = load_dataset('imdb')

In [6]:
# Copy access token to clipboard
HFToken = os.getenv('HuggingFaceAccessToken')
pd.DataFrame([HFToken]).to_clipboard(index=False,header=False)
print('Hugging face access token has been copied to clipboard')

Hugging face access token has been copied to clipboard


In [4]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Distilbert tokenizer and model
Tokenize

In [7]:
# use the default preprocessor
# important to ensure expected input to our model (i.e. same lemmatization modelling, stopwords, etc)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    # Map function
    # padding and truncation control for variable length sequences
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# apply to all datasets with .map(). Built in function of the HF datasets class
tokenized_datasets = ds.map(tokenize_function, batched=True)



Model

In [8]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred # raw outputs, actual labels
    predictions = np.argmax(logits, axis=-1) #prediction is the highest output probability
    return metric.compute(predictions=predictions, references=labels) # accuracy computation

Trainer

In [13]:
train_ds = tokenized_datasets['train']
eval_ds = tokenized_datasets['test']

In [14]:
training_args = TrainingArguments(
    output_dir='..//models//fineTunedModel',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/1563 [00:00<?, ?it/s]

{'loss': 0.2901, 'grad_norm': 15.448760986328125, 'learning_rate': 1.3602047344849649e-05, 'epoch': 0.32}
{'loss': 0.2386, 'grad_norm': 8.095005989074707, 'learning_rate': 7.204094689699297e-06, 'epoch': 0.64}
{'loss': 0.2109, 'grad_norm': 11.429597854614258, 'learning_rate': 8.061420345489445e-07, 'epoch': 0.96}
{'train_runtime': 1145.7401, 'train_samples_per_second': 21.82, 'train_steps_per_second': 1.364, 'train_loss': 0.24505678629615868, 'epoch': 1.0}


TrainOutput(global_step=1563, training_loss=0.24505678629615868, metrics={'train_runtime': 1145.7401, 'train_samples_per_second': 21.82, 'train_steps_per_second': 1.364, 'total_flos': 3311684966400000.0, 'train_loss': 0.24505678629615868, 'epoch': 1.0})

In [15]:
trainer.evaluate()

  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.19233736395835876,
 'eval_f1': 0.9285971727203556,
 'eval_runtime': 401.6952,
 'eval_samples_per_second': 62.236,
 'eval_steps_per_second': 3.891,
 'epoch': 1.0}

# Save model

In [16]:
trainer.save_model('..//models//fineTunedModel')
tokenizer.save_pretrained('..//models//fineTunedModel')

('..//models//fineTunedModel\\tokenizer_config.json',
 '..//models//fineTunedModel\\special_tokens_map.json',
 '..//models//fineTunedModel\\vocab.txt',
 '..//models//fineTunedModel\\added_tokens.json',
 '..//models//fineTunedModel\\tokenizer.json')

# Optimizating Model Hyperparameters

Using a sample of 2500 entries (1250 positive and 1250 negative) to test hyperparameters. This allows for quicker experimentation and training within a reasonable timeframe.

In [47]:
negativeReviewsTrain = tokenized_datasets['train'].filter(lambda example: example['label'] == 0)
positiveReviewsTrain = tokenized_datasets['train'].filter(lambda example: example['label'] == 1)

negativeReviewsTest = tokenized_datasets['test'].filter(lambda example: example['label'] == 0)
positiveReviewsTest = tokenized_datasets['test'].filter(lambda example: example['label'] == 1)

In [48]:
train_ds = concatenate_datasets([
        negativeReviewsTrain.shuffle(seed=42).select(range(1250)), 
        positiveReviewsTrain.shuffle(seed=42).select(range(1250))]).shuffle(seed=42)

eval_ds = concatenate_datasets([
        negativeReviewsTest.shuffle(seed=42).select(range(1250)), 
        positiveReviewsTest.shuffle(seed=42).select(range(1250))]).shuffle(seed=42)

Running and tuning training parameters

In [49]:
repo_name = "imdbSentimentAnalysis"

training_args = TrainingArguments(
   learning_rate=1e-3,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=1,
   weight_decay=0.01,
   warmup_steps=250,
   output_dir=repo_name,
   push_to_hub=True,
)

In [50]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.evaluate()

  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.18454618752002716,
 'eval_model_preparation_time': 0.002,
 'eval_f1': 0.9326961369972122,
 'eval_runtime': 34.715,
 'eval_samples_per_second': 72.015,
 'eval_steps_per_second': 4.523}

In [23]:
repo_name = "imdbSentimentAnalysis"

training_args = TrainingArguments(
   learning_rate=2e-5, # Tried 1e-5, 3e-5, 5e-5
   per_device_train_batch_size=16, 
   per_device_eval_batch_size=16,
   num_train_epochs=1, # Tried 2, 3
   weight_decay=0.01, # Tried 0.1, 0.005, 0.001
   warmup_steps=300, # Tried 250, 300, 350
   output_dir=repo_name,
   push_to_hub=True,
)

Running full model again with adjusted parameters

In [53]:
train_ds = tokenized_datasets['train']
eval_ds = tokenized_datasets['test']

In [55]:
repo_name = "imdbSentimentAnalysis"

training_args = TrainingArguments(
   learning_rate=1e-4,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=1,
   weight_decay=0.01,
   warmup_steps=2000,
   output_dir=repo_name,
   push_to_hub=True,
)

In [56]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.evaluate()

  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.19233736395835876,
 'eval_model_preparation_time': 0.002,
 'eval_f1': 0.9285971727203556,
 'eval_runtime': 359.7792,
 'eval_samples_per_second': 69.487,
 'eval_steps_per_second': 4.344}

Very little improvement with the additional training arguments and adjustment to thel earning rate.

# Save model

In [60]:
trainer.save_model('..//models//fineTunedModelOptimized')
tokenizer.save_pretrained('..//models//fineTunedModelOptimized')

No files have been modified since last commit. Skipping to prevent empty commit.


('..//models//fineTunedModelOptimized\\tokenizer_config.json',
 '..//models//fineTunedModelOptimized\\special_tokens_map.json',
 '..//models//fineTunedModelOptimized\\vocab.txt',
 '..//models//fineTunedModelOptimized\\added_tokens.json',
 '..//models//fineTunedModelOptimized\\tokenizer.json')

# Pushing model

In [57]:
trainer.push_to_hub()

Upload 17 LFS files:   0%|          | 0/17 [00:00<?, ?it/s]

events.out.tfevents.1725609361.DESKTOP-IJEVO1K.17432.6:   0%|          | 0.00/403 [00:00<?, ?B/s]

events.out.tfevents.1725609007.DESKTOP-IJEVO1K.17432.3:   0%|          | 0.00/403 [00:00<?, ?B/s]

events.out.tfevents.1725609147.DESKTOP-IJEVO1K.17432.4:   0%|          | 0.00/403 [00:00<?, ?B/s]

events.out.tfevents.1725609255.DESKTOP-IJEVO1K.17432.5:   0%|          | 0.00/403 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

events.out.tfevents.1725609409.DESKTOP-IJEVO1K.17432.7:   0%|          | 0.00/403 [00:00<?, ?B/s]

events.out.tfevents.1725609469.DESKTOP-IJEVO1K.17432.8:   0%|          | 0.00/403 [00:00<?, ?B/s]

events.out.tfevents.1725609525.DESKTOP-IJEVO1K.17432.9:   0%|          | 0.00/403 [00:00<?, ?B/s]

events.out.tfevents.1725609615.DESKTOP-IJEVO1K.17432.10:   0%|          | 0.00/403 [00:00<?, ?B/s]

events.out.tfevents.1725609660.DESKTOP-IJEVO1K.17432.11:   0%|          | 0.00/403 [00:00<?, ?B/s]

events.out.tfevents.1725609779.DESKTOP-IJEVO1K.17432.12:   0%|          | 0.00/403 [00:00<?, ?B/s]

events.out.tfevents.1725609922.DESKTOP-IJEVO1K.17432.14:   0%|          | 0.00/403 [00:00<?, ?B/s]

events.out.tfevents.1725609821.DESKTOP-IJEVO1K.17432.13:   0%|          | 0.00/403 [00:00<?, ?B/s]

events.out.tfevents.1725610214.DESKTOP-IJEVO1K.17432.15:   0%|          | 0.00/403 [00:00<?, ?B/s]

events.out.tfevents.1725610281.DESKTOP-IJEVO1K.17432.16:   0%|          | 0.00/403 [00:00<?, ?B/s]

events.out.tfevents.1725610789.DESKTOP-IJEVO1K.17432.17:   0%|          | 0.00/403 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ktang5/imdbSentimentAnalysis/commit/e3a7ad26e6012b02a45e631029795a25eb26246a', commit_message='End of training', commit_description='', oid='e3a7ad26e6012b02a45e631029795a25eb26246a', pr_url=None, pr_revision=None, pr_num=None)

### Using Model on HuggingFace hub using pipeline module

In [59]:
# predict on new text with uploaded model on huggingface
from transformers import pipeline

data = ["This movie is good", "This is not good, very bad"] 
my_model = pipeline(model="ktang5/imdbSentimentAnalysis")
my_model(data)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'LABEL_1', 'score': 0.9833689332008362},
 {'label': 'LABEL_0', 'score': 0.9626513123512268}]