In [1]:
# Setting .env
from dotenv import load_dotenv
import os

# Pull variables defined in a local .env file into the process environment
load_dotenv()

# Hugging Face access token; None when HF_TOKEN is not set
api_key = os.environ.get("HF_TOKEN")

### What we're going to build

We're going to be building a `food`/`not_food` **text classification model**.

Given a piece of text (such as an image caption), our model will be able to predict whether it's about food or not.

More specifically, we're going to work through the following steps:

1. **[Data](https://huggingface.co/datasets/mrdbourke/learn_hf_food_not_food_image_captions): Problem definition and dataset preparation** - Getting a dataset/setting up the problem space.
2. **[Model](https://huggingface.co/mrdbourke/learn_hf_food_not_food_text_classifier-distilbert-base-uncased): Finding, training and evaluating a model** - Finding a text classification model suitable for our problem on Hugging Face and customizing it to our own dataset.
3. **[Demo](https://huggingface.co/spaces/mrdbourke/learn_hf_food_not_food_text_classifier_demo): Creating a demo and putting our model into the real world** - Sharing our trained model in a way others can access and use.

By the end of this project, you'll have a trained model and [demo on Hugging Face](https://huggingface.co/spaces/mrdbourke/learn_hf_food_not_food_text_classifier_demo) you can share with others:

### Import Necessary Libraries

In [2]:
# install dependencies
try:
    import datasets, evaluate, accelerate
    import gradio as gr
except ModuleNotFoundError:
    %pip install -U datasets, evaluate, accelerate, gradio
    import datasets, evaluate, accelerate
    import gradio as gr

import random

import numpy as np
import pandas as pd

import torch
import transformers

# Record the library versions this notebook was run with (useful when reproducing results)
print(f"Transformers version: {transformers.__version__}")
print(f"Datasets version: {datasets.__version__}")
print(f"Torch version: {torch.__version__}")


  from .autonotebook import tqdm as notebook_tqdm
2024-11-16 15:44:32.817934: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-16 15:44:32.844732: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-16 15:44:32.844764: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-16 15:44:32.845550: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-16 15:44:32.8

Transformers version: 4.44.2
Datasets version: 3.0.0
Torch version: 2.4.1+cu121


## Loading a Dataset

In [3]:
# Fetch the food/not_food image-caption dataset from the Hugging Face Hub
dataset = datasets.load_dataset("mrdbourke/learn_hf_food_not_food_image_captions")

# Display the DatasetDict to see its splits, features and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 250
    })
})

In [4]:
# List the column (feature) names available in each split
dataset.column_names

{'train': ['text', 'label']}

In [5]:
# Access the training split (the only split in the loaded DatasetDict)
dataset['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 250
})

In [6]:
# Look at the first example of the training split (a dict with 'text' and 'label')
dataset['train'][0]

{'text': 'Creamy cauliflower curry with garlic naan, featuring tender cauliflower in a rich sauce with cream and spices, served with garlic naan bread.',
 'label': 'food'}

### Inspect random examples from the dataset

In [7]:
import random

# Draw 5 distinct row positions, then fetch all of those rows with one indexing call
random_indexes = random.sample(range(len(dataset['train'])), k=5)
random_samples = dataset['train'][random_indexes]

print(f"[INFO] Random samples from dataset:\n")
# Indexing with a list returns columns, so pair each text with its label
for sample_text, sample_label in zip(random_samples['text'], random_samples['label']):
    print(f"Text: {sample_text} | Label: {sample_label}")

[INFO] Random samples from dataset:

Text: Rich and spicy lamb rogan josh with yogurt garnish, featuring tender lamb pieces in a bold sauce with spices, finished with creamy yogurt. | Label: food
Text: Black and white checkered kitchen floor adding a classic touch | Label: not_food
Text: Microscope set up on a table | Label: not_food
Text: Walking in the park, a man jogs with his energetic dog | Label: not_food
Text: Parsnips in a bowl, sprinkled with thyme and served with a side of honey for a tasty, unique snack. | Label: food


In [8]:
# Get the distinct values found in the 'label' column
dataset['train'].unique('label')

['food', 'not_food']

In [9]:
# Count how many examples carry each label (checks whether the classes are balanced)
from collections import Counter

Counter(dataset['train']['label'])

Counter({'food': 125, 'not_food': 125})

In [10]:
# Turn the training split into a pandas DataFrame and display it
# (this shows the frame itself, not a random sample; pandas abbreviates the middle rows)
food_not_food_df = pd.DataFrame(dataset['train'])
food_not_food_df

Unnamed: 0,text,label
0,"Creamy cauliflower curry with garlic naan, fea...",food
1,Set of books stacked on a desk,not_food
2,"Watching TV together, a family has their dog s...",not_food
3,Wooden dresser with a mirror reflecting the room,not_food
4,Lawn mower stored in a shed,not_food
...,...,...
245,Standing floor lamp providing light next to an...,not_food
246,Luxurious coconut shrimp curry on a generous p...,food
247,Barbecue grill waiting on a patio,not_food
248,"Family gathered around a dining table, laughin...",not_food


In [11]:
# Get the number of rows per value of the label column
food_not_food_df['label'].value_counts()

label
food        125
not_food    125
Name: count, dtype: int64

## Preparing data for text classification

### Creating a mapping from labels to numbers

In [12]:
# Create mapping from id2label and label2id by hand
# NOTE(review): the ids here are strings ('0', '1'); the programmatic mapping built
# in the following cell uses integer ids and replaces these two variables.
id2label = {'0': 'not_food', '1' : 'food'}
label2id = {'not_food' : '0', 'food' : '1'}

print(id2label)
print(label2id)

ERROR! Session/line number was not unique in database. History logging moved to new session 73
{'0': 'not_food', '1': 'food'}
{'not_food': '0', 'food': '1'}


In [13]:
# Build id -> label by numbering the unique labels in reverse order, then invert it.
# unique('label') returns ['food', 'not_food'], so reversing gives 'not_food' id 0.
id2label = dict(enumerate(reversed(dataset['train'].unique('label'))))
label2id = dict(zip(id2label.values(), id2label.keys()))

print(f"ID to Label mapping: {id2label}")
print(f"Label to ID mapping: {label2id}")

ID to Label mapping: {0: 'not_food', 1: 'food'}
Label to ID mapping: {'not_food': 0, 'food': 1}


In [14]:
# Convert string labels to integer ids (e.g. "not_food" -> 0, "food" -> 1)
def map_labels_to_number(example):
    """Replace the example's string label with its integer id from `label2id`.

    The example is modified in place and is also returned.
    """
    label_name = example['label']
    example['label'] = label2id[label_name]
    return example

# Hand-written example with the same keys as a dataset row
example_sample = {"text": "I love eating chicken.", "label": "food"}

# Test the function on the example (note: it overwrites example_sample['label'] in place)
map_labels_to_number(example_sample)

{'text': 'I love eating chicken.', 'label': 1}

In [15]:
# Map our dataset labels to numbers
# NOTE(review): this rebinds `dataset` from the DatasetDict to the mapped "train" split,
# so re-running this cell without reloading the dataset first will likely fail.
dataset = dataset["train"].map(map_labels_to_number)
# Peek at the first 5 rows to confirm the labels are now integers
dataset[:5]

{'text': ['Creamy cauliflower curry with garlic naan, featuring tender cauliflower in a rich sauce with cream and spices, served with garlic naan bread.',
  'Set of books stacked on a desk',
  'Watching TV together, a family has their dog stretched out on the floor',
  'Wooden dresser with a mirror reflecting the room',
  'Lawn mower stored in a shed'],
 'label': [1, 0, 0, 0, 0]}

In [16]:
# Shuffle the dataset and view the first 5 samples.
# No seed is passed, so the result differs on every run; the shuffled copy is only displayed, not stored.
dataset.shuffle()[:5]

{'text': ['A close-up shot of a big orange pumpkin with a face cut out of the side for Halloween.',
  'Set of binoculars placed on a table',
  'Bowl of sashimi with thin slices of raw fish.',
  'Colorful area rug brightening up a living room',
  'Yellow squash in a bowl, sprinkled with oregano and served with a side of pesto sauce for a tasty, flavorful dish.'],
 'label': [1, 0, 1, 0, 1]}

### Split the dataset into training and test sets

In [17]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split the same across runs.
# `dataset` becomes a DatasetDict with 'train' and 'test' splits.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50
    })
})

In [18]:
# Pick one random row from each split.
# random.randrange(n) yields 0..n-1. The previous random.randint(0, n) could also
# return n, which is one past the last valid row index.
random_idx_train = random.randrange(len(dataset['train']))
random_sample_train = dataset['train'][random_idx_train]

random_idx_test = random.randrange(len(dataset['test']))
random_sample_test = dataset['test'][random_idx_test]

# Show each sample's text along with its numeric label and the label's name
print(f"[INFO] Random sample from training dataset:")
print(f"Text: {random_sample_train['text']}\nLabel: {random_sample_train['label']} ({id2label[random_sample_train['label']]})\n")
print(f"[INFO] Random sample from testing dataset:")
print(f"Text: {random_sample_test['text']}\nLabel: {random_sample_test['label']} ({id2label[random_sample_test['label']]})")

[INFO] Random sample from training dataset:
Text: Fusion sushi roll with ingredients like cream cheese or teriyaki sauce.
Label: 1 (food)

[INFO] Random sample from testing dataset:
Text: A bowl of sliced kiwi with a sprinkle of sugar and a side of yogurt
Label: 1 (food)


### Tokenizing text data

In [19]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the DistilBERT checkpoint, requesting the fast implementation
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", use_fast=True)

# Display the tokenizer's configuration
tokenizer



DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [20]:
# Test out the tokenizer on a short sentence; it returns input_ids and an attention_mask
tokenizer("I love pizza")

{'input_ids': [101, 1045, 2293, 10733, 102], 'attention_mask': [1, 1, 1, 1, 1]}

In [21]:
# Try a second phrase to compare the returned ids with the previous example
tokenizer("Machine learning")

{'input_ids': [101, 3698, 4083, 102], 'attention_mask': [1, 1, 1, 1]}

In [22]:
# Vocabulary size, and the maximum sequence length the tokenizer reports it can handle
length_of_vocab = len(tokenizer.vocab)
max_tokenizer_input_seq = tokenizer.model_max_length

print(f"Length of vocabulary is {length_of_vocab}")
print(f"Length of max tokenizer input sequence: {max_tokenizer_input_seq}")

Length of vocabulary is 30522
Length of max tokenizer input sequence: 512


In [23]:
# Look up the id of a word that exists in the vocabulary
tokenizer.vocab['chicken']

7975

In [24]:
# Gets error because this word is not in the vocab
# tokenizer.vocab['shivaji']

When the tokenizer is called on a word that is not in its vocabulary, it automatically splits the word into word pieces (subwords).

In [25]:
# Tokenize a word that is missing from the vocab, then convert the resulting ids back
# to tokens with tokenizer.convert_ids_to_tokens(input_ids) to see its word pieces.
tokenizer.convert_ids_to_tokens(tokenizer('shivaji').input_ids)

['[CLS]', 'shiva', '##ji', '[SEP]']

In [26]:
# Try to tokenize an emoji and convert the ids back to tokens to see how it is represented
tokenizer.convert_ids_to_tokens(tokenizer("🏏").input_ids)

['[CLS]', '[UNK]', '[SEP]']

Since the tokenizer.vocab is a Python dictionary, we can get a sample of the vocabulary using tokenizer.vocab.items().

In [27]:
# Get the first 5 (token, id) pairs of the tokenizer vocab after sorting by token
sorted(tokenizer.vocab.items())[:5]

[('!', 999), ('"', 1000), ('#', 1001), ('##!', 29612), ('##"', 29613)]

In [28]:
# Draw 5 random (token, id) pairs from the vocab; sorted() turns the items view into a list for sampling
import random
random.sample(sorted(tokenizer.vocab.items()), k=5)

[('masterpiece', 17743),
 ('gideon', 12137),
 ('tinged', 22683),
 ('handler', 28213),
 ('[unused927]', 932)]

### Making a preprocessing function to tokenize text

In [29]:
def tokenize_text(examples):
    """Tokenize the 'text' field of `examples` with the notebook's tokenizer.

    `examples` is a mapping with a 'text' entry; padding and truncation are
    both enabled in the tokenizer call.
    """
    texts = examples['text']
    return tokenizer(texts, padding=True, truncation=True)

In [30]:
# Hand-written example with the same keys as a dataset row
example_sample_2 = {'text':"I love chicken", "label":1}

# Only the 'text' field is read by tokenize_text; the label is ignored
tokenize_text(example_sample_2)

{'input_ids': [101, 1045, 2293, 7975, 102], 'attention_mask': [1, 1, 1, 1, 1]}

In [31]:
# Tokenize every split in batches (batch_size=1000), adding the tokenizer outputs as new columns
tokenized_dataset = dataset.map(tokenize_text, batched=True, batch_size=1000)

tokenized_dataset

Map: 100%|██████████| 200/200 [00:00<00:00, 12637.06 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 7586.28 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50
    })
})

In [32]:
# Take the first row of each tokenized split
train_tokenized_sample = tokenized_dataset['train'][0]
test_tokenized_sample = tokenized_dataset["test"][0]

# Walk the train row's fields and print the matching test field next to each one
for key, train_value in train_tokenized_sample.items():
    print(f"[INFO] Key: {key}")
    print(f"Train sample: {train_value}")
    print(f"Test sample: {test_tokenized_sample[key]}")
    print("")

[INFO] Key: text
Train sample: Set of headphones placed on a desk
Test sample: A slice of pepperoni pizza with a layer of melted cheese

[INFO] Key: label
Train sample: 0
Test sample: 1

[INFO] Key: input_ids
Train sample: [101, 2275, 1997, 2132, 19093, 2872, 2006, 1037, 4624, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Test sample: [101, 1037, 14704, 1997, 11565, 10698, 10733, 2007, 1037, 6741, 1997, 12501, 8808, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

[INFO] Key: attention_mask
Train sample: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Test sample: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]



## Setting up an evaluation metric

In [33]:
import evaluate
import numpy as np
from typing import Tuple

# Load the accuracy metric from the `evaluate` library
accuracy_metric = evaluate.load('accuracy')

def compute_accuracy(predictions_and_labels: Tuple[np.ndarray, np.ndarray]) -> dict:
    """Compute accuracy for a (predictions, labels) pair.

    Args:
        predictions_and_labels: Tuple of (predictions, labels). Predictions may be
            1-D class ids, or an array with 2 or more dimensions, in which case
            they are reduced to class ids with argmax over axis 1.

    Returns:
        The result of `accuracy_metric.compute`, e.g. {'accuracy': 0.9}.
    """
    predictions, labels = predictions_and_labels

    # Arrays with 2+ dimensions are treated as per-class scores and reduced to class ids
    if predictions.ndim >= 2:
        predictions = np.argmax(predictions, axis=1)

    return accuracy_metric.compute(predictions=predictions, references=labels)

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 10.7MB/s]


In [35]:
# Ten ground-truth labels, all class 0
example_labels = np.zeros(10, dtype=int)
# Predictions that match every label
example_predictions_all_correct = example_labels.copy()
# Predictions with a single mistake at position 4
example_predictions_one_wrong = example_labels.copy()
example_predictions_one_wrong[4] = 1

# Test the function
print(f"Accuracy when all predictions are correct: {compute_accuracy((example_predictions_all_correct, example_labels))}")
print(f"Accuracy when one prediction is wrong: {compute_accuracy((example_predictions_one_wrong, example_labels))}")

Accuracy when all predictions are correct: {'accuracy': 1.0}
Accuracy when one prediction is wrong: {'accuracy': 0.9}


## Setting up a model for training

In [36]:
# Re-print the id <-> label mappings before they are handed to the model
print(f"id2label: {id2label}")
print(f"label2id: {label2id}")

id2label: {0: 'not_food', 1: 'food'}
label2id: {'not_food': 0, 'food': 1}


In [37]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained DistilBERT body with a new 2-class classification head.
# Both directions of the label mapping are passed so that the model config's
# id2label and label2id agree with each other (previously only id2label was set).
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Try and make a prediction with the loaded model (this will error)
# model(**tokenized_dataset['train'][0])

TypeError: DistilBertForSequenceClassification.forward() got an unexpected keyword argument 'text'

In [40]:
# Inspect the model: displaying it prints the nested layer structure
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 