In [6]:
# Setting .env
from dotenv import load_dotenv
import os

load_dotenv()

api_key = os.getenv("HF_TOKEN")

### What we're going to build

We're going to be bulding a `food`/`not_food` **text classification model**. 

Given a piece of a text (such as an image caption), our model will be able to predict if it's about food or not.

More specifically, we're going to follow the following steps:

1. **[Data](https://huggingface.co/datasets/mrdbourke/learn_hf_food_not_food_image_captions): Problem defintion and dataset preparation** - Getting a dataset/setting up the problem space.
2. **[Model](https://huggingface.co/mrdbourke/learn_hf_food_not_food_text_classifier-distilbert-base-uncased): Finding, training and evaluating a model** - Finding a text classification model suitable for our problem on Hugging Face and customizing it to our own dataset.
3. **[Demo](https://huggingface.co/spaces/mrdbourke/learn_hf_food_not_food_text_classifier_demo): Creating a demo and put our model into the real world** - Sharing our trained model in a way others can access and use.

By the end of this project, you'll have a trained model and [demo on Hugging Face](https://huggingface.co/spaces/mrdbourke/learn_hf_food_not_food_text_classifier_demo) you can share with others:

### Import Necessary Libraries

In [7]:
# install dependencies
try:
    import datasets, evaluate, accelerate
    import gradio as gr
except ModuleNotFoundError:
    %pip install -U datasets, evaluate, accelerate, gradio
    import datasets, evaluate, accelerate
    import gradio as gr

import random

import numpy as np
import pandas as pd

import torch
import transformers

print(f"Transformers version: {transformers.__version__}")
print(f"Datasets version: {datasets.__version__}")
print(f"Torch version: {torch.__version__}")


Transformers version: 4.44.2
Datasets version: 3.0.0
Torch version: 2.4.1+cu121


## Loading a Dataset

In [3]:
# Load the dataset from hugging face hub
dataset = datasets.load_dataset(path="mrdbourke/learn_hf_food_not_food_image_captions")

# inspect the dataset
dataset

Generating train split: 100%|██████████| 250/250 [00:00<00:00, 4518.96 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 250
    })
})

In [4]:
# what features are there
dataset.column_names

{'train': ['text', 'label']}

In [8]:
# Access the training split
dataset['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 250
})

In [9]:
dataset['train'][0]

{'text': 'Creamy cauliflower curry with garlic naan, featuring tender cauliflower in a rich sauce with cream and spices, served with garlic naan bread.',
 'label': 'food'}

### Inspect random examples from the dataset

In [10]:
import random

random_indexes = random.sample(range(len(dataset['train'])), 5)
random_samples  = dataset['train'][random_indexes]

print(f"[INFO] Random samples from dataset:\n")
for item in zip(random_samples['text'], random_samples['label']):
    print(f"Text: {item[0]} | Label: {item[1]}")

[INFO] Random samples from dataset:

Text: A bowl of mixed berries, including blueberries, raspberries, and blackberries | Label: food
Text: Set of mixing bowls perched on a shelf | Label: not_food
Text: Vintage telephone sitting on a desk | Label: not_food
Text: A colorful bowl of mixed carrots, including orange and purple. | Label: food
Text: Basketball hoop set up in a driveway | Label: not_food


In [11]:
# Get unique label values
dataset['train'].unique('label')

['food', 'not_food']

In [12]:
# Check number of each label
from collections import Counter

Counter(dataset['train']['label'])

Counter({'food': 125, 'not_food': 125})

In [14]:
# Turn our dataset into a DataFrame and get a random sample
food_not_food_df = pd.DataFrame(dataset['train'])
food_not_food_df

Unnamed: 0,text,label
0,"Creamy cauliflower curry with garlic naan, fea...",food
1,Set of books stacked on a desk,not_food
2,"Watching TV together, a family has their dog s...",not_food
3,Wooden dresser with a mirror reflecting the room,not_food
4,Lawn mower stored in a shed,not_food
...,...,...
245,Standing floor lamp providing light next to an...,not_food
246,Luxurious coconut shrimp curry on a generous p...,food
247,Barbecue grill waiting on a patio,not_food
248,"Family gathered around a dining table, laughin...",not_food


In [16]:
# Get the value counts of the label column
food_not_food_df['label'].value_counts()

label
food        125
not_food    125
Name: count, dtype: int64

## Preparing data for text classification

### Creating a mapping from labels to numbers

In [17]:
# Create mapping from id2label and label2id
id2label = {'0': 'not_food', '1' : 'food'}
label2id = {'not_food' : '0', 'food' : '1'}

print(id2label)
print(label2id)

{'0': 'not_food', '1': 'food'}
{'not_food': '0', 'food': '1'}


In [23]:
# Create mappings programmatically from dataset
id2label = {idx: label for idx, label in enumerate(dataset['train'].unique('label')[::-1])}
label2id = {label: idx for idx, label in id2label.items()}

print(f"ID to Label mapping: {id2label}")
print(f"Label to ID mapping: {label2id}")

ID to Label mapping: {0: 'not_food', 1: 'food'}
Label to ID mapping: {'not_food': 0, 'food': 1}


In [24]:
# Turn labels into 0 or 1 (e.g. 0 for "not_food", 1 for "food")
def map_labels_to_number(example):
    example['label'] = label2id[example['label']]

    return example

example_sample = {"text": "I love eating chicken.", "label": "food"}

# Test the function 
map_labels_to_number(example_sample)

{'text': 'I love eating chicken.', 'label': 1}

In [25]:
# Map our dataset labels to numbers
dataset = dataset["train"].map(map_labels_to_number)
dataset[:5]

Map: 100%|██████████| 250/250 [00:00<00:00, 17198.23 examples/s]


{'text': ['Creamy cauliflower curry with garlic naan, featuring tender cauliflower in a rich sauce with cream and spices, served with garlic naan bread.',
  'Set of books stacked on a desk',
  'Watching TV together, a family has their dog stretched out on the floor',
  'Wooden dresser with a mirror reflecting the room',
  'Lawn mower stored in a shed'],
 'label': [1, 0, 0, 0, 0]}

In [34]:
# Shuffle the dataset and view the first 5 samples (will return different results each time) 
dataset.shuffle()[:5]

{'text': ['Sweet and spicy sushi roll with ingredients like mango and jalapeno.',
  'Flat screen TV neatly mounted on a wall',
  'Set of spoons stored in a drawer',
  'Bicycle leaning casually against a wall',
  'Sushi platter featuring a rainbow of colors with salmon, tuna, and avocado.'],
 'label': [1, 0, 0, 0, 1]}