In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [2]:
import ast
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [3]:
tokenizer = AutoTokenizer.from_pretrained("edwardjross/xlm-roberta-base-finetuned-recipe-all")
model = AutoModelForTokenClassification.from_pretrained("edwardjross/xlm-roberta-base-finetuned-recipe-all")

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

Downloading (…)ncepiece.bpe.model";:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)"tokenizer.json";:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [4]:
def get_food_indices(sentence):
    # Tokenise and get the token labels
    tokens = tokenizer(sentence, return_tensors="pt").to(device)
    sentence_tokens = tokenizer.tokenize(sentence)
    output = model(**tokens).logits
    predicted_token_class_ids = output.argmax(-1)
    predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
    # Only assumtption is that "salt pepper is not an ingredient", there will be some space, special character bw 2 different ingredients.
    extractions = []
    ind = 1
    while(ind<len(predicted_tokens_classes)-1):
      if predicted_tokens_classes[ind] == "I-NAME":
        word = sentence_tokens[ind-1]
        # If there is a "▁" at the begining, replace it.
        word = word.replace("▁","")
        # print(word)
        i = ind+1
        # Then loop ahead to see for multiword. 
        while(i<(len(predicted_tokens_classes)-1) and predicted_tokens_classes[i] == "I-NAME"):
          part = sentence_tokens[i-1]
          # If begins with ▁, then needs to be appended to the previous word. 
          if(part[0] != "▁"):
            word= word+part
          # Otherwise, it is another word. 
          else:
            if word == "":
              word = part[1:]
            else:
              word = word+" "+part[1:]
          i+=1
        ind = i
        # Final ingredient formed, append it to the list. 
        extractions.append(word)
      else:
        ind+=1
    return extractions

In [10]:
sent = "2 tablespoons of butter with a pinch of salt and 1 butter lettuce"

In [11]:
get_food_indices(sent)

['butter', 'butter lett']

In [12]:
def get_ingredient(description):
    # 1. Lowercase
    description = description.lower()
    # 2. Remove special characters
    description = description.replace("-","")
    description = description.replace("®","")
    extractions = get_food_indices(description)
    # Lemmatize each word. 
    lemmatizer = WordNetLemmatizer()
    names = []
    for ingredient in extractions:
      ingredient_name = []
      for word in ingredient.split(" "):
        ingredient_name.append(lemmatizer.lemmatize(word))
      ingredient_name = " ".join(ingredient_name)
      names.append(ingredient_name)
    return names

In [26]:
get_ingredient(sent)

"['peanut butter', 'salt']"

In [24]:
sent = "peanut butter and a pinch of salt"