# Creating the Dataset with Mistakes

In [1]:
from recipe_tagger import RecipeTagger

### Tagger Example

In [2]:
# Build the entity tagger from a model directory on local disk.
# NOTE(review): absolute, user-specific path -- it will not resolve on another
# machine; consider deriving it from a configurable base directory.
tagger = RecipeTagger('/Users/rlopez/PTG/experiments/models/xlm-roberta-base-finetuned-recipe-all')

In [3]:
# Tag an ingredient-style line and render the predicted entities.
tokens, tags = tagger.predict_entities('25 grams whole coffee beans.')
tagger.plot_entities(tokens, tags)
# Same for an instruction-style sentence; `tokens`/`tags` are overwritten.
tokens, tags = tagger.predict_entities('Let the coffee drain completely into the mug before removing the dripper')
tagger.plot_entities(tokens, tags)

### Loading Ingredients, Units and Quantities

In [4]:
import pandas as pd

# Load the NYT ingredient snapshot; the `name`, `unit` and `qty` columns are
# used further down to build the replacement pools.
# NOTE(review): absolute, user-specific path. The displayed frame also has an
# 'Unnamed: 0' column, which suggests the CSV carries a saved index --
# `index_col=0` would presumably absorb it; confirm before changing.
nyt_data = pd.read_csv('/Users/rlopez/PTG/experiments/datasets/nyt/nyt-ingredients-snapshot-2015.csv')
nyt_data

Unnamed: 0,index,input,name,qty,range_end,unit,comment
0,0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.00,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,2,"1 medium-size onion, peeled and chopped",onion,1.00,0.0,,"medium-size, peeled and chopped"
3,3,"2 stalks celery, chopped coarse",celery,2.00,0.0,stalk,chopped coarse
4,4,1 1/2 tablespoons vegetable oil,vegetable oil,1.50,0.0,tablespoon,
...,...,...,...,...,...,...,...
179202,179202,3/4 oz. pineapple juice,pineapple juice,0.75,0.0,ounce,
179203,179203,1 tsp. fresh lemon juice,lemon juice,1.00,0.0,teaspoon,fresh
179204,179204,Angostura bitters,Angostura bitters,0.00,0.0,,
179205,179205,Wedge of pineapple,pineapple,1.00,0.0,wedge,


In [5]:
from fractions import Fraction
import math

def get_frequent_entities(entity, min_frequency=20):
    """List the values of a Series that occur often enough.

    Args:
        entity: pandas Series whose values are counted with ``value_counts``
            (missing values are therefore not counted).
        min_frequency: a value is kept only when its count is strictly
            greater than this number.

    Returns:
        The kept values as a list, in ``value_counts`` order (most frequent
        first).
    """
    occurrences = entity.value_counts()
    frequent = occurrences.loc[occurrences.gt(min_frequency)]
    return frequent.index.tolist()

def create_fraction(number):
    """Format a number as a recipe-style fraction string.

    The value is approximated by the closest fraction whose denominator is at
    most 10, then written as ``'1/2'``, ``'3'`` or a mixed number ``'1 1/2'``.

    The approximation is applied to the whole number rather than to its
    fractional part alone. Rounding only the fractional part produced
    malformed mixed numbers whenever that part rounded to 0 or 1
    (e.g. 1.97 -> '1 1', 1.01 -> '1 0'); now those give '2' and '1'.

    Args:
        number: int or float to format.

    Returns:
        The formatted string. Values below 1 are returned as a plain fraction
        (``str(Fraction)``), exactly as before.
    """
    approximation = Fraction(number).limit_denominator(10)
    if number < 1:
        return str(approximation)

    # Split the already-rounded value so a carry into the integer part is kept.
    whole = approximation.numerator // approximation.denominator
    remainder = approximation - whole
    if remainder == 0:
        return str(whole)
    return '%d %s' % (whole, remainder)
        
# Lower-case names and units so the frequency counts below are
# case-insensitive. This overwrites the columns of `nyt_data` in place.
nyt_data['name'] = nyt_data['name'].str.lower()
nyt_data['unit'] = nyt_data['unit'].str.lower()

# Keep only values seen more than the default threshold of
# get_frequent_entities; these lists are the replacement pools.
ingredients = get_frequent_entities(nyt_data['name'])
units = get_frequent_entities(nyt_data['unit'])
quantities = get_frequent_entities(nyt_data['qty'])
# Numeric quantities become display strings such as '1/2' or '1 1/4'.
# NOTE(review): '0' ends up in this pool (see the printed sample); rows with no
# stated amount appear to carry qty 0.0 -- confirm whether '0' is a wanted
# replacement candidate.
quantities = [create_fraction(x)  for x in quantities]

# NOTE(review): the counts printed here are of the frequent values kept above,
# not of every distinct value in the column.
print('Found %d unique ingredients: %s...' % (len(ingredients), str(ingredients[:5])))
print('Found %d unique units: %s...' % (len(units), str(units[:5])))
print('Found %d unique quantities: %s...' % (len(quantities), str(quantities[:5])))

# Read as a notebook-level global by replace_entity.
corpus_recipe_entities = {'ingredients': ingredients, 'units': units, 'quantities': quantities}

Found 779 unique ingredients: ['salt', 'garlic', 'olive oil', 'sugar', 'butter']...
Found 52 unique units: ['cup', 'tablespoon', 'teaspoon', 'pound', 'ounce']...
Found 60 unique quantities: ['1', '2', '0', '1/2', '1/4']...


### Replacing Entities

In [6]:
# Read the Recipe1M+ dataset
import json
# JSON text is UTF-8 by specification; name the encoding explicitly so that
# decoding does not depend on the platform's locale default.
# NOTE(review): absolute, user-specific path -- not portable across machines.
with open('/Users/rlopez/PTG/experiments/datasets/recipe1M/layer1.json', encoding='utf-8') as fin:
    dataset_recipe1m = json.load(fin)

In [7]:
import random
# Seed the module-level RNG used by random.choice in select_random, so the
# sequence of replacement draws is repeatable from a fresh kernel.
random.seed(0)

def create_negatives_1mrecipe(entities_to_replace, recipe_id):
    """Build and display a corrupted version of one entry of ``dataset_recipe1m``.

    The entry's ingredient lines followed by its instruction lines are passed,
    together with its title, to ``create_wrong_recipe``.

    Args:
        entities_to_replace: entity tags whose tokens should be replaced.
        recipe_id: key/index used to look the entry up in ``dataset_recipe1m``.

    Returns:
        None; the result is only printed/plotted.
    """
    recipe = dataset_recipe1m[recipe_id]
    title = recipe['title']
    sentences = [
        item['text']
        for section in ('ingredients', 'instructions')
        for item in recipe[section]
    ]
    create_wrong_recipe(title, sentences, entities_to_replace)
    
def create_wrong_recipe(recipe_title, recipe_sentences, entities_to_replace):
    """Tag every sentence, swap the selected entities, and show the differences.

    Args:
        recipe_title: title printed above the comparison.
        recipe_sentences: iterable of sentence strings fed to the tagger.
        entities_to_replace: entity tags whose tokens should be replaced.

    Returns:
        None; output goes to stdout and to the tagger's plots.
    """
    # Run the global tagger once per sentence, keeping (tokens, tags) pairs.
    tagged = [tagger.predict_entities(sentence) for sentence in recipe_sentences]
    recipe_entities = {
        'tokens': [list(sentence_tokens) for sentence_tokens, _ in tagged],
        'tags': [list(sentence_tags) for _, sentence_tags in tagged],
    }

    print('RECIPE TITLE:')
    print(recipe_title)

    # print_recipe(...) can render the full original/modified recipe here;
    # only the per-sentence comparison is shown.
    new_recipe_entities = replace_entity(recipe_entities, entities_to_replace)

    print('CHANGES:')
    print_changes(recipe_entities, new_recipe_entities, entities_to_replace)
    
def replace_entity(recipe_entities, entities_to_replace):
    """Return a copy of the recipe with selected entity tokens swapped out.

    Tokens whose tag is listed in ``entities_to_replace`` are replaced by a
    value drawn (via ``select_random``) from the matching pool in the global
    ``corpus_recipe_entities``. Within one call, the same token under the same
    tag always receives the same replacement.

    Changes from the previous version:
      * A requested tag with no replacement pool (anything other than UNIT,
        INGREDIENT or QUANTITY) used to raise UnboundLocalError, or silently
        reuse the previous token's replacement. Such tokens are now passed
        through unchanged.
      * Replacements are remembered per tag, so a token seen under two
        different tags is no longer given a value from the wrong pool.

    Args:
        recipe_entities: dict with parallel lists ``'tokens'`` and ``'tags'``,
            one inner list per sentence.
        entities_to_replace: collection of tags to replace.

    Returns:
        dict with a new ``'tokens'`` list of lists and the *same* ``'tags'``
        object as the input (not copied).
    """
    # Tag -> key of its candidate pool; resolved lazily so the global is only
    # touched when a replacement is actually needed.
    pool_key_by_tag = {'UNIT': 'units', 'INGREDIENT': 'ingredients', 'QUANTITY': 'quantities'}
    # Tag -> {original token: replacement}, kept separate for each entity type.
    mapping_by_tag = {}
    new_recipe_entities = {'tokens': [], 'tags': recipe_entities['tags']}

    for sentence_tokens, sentence_tags in zip(recipe_entities['tokens'], recipe_entities['tags']):
        new_sentence_tokens = []
        for token, tag in zip(sentence_tokens, sentence_tags):
            pool_key = pool_key_by_tag.get(tag) if tag in entities_to_replace else None
            if pool_key is not None:
                tag_mapping = mapping_by_tag.setdefault(tag, {})
                new_token = select_random(token, corpus_recipe_entities[pool_key], tag_mapping)
                tag_mapping[token] = new_token
                token = new_token
            new_sentence_tokens.append(token)

        new_recipe_entities['tokens'].append(new_sentence_tokens)

    return new_recipe_entities
                
def select_random(token, options, replacement_mapping):
    """Pick the replacement for ``token``.

    A token that already has an entry in ``replacement_mapping`` keeps that
    replacement. Otherwise a candidate is drawn at random from ``options``,
    excluding candidates equal to ``token`` so that the replacement actually
    changes the text. Previously the original token could be drawn, leaving
    the sentence unmodified.

    Args:
        token: the original token.
        options: sequence of candidate replacements.
        replacement_mapping: dict of replacements already chosen; it is only
            read here, the caller records the new choice.

    Returns:
        The replacement token. If every candidate equals ``token`` the token
        itself is returned.

    Raises:
        IndexError: if ``options`` is empty (from ``random.choice``).
    """
    if token in replacement_mapping:
        return replacement_mapping[token]

    alternatives = [option for option in options if option != token]
    # Fall back to the full pool when nothing different is available.
    return random.choice(alternatives or options)
    
def print_recipe1(recipe_title, recipe_entities):
    """Print the recipe as plain text, one sentence per line.

    Args:
        recipe_title: accepted but not used.
        recipe_entities: dict whose ``'tokens'`` entry is a list of token
            lists; each inner list is joined with single spaces.
    """
    lines = []
    for sentence_tokens in recipe_entities['tokens']:
        lines.append(' '.join(sentence_tokens))
    print('\n'.join(lines))

def print_recipe(recipe_title, recipe_entities, display_entities):
    """Plot every sentence of the recipe with the global tagger.

    Args:
        recipe_title: accepted but not used.
        recipe_entities: dict with parallel ``'tokens'`` and ``'tags'`` lists,
            one inner list per sentence.
        display_entities: forwarded as the third argument of
            ``tagger.plot_entities``.
    """
    sentence_pairs = zip(recipe_entities['tokens'], recipe_entities['tags'])
    for tokens, tags in sentence_pairs:
        tagger.plot_entities(tokens, tags, display_entities)
        
def print_changes(original_recipe_entities, new_recipe_entities, display_entities):
    """Show each sentence before and after replacement, one framed pair at a time.

    Both versions are plotted with the tags of the original recipe.

    Args:
        original_recipe_entities: dict with ``'tokens'`` and ``'tags'`` lists.
        new_recipe_entities: dict whose ``'tokens'`` list lines up
            sentence-by-sentence with the original.
        display_entities: forwarded as the third argument of
            ``tagger.plot_entities``.
    """
    for position, before in enumerate(original_recipe_entities['tokens']):
        tags = original_recipe_entities['tags'][position]
        after = new_recipe_entities['tokens'][position]

        print('o-----------begin-----------o')
        for tokens in (before, after):
            tagger.plot_entities(tokens, tags, display_entities)
        print('o------------end------------o')

#### Replacing Quantities

In [8]:
# Replace only QUANTITY-tagged tokens in entry 1 of dataset_recipe1m.
create_negatives_1mrecipe(['QUANTITY'], 1)

RECIPE TITLE:
Dilly Macaroni Salad Recipe
CHANGES:
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o


#### Replacing Units

In [9]:
# Replace only UNIT-tagged tokens in entry 1 of dataset_recipe1m.
create_negatives_1mrecipe(['UNIT'], 1)

RECIPE TITLE:
Dilly Macaroni Salad Recipe
CHANGES:
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o


#### Replacing Ingredients

In [10]:
# Replace only INGREDIENT-tagged tokens in entry 1 of dataset_recipe1m.
create_negatives_1mrecipe(['INGREDIENT'], 1)

RECIPE TITLE:
Dilly Macaroni Salad Recipe
CHANGES:
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
o-----------begin-----------o


o------------end------------o
