In [28]:
from joblib import load
import numpy as np
import pandas as pd
import spacy
import re
from fractions import Fraction
from decimal import Decimal, InvalidOperation
import unicodedata


In [29]:
crf = load('../models/crf_model.joblib')
nlp = spacy.load('en_core_web_lg', disable=['ner','textcat'])

In [30]:
raw_df = pd.read_json("../data/raw/recipes_raw_epi.json")

In [31]:
raw_df.head()

Unnamed: 0,http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,http://www.epicurious.com/recipes/food/views/-an-apple-a-day-51133430,http://www.epicurious.com/recipes/food/views/-blanketed-eggplant-305,http://www.epicurious.com/recipes/food/views/-bloody-mary-tomato-toast-with-celery-and-horseradish-56389813,http://www.epicurious.com/recipes/food/views/-bow-tie-pasta-with-zucchini-101932,http://www.epicurious.com/recipes/food/views/-burnt-carrots-and-parsnips-56390131,http://www.epicurious.com/recipes/food/views/-california-roll-salad-12246,http://www.epicurious.com/recipes/food/views/-candy-corn-frozen-citrus-cream-pops-368770,http://www.epicurious.com/recipes/food/views/-candy-corn-pumpkin-blondies-51254510,http://www.epicurious.com/recipes/food/views/-cannoli-ice-cream-sandwiches-242004,...,http://www.epicurious.com/recipes/food/views/zucchini-wrapped-red-snapper-with-tomato-cumin-and-orange-sauce-10045,http://www.epicurious.com/recipes/food/views/zuni-ricotta-gnocchi-241532,http://www.epicurious.com/recipes/food/views/zuni-roast-chicken-with-bread-salad-56389456,http://www.epicurious.com/recipes/food/views/zuni-rolls-with-raspberry-chipotle-sauce-15259,http://www.epicurious.com/recipes/food/views/zuppa-di-cavolo-nero-cannellini-e-salsicce-kale-white-bean-and-sausage-soup-363386,http://www.epicurious.com/recipes/food/views/zuppa-inglese-394,http://www.epicurious.com/recipes/food/views/zwetschgenkuchen-14600,http://www.epicurious.com/recipes/seared-scallops-with-tomato-water-lime-and-mint-51242060-recipe,http://www.epicurious.com/simple-syrup-368889-recipe,http://www.epicurious.com/suzanne-goin-s-corned-beef-and-cabbage-with-parsley-mustard-sauce-56389323-recipe
avg_rating,2.5,3.5,3,4,3.5,3.5,3.5,2,3,3,...,3,3,4,3.5,3.5,4,3,,,
best_rating,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,,,
ingredients,"[2 or 3 large garlic cloves, a 2-ounce jar dic...","[1/4 cup (1/2 stick) unsalted butter, 4-5 medi...","[8 small Japanese eggplants, peeled, 16 large ...","[1 lemon, zested, juiced, 1/2 shallot, finely ...","[6 small zucchini, 2 teaspoons salt, 2 cups pa...","[1 1/2 pounds carrots, peeled, halved lengthwi...","[1 1/2 cups long-grain rice, 1/4 cup plus 3 ta...","[2 (14-ounce) cans sweetened condensed milk, 1...","[1 cup (2 sticks) cold unsalted butter, plus m...","[3/4 cup whole-milk ricotta, 1/4 cup cream che...",...,"[2 teaspoons cumin seeds, four 6- to 7-ounce r...","[1 pound fresh ricotta (2 cups), 2 large cold ...","[One small chicken, 2 3/4 to 3 1/2 pounds, 4 t...",[1 cup fresh raspberries or frozen unsweetened...,"[2 Italian sausages, skins removed and meat cr...","[4 cups milk (do not use low-fat or nonfat), 1...","[1 cup unbleached all-purpose flour, dash of s...",[],[],[]
instructions,Force garlic through a garlic press into a lar...,Melt butter in a large heavy skillet over medi...,Place eggplants on double thickness of paper t...,"Combine lemon zest, lemon juice, shallot, toma...",Cut zucchini crosswise into 1/8-inch-thick sli...,"Preheat oven to 450°F. Toss carrots, parsnips,...",Into a large saucepan of salted boiling water ...,"Put one can of milk in each of 2 bowls, then w...",Preheat oven to 350°F. Lightly butter a 9- by ...,Stir together all ingredients except chocolate...,...,Preheat oven to 450°F.\nIn a small dry heavy s...,Check the cheese for wetness. If you are lucky...,Remove and discard the lump of fat inside the ...,In a small saucepan combine sauce ingredients ...,Sauté the crumbled sausage in the oil until br...,Bring milk and 1/2 cup plus 3 tablespoons suga...,"To make the crust using a food processor, fitt...",,,
num_reviews,25,3,4,2,48,2,83,1,15,1,...,8,14,4,47,14,7,6,,,


In [32]:
trans_df = raw_df.transpose()
print(trans_df.shape)
trans_df = trans_df[trans_df.astype(str)['ingredients'] != '[]']
print(trans_df.shape)

(36100, 11)
(35998, 11)


In [33]:
def word2features(sent, i):

    features = {
        "bias": 1.0,
        "lemma": sent[i].lemma_,
        "pos": sent[i].pos_,
        "tag": sent[i].tag_,
        "dep": sent[i].dep_,
        "shape": sent[i].shape_,
        "is_alpha": sent[i].is_alpha,
        "is_stop": sent[i].is_stop,
        "is_title": sent[i].is_title,
        "is_punct": sent[i].is_punct,
    }
    if i > 0:
        features.update(
            {
                "-1:lemma": sent[i - 1].lemma_,
                "-1:pos": sent[i - 1].pos_,
                "-1:tag": sent[i - 1].tag_,
                "-1:dep": sent[i - 1].dep_,
                "-1:shape": sent[i - 1].shape_,
                "-1:is_alpha": sent[i - 1].is_alpha,
                "-1:is_stop": sent[i - 1].is_stop,
                "-1:is_title": sent[i - 1].is_title,
                "-1:is_left_punct": sent[i - 1].is_left_punct,
            }
        )
        if i > 1:
            features.update(
                {
                    "-2:lemma": sent[i - 2].lemma_,
                    "-2:pos": sent[i - 2].pos_,
                    "-2:tag": sent[i - 2].tag_,
                    "-2:dep": sent[i - 2].dep_,
                    "-2:shape": sent[i - 2].shape_,
                    "-2:is_alpha": sent[i - 2].is_alpha,
                    "-2:is_stop": sent[i - 2].is_stop,
                    "-2:is_title": sent[i - 2].is_title,
                    "-2:is_left_punct": sent[i - 2].is_left_punct,
                }
            )
    else:
        features["BOS"] = True

    if i < len(sent) - 1:
        features.update(
            {
                "+1:lemma": sent[i + 1].lemma_,
                "+1:pos": sent[i + 1].pos_,
                "+1:tag": sent[i + 1].tag_,
                "+1:dep": sent[i + 1].dep_,
                "+1:shape": sent[i + 1].shape_,
                "+1:is_alpha": sent[i + 1].is_alpha,
                "+1:is_stop": sent[i + 1].is_stop,
                "+1:is_title": sent[i + 1].is_title,
                "+1:is_right_punct": sent[i + 1].is_right_punct,
            }
        )
        if i < len(sent) - 2:
            features.update(
                {
                    "+2:lemma": sent[i + 2].lemma_,
                    "+2:pos": sent[i + 2].pos_,
                    "+2:tag": sent[i + 2].tag_,
                    "+2:dep": sent[i + 2].dep_,
                    "+2:shape": sent[i + 2].shape_,
                    "+2:is_alpha": sent[i + 2].is_alpha,
                    "+2:is_stop": sent[i + 2].is_stop,
                    "+2:is_title": sent[i + 2].is_title,
                    "+2:is_right_punct": sent[i + 2].is_right_punct,
                }
            )
    else:
        features["EOS"] = True

    return features

In [7]:
def cleanHTML(s):
    try: # There are empty input strings that need to be caught
        m1 = re.findall(r"\<?\/?epi:recipelink\>?", s) # this will remove all: epi:recipelink stuff
        if m1:
            for m in m1:
                s = re.sub(r"\<?\/?epi:recipelink\>?", "", s)
        m2 = re.findall('([1-9][0-9]*)\\"', s) # this will remove all: \" and replace with inch
        if m2:
            for m in m2:
                s = re.sub('([0-9]+)\\"', m+"-inch", s)
        if not s:
            s = float('nan')
        else:
            s = s.strip()
        return s
    
    except TypeError:
        return s
    
def cleanUnicodeFractions(s):
    """
    Replace unicode fractions with ascii representation, preceded by a
    space.

    "1\x215e" => "1 7/8"
    """
    
    # match all mixed fractions with a unicode fraction (e.g. 1 ¾ or 1¾) and add them together
    try:
        m1 = re.findall(r"(\d+)\s?([\u2150-\u215E\u00BC-\u00BE])", s)
        if m1:
            for m in m1:
                num = float(m[0]) + float(Fraction(unicodedata.numeric(m[1])))
                s = re.sub(r"(\d+)\s?([\u2150-\u215E\u00BC-\u00BE])", str(round(num, 3)), s)
        # match all unicode fractions
        m2 = re.findall(r"([\u2150-\u215E\u00BC-\u00BE])", s)
        if m2:
            for m in m2:
                s = re.sub(r"([\u2150-\u215E\u00BC-\u00BE])", str(round(float(Fraction(unicodedata.numeric(m))), 3)), s)
    except TypeError:
        print(s)
    return s

def mergeFractions(sent):
    m1 = re.findall(r'(\d+)\s+(\d\/\d)', sent)
    if m1:
        for m in m1:
            num = float(m[0]) + float(Fraction(m[1]))
            sent = re.sub(r'(\d+)\s+(\d\/\d)', str(round(num, 3)), sent)
        
    m2 = re.findall(r'(\d\/\d)', sent)
    if m2:
        for m in m2:
            num = float(Fraction(m))
            sent = re.sub(r'(\d\/\d)', str(round(num, 3)), sent)
    return sent

def multiplyQty(s):
    m1 = re.findall(r'(\d+)\s+(\d+\.\d+)', s)
    if m1:
        for m in m1:
            num = float(m[0]) * float(m[1])
            s = re.sub(r'(\d+)\s+(\d+\.\d+)', str(round(num, 3)), s)
    return s

In [35]:

s = trans_df['ingredients'].apply(pd.Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'ingredient'
del trans_df['ingredients']
trans_df = trans_df.join(s)
trans_df.head()

Unnamed: 0,avg_rating,best_rating,instructions,num_reviews,prepare_again_rating,tags,title,total_time,worst_rating,yields,ingredient
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),2 or 3 large garlic cloves
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),a 2-ounce jar diced pimientos
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),3 cups coarsely grated sharp Cheddar (preferab...
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),1/3 to 1/2 cup mayonnaise
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),crackers


In [8]:
# Load raw data and do some preprocessing
trans_df["ingredients"] = trans_df["ingredients"].apply(lambda row: [cleanHTML(ingredient) for ingredient in row if ingredient]) 
# convert mixed and partial fractions at begining of string to float
trans_df["ingredients"] = trans_df["ingredients"].apply(lambda row: [cleanUnicodeFractions(ingredient) for ingredient in row])
trans_df["ingredients"] = trans_df["ingredients"].apply(lambda row: [mergeFractions(ingredient) for ingredient in row])
trans_df["ingredients"] = trans_df["ingredients"].apply(lambda row: [multiplyQty(ingredient) for ingredient in row])


In [9]:
s = trans_df['ingredients'].apply(pd.Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'ingredient'
del trans_df['ingredients']
trans_df = trans_df.join(s)

In [10]:
trans_df.head()

Unnamed: 0,avg_rating,best_rating,instructions,num_reviews,prepare_again_rating,tags,title,total_time,worst_rating,yields,ingredient
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),2 or 3 large garlic cloves
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),a 2-ounce jar diced pimientos
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),3 cups coarsely grated sharp Cheddar (preferab...
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),0.333 to 0.333 cup mayonnaise
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),crackers


In [11]:
# have spacy parse the input string with the full pipeline to generate features this will take some time
trans_df["ingredient"] = list(nlp.pipe(trans_df["ingredient"].astype('unicode').values, batch_size=50))


In [12]:
crf_predict_features = trans_df["ingredient"].apply(lambda doc: [word2features(doc,i) for i in range(len(doc))])
    

In [13]:
y_pred = crf.predict(crf_predict_features.values)

In [14]:
trans_df["ingredient"] = trans_df["ingredient"].apply(lambda doc: [token.lemma_ for token in doc])

In [15]:
trans_df.head()

Unnamed: 0,avg_rating,best_rating,instructions,num_reviews,prepare_again_rating,tags,title,total_time,worst_rating,yields,ingredient
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),"[2, or, 3, large, garlic, clove]"
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),"[a, 2-ounce, jar, dice, pimiento]"
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),"[3, cup, coarsely, grated, sharp, Cheddar, (, ..."
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),"[0.333, to, 0.333, cup, mayonnaise]"
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.5,4,Force garlic through a garlic press into a lar...,25,0.85,"[Cheese, Vegetable, No-Cook, Vegetarian, Quick...","""Adult"" Pimiento Cheese",0,0,2 item(s),[cracker]


In [16]:
def smartJoin(words):
    """
    Joins list of words with spaces, but is smart about not adding spaces
    before commas.
    """

    input = " ".join(words)

    # replace " , " with ", "
    input = input.replace(" , ", ", ")

    # replace " ( " with " ("
    input = input.replace("( ", "(")

    # replace " ) " with ") "
    input = input.replace(" )", ")")

    return input


def format_ingredient_output(tokens, tags, index, title, display=False):
    """Formats the tagger output into a more convenient dictionary"""
    data = [{}]
    display = [[]]
    prevTag = None
    
    for token, tag in zip(tokens, tags):
    # turn B-NAME/123 back into "name"
        tag = re.sub(r'^[BI]\-', "", tag).lower()
        # ---- DISPLAY ----
        # build a structure which groups each token by its tag, so we can
        # rebuild the original display name later.

        if prevTag != tag:
            display[-1].append((tag, [token]))
            prevTag = tag
        else:
            display[-1][-1][1].append(token)
            #               ^- token
            #            ^---- tag
            #        ^-------- ingredient

            # ---- DATA ----
            # build a dict grouping tokens by their tag

            # initialize this attribute if this is the first token of its kind
        if tag not in data[-1]:
            data[-1][tag] = []


        data[-1][tag].append(token)

    # reassemble the output into a list of dicts.
    output = [
        dict([(k, smartJoin(tokens)) for k, tokens in ingredient.items()])
        for ingredient in data
        if len(ingredient)
    ]

    # Add the raw ingredient phrase
    for i, v in enumerate(output):
        output[i]["input"] = smartJoin(
            [" ".join(tokens) for k, tokens in display[i]])
    try:
        output[-1]['index'] = index
        output[-1]['title'] = title
    except IndexError:
        print(tokens, tags, index, title)
    return output

format_ingredient_output(trans_df["ingredient"].values[0], y_pred[0], '0', 'bob')

[{'qty': '2',
  'comment': 'or 3 large',
  'name': 'garlic',
  'other': 'clove',
  'input': '2 or 3 large garlic clove',
  'index': '0',
  'title': 'bob'}]

In [17]:
from itertools import chain
vfunc = np.vectorize(format_ingredient_output)

ingredient_df = pd.DataFrame(
    list(
        chain.from_iterable(
            vfunc(
                trans_df.ingredient.values,
                y_pred,
                trans_df.index.values,
                trans_df.title.values
            )
        )
    )
)
ingredient_df.set_index("index", inplace=True)

In [18]:
trans_df.loc["http://www.epicurious.com/recipes/food/views/roast-leg-of-lamb-with-tarragon-mint-butter-352043"].ingredient.values
             
             

array([list(['0.75', 'cup', '(', '1.5', 'stick', ')', 'unsalted', 'butter', ',', 'room', 'temperature']),
       list(['3', 'tablespoon', 'chop', 'fresh', 'tarragon']),
       list(['3', 'tablespoon', 'chop', 'fresh', 'mint']),
       list(['4', 'teaspoon', 'tarragon', 'vinegar']),
       list(['2', 'teaspoon', 'coarse', 'kosher', 'salt']),
       list(['6.5-pound', 'leg', 'of', 'lamb', 'with', 'bone', ',', 'well', 'trim']),
       list(['1', 'tablespoon', '1-inch', '-', 'long', 'very', 'thin', 'strip', 'orange', 'peel', '(', 'orange', 'part', 'only', ')']),
       list(['2', 'tablespoon', 'olive', 'oil']),
       list(['coarse', 'kosher', 'salt']),
       list(['2', 'cup', 'dry', 'red', 'wine']),
       list(['1.333', 'cup', 'low', '-', 'salt', 'chicken', 'broth']),
       list(['2', 'teaspoon', 'finely', 'grate', 'orange', 'peel']),
       list(['fresh', 'tarragon', 'and', 'mint', 'sprig', '(', 'for', 'garnish', ')'])],
      dtype=object)

In [19]:
len(y_pred)

350834

In [20]:
ingredient_df.head(20)

Unnamed: 0_level_0,qty,comment,name,other,input,title,unit,range_end
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.0,or 3 large,garlic,clove,2 or 3 large garlic clove,"""Adult"" Pimiento Cheese",,
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,,a 2-ounce jar dice,pimiento,,a 2-ounce jar dice pimiento,"""Adult"" Pimiento Cheese",,
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,3.0,"coarsely grated sharp (preferably English, Can...",Cheddar,,3 cup coarsely grated sharp Cheddar (preferabl...,"""Adult"" Pimiento Cheese",cup,
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,,,mayonnaise,0.333 to 0.333,0.333 to 0.333 cup mayonnaise,"""Adult"" Pimiento Cheese",cup,
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,,,cracker,,cracker,"""Adult"" Pimiento Cheese",,
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,,,toast baguette slice,,toast baguette slice,"""Adult"" Pimiento Cheese",,
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,,,crudités,,crudités,"""Adult"" Pimiento Cheese",,
http://www.epicurious.com/recipes/food/views/-an-apple-a-day-51133430,0.25,,unsalted butter,(0.25 stick),0.25 cup (0.25 stick) unsalted butter,"""An Apple a Day""",cup,
http://www.epicurious.com/recipes/food/views/-an-apple-a-day-51133430,4.0,"(such as Honeycrisp ; about pound), peel, core...",apple,- 5 medium 2.333 0.5-inch,4 - 5 medium apple (such as Honeycrisp ; about...,"""An Apple a Day""",,
http://www.epicurious.com/recipes/food/views/-an-apple-a-day-51133430,0.5,pack,light brown sugar,(),0.5 cup (pack) light brown sugar,"""An Apple a Day""",cup,


In [21]:
len(ingredient_df.name.unique())

24140

In [22]:
def qty2Decimal(qty):
    try:
        qty = float(qty)
    except ValueError:
        qty = np.nan

    return qty
ingredient_df["qty"] = ingredient_df["qty"].apply(lambda x: qty2Decimal(x))

In [23]:
# Now we convert as many units as possible to metric
ingredient_df.loc[ingredient_df.unit == "pound", "qty"] *= 453.592
ingredient_df.loc[ingredient_df.unit == "pound", "unit"] = "grams"

ingredient_df.loc[ingredient_df.unit == "teaspoon", "qty"] *= 4.92892
ingredient_df.loc[ingredient_df.unit == "teaspoon", "unit"] = "milliliters"

ingredient_df.loc[ingredient_df.unit == "tablespoon", "qty"] *= 14.7868
ingredient_df.loc[ingredient_df.unit == "tablespoon", "unit"] = "milliliters"

ingredient_df.loc[ingredient_df.unit == "cup", "qty"] *= 236.588
ingredient_df.loc[ingredient_df.unit == "cup", "unit"] = "milliliters"

ingredient_df.loc[ingredient_df.unit == "pinch", "qty"] *= 4.92892 * (1 / 16)
ingredient_df.loc[ingredient_df.unit == "pinch", "unit"] = "milliliters"

ingredient_df.loc[ingredient_df.unit == "dash", "qty"] *= 4.92892 * (1 / 8)
ingredient_df.loc[ingredient_df.unit == "dash", "unit"] = "milliliters"

ingredient_df.loc[ingredient_df.unit == "ounce", "qty"] *= 28.3495
ingredient_df.loc[ingredient_df.unit == "ounce", "unit"] = "grams"

ingredient_df.loc[ingredient_df.unit == "fluid ounce", "qty"] *= 29.5735
ingredient_df.loc[ingredient_df.unit == "fluid ounce", "unit"] = "milliliters"

ingredient_df.loc[ingredient_df.unit == "pint", "qty"] *= 473.176
ingredient_df.loc[ingredient_df.unit == "pint", "unit"] = "milliliters"

ingredient_df.loc[ingredient_df.unit == "quart", "qty"] *= 946.353
ingredient_df.loc[ingredient_df.unit == "quart", "unit"] = "milliliters"

ingredient_df.loc[ingredient_df.unit == "liter", "qty"] *= 1000
ingredient_df.loc[ingredient_df.unit == "liter", "unit"] = "milliliters"

ingredient_df.loc[ingredient_df.unit == "gallon", "qty"] *= 3785.41
ingredient_df.loc[ingredient_df.unit == "gallon", "unit"] = "milliliters"

ingredient_df.loc[ingredient_df.unit == "drop", "qty"] *= 0.05
ingredient_df.loc[ingredient_df.unit == "drop", "unit"] = "milliliters"

ingredient_df.loc[ingredient_df.unit == "jigger", "qty"] *= 44.3603
ingredient_df.loc[ingredient_df.unit == "jigger", "unit"] = "milliliters"

ingredient_df.head(10)

Unnamed: 0_level_0,qty,comment,name,other,input,title,unit,range_end
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,2.0,or 3 large,garlic,clove,2 or 3 large garlic clove,"""Adult"" Pimiento Cheese",,
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,,a 2-ounce jar dice,pimiento,,a 2-ounce jar dice pimiento,"""Adult"" Pimiento Cheese",,
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,709.764,"coarsely grated sharp (preferably English, Can...",Cheddar,,3 cup coarsely grated sharp Cheddar (preferabl...,"""Adult"" Pimiento Cheese",milliliters,
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,,,mayonnaise,0.333 to 0.333,0.333 to 0.333 cup mayonnaise,"""Adult"" Pimiento Cheese",milliliters,
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,,,cracker,,cracker,"""Adult"" Pimiento Cheese",,
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,,,toast baguette slice,,toast baguette slice,"""Adult"" Pimiento Cheese",,
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,,,crudités,,crudités,"""Adult"" Pimiento Cheese",,
http://www.epicurious.com/recipes/food/views/-an-apple-a-day-51133430,59.147,,unsalted butter,(0.25 stick),0.25 cup (0.25 stick) unsalted butter,"""An Apple a Day""",milliliters,
http://www.epicurious.com/recipes/food/views/-an-apple-a-day-51133430,4.0,"(such as Honeycrisp ; about pound), peel, core...",apple,- 5 medium 2.333 0.5-inch,4 - 5 medium apple (such as Honeycrisp ; about...,"""An Apple a Day""",,
http://www.epicurious.com/recipes/food/views/-an-apple-a-day-51133430,118.294,pack,light brown sugar,(),0.5 cup (pack) light brown sugar,"""An Apple a Day""",milliliters,


In [24]:
recipe_df = ingredient_df.pivot_table(
    index="index", columns="name", values="qty", aggfunc=np.mean
)
recipe_df.fillna(0, inplace=True)
recipe_df.head(10)

name,""" 00 "" (soft wheat) flour flour",""" 00 "" flour",""" 00 "" pasta flour",""" > chicken stock chicken broth",""" > panko bread crumb",""" baby "" Pattypan squash",""" large "" egg",""" lite "" ricotta",""" sheets""carta da musica bread",""" shrimp and crab boil "" spice",...,| honey,| organic powdered sugar,| unsalted butter f,árbol chile,árbol chile powder or cayenne pepper,árbol chile red chile,ñame,‚ tablespoon medium - dry sherry,ﬁne bread crumb,ﬁne salt
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://www.epicurious.com/recipes/food/views/-adult-pimiento-cheese-100852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
http://www.epicurious.com/recipes/food/views/-an-apple-a-day-51133430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
http://www.epicurious.com/recipes/food/views/-blanketed-eggplant-305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
http://www.epicurious.com/recipes/food/views/-bloody-mary-tomato-toast-with-celery-and-horseradish-56389813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
http://www.epicurious.com/recipes/food/views/-bow-tie-pasta-with-zucchini-101932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
http://www.epicurious.com/recipes/food/views/-burnt-carrots-and-parsnips-56390131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
http://www.epicurious.com/recipes/food/views/-california-roll-salad-12246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
http://www.epicurious.com/recipes/food/views/-candy-corn-frozen-citrus-cream-pops-368770,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
http://www.epicurious.com/recipes/food/views/-candy-corn-pumpkin-blondies-51254510,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
http://www.epicurious.com/recipes/food/views/-cannoli-ice-cream-sandwiches-242004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Let's save our dataframe so we can look at it without having to reload and recompute everything later.
# pickle is much faster than csv
recipe_df.to_pickle("../data/processed/recipe_vectors.pickle")

In [26]:
recipe_df.columns

Index(['" 00 " (soft wheat) flour flour', '" 00 " flour', '" 00 " pasta flour',
       '" > chicken stock chicken broth', '" > panko bread crumb',
       '" baby " Pattypan squash', '" large " egg', '" lite " ricotta',
       '" sheets"carta da musica bread', '" shrimp and crab boil " spice',
       ...
       '| honey', '| organic powdered sugar', '| unsalted butter f',
       'árbol chile', 'árbol chile powder or cayenne pepper',
       'árbol chile red chile', 'ñame', '‚ tablespoon medium - dry sherry',
       'ﬁne bread crumb', 'ﬁne salt'],
      dtype='object', name='name', length=17764)

In [27]:
recipe_df.loc[recipe_df['" bake pie crust'] > 0]

KeyError: '" bake pie crust'

In [None]:
# Here we compute the t-SNE
# WARNING: THIS TAKES A LONG TIME FOR LARGE DATAFRAMES
#from sklearn.manifold import TSNE
#data = recipe_df[recipe_df.columns].values
#tsne = TSNE(n_components=2, verbose=1, perplexity=50, n_iter=5000)
#tsne_results = tsne.fit_transform(data)
print("tsne done")
#recipe_df["tsne-2d-one"] = tsne_results[:, 0]
#recipe_df["tsne-2d-two"] = tsne_results[:, 1]

In [None]:
#recipe_df = pd.read_pickle("../data/processed/recipe_vectors.pickle")

In [None]:
#from sklearn.manifold import TSNE
#recipe_df = recipe_df.iloc[0:2000]
#data = recipe_df[recipe_df.columns].values

In [None]:
#tsne = TSNE(n_components=2, verbose=1, perplexity=50, n_iter=5000)
#tsne_results = tsne.fit_transform(data)
#print("tsne done")

In [None]:
#import matplotlib.pyplot as plt
#import seaborn as sns
#plt.figure(figsize=(16, 10))
#recipe_df["tsne-2d-one"] = tsne_results[:, 0]
#recipe_df["tsne-2d-two"] = tsne_results[:, 1]
#tsne_1 = tsne_results[:, 0]
#sns.scatterplot(x="tsne-2d-one", y="tsne-2d-two", data=recipe_df, linewidth=0)
#plt.show()

In [None]:
## Here I'm just exploring outliers in the t-SNE
#sample_df = recipe_df.loc[recipe_df["tsne-2d-two"] >= 50]
#drop_cols = sample_df.columns[(sample_df == 0).sum() == sample_df.shape[0]]
#sample_df.drop(drop_cols, axis=1, inplace=True)
#sample_df.columns.unique().values
#sample_df.head(15)

In [None]:
#sample_df.columns.unique().values