In [3]:
from functional import seq
import json
import re

In [14]:
def list_from_file(path, encoding="utf-8"):
    '''Returns text file contents as list (one line => one element in list)

    Each line is stripped of leading/trailing whitespace; lines that are
    empty after stripping are left out.

    path     -- file to read
    encoding -- text encoding used to decode the file. Defaults to UTF-8 so
                the result does not depend on the platform's locale, in line
                with the other explicit UTF-8 reads/writes in this notebook.
    '''
    with open(path, encoding=encoding) as f:
        stripped_lines = (line.strip() for line in f)
        return [entry for entry in stripped_lines if entry]

In [4]:
# Read the JSON-lines dump and decode every line with json.loads.
recipes = (seq.open("../data/recipes-raw.jsonl", encoding="utf-8")
              .map(json.loads))

# Number of decoded records
recipes.len()

65981

In [5]:
# Peek at the first record to see which fields a recipe carries.
recipes.take(1).list()

[{'title': 'Drowned Beef Sandwich with Chipotle Sauce (Torta Ahogada)',
  'url': 'https://www.allrecipes.com/recipe/244940/drowned-beef-sandwich-with-chipotle-sauce-torta-ahogada/',
  'rating': '4.71',
  'numreviews': '11',
  'ingredients': ['12 ounces chipotle cooking sauce (such a Knorr(R))',
   '1 (14 ounce) can reduced-sodium beef broth',
   '1/4 cup chopped fresh cilantro (optional)',
   '2 tablespoons vegetable oil',
   '1 onion, thinly sliced',
   '3 cloves garlic, minced',
   '1 pound thinly sliced deli roast beef',
   '4 bolillo rolls, halved and lightly toasted',
   '4 sprigs fresh cilantro, or to taste (optional)'],
  'instructions': 'Combine chipotle cooking sauce, beef broth, and 1/4 cup chopped cilantro in a saucepan; bring to a boil. Reduce heat to medium-low and simmer, stirring occasionally, for 10 minutes.\nHeat oil in a skillet over medium-high heat; saute onion until softened, about 5 minutes. Stir garlic into onion and cook for 1 minute. Add roast beef and 1/4 cup 

In [6]:
# Pull the 'ingredients' list out of every recipe and flatten the lists into
# one long sequence of ingredient strings.
ingredients = (recipes
                   .map(lambda recipe: recipe['ingredients'])
                   .flatten()
              )

# Preview the first three entries
ingredients.take(3).list()

['12 ounces chipotle cooking sauce (such a Knorr(R))',
 '1 (14 ounce) can reduced-sodium beef broth',
 '1/4 cup chopped fresh cilantro (optional)']

In [7]:
# Dump every distinct ingredient string, sorted, one per line.
with open("all-ingredients.txt", "w", encoding="utf-8") as out:
    for ingredient in ingredients.distinct().sorted():
        out.write(ingredient + '\n')

In [8]:
# Drop section headings such as "Frosting:" - i.e. every entry that contains a colon.
ingredients_sans_section = ingredients.filter(lambda entry: ":" not in entry)


In [9]:
# Strip numeric quantities from every ingredient, then trim, lowercase,
# de-duplicate and sort what is left.
#
# NOTE(review): the "." in the second alternative is not escaped, so it matches
# any character: a non-digit character directly in front of a run of digits is
# removed together with the digits (e.g. the "(" of "(14 ounce)"). The third
# alternative can never be chosen because the first one already matches its
# leading digits. Later cells consume this output as-is, so confirm the
# downstream effect before tightening the pattern.
number_regex = re.compile("([0-9]+)|([0-9]*.[0-9]+)|([0-9]+/[0-9]+)")

ingredients_sans_numbers = (
    ingredients_sans_section
    .map(lambda text: number_regex.sub("", text).strip().lower())
    .distinct()
    .sorted()
)

# Preview the first three cleaned entries
ingredients_sans_numbers.take(3).list()

['', '% milk', "'bouqet garni' spice balls"]

In [10]:
# Write the number-free ingredients, one per line.
# The encoding is pinned to UTF-8 (as for the recipe input and
# all-ingredients.txt) so non-ASCII ingredient text is written the same way on
# every platform instead of depending on the locale's default encoding.
with open("all-ingredients-sans-numbers.txt", "w", encoding="utf-8") as f:
    ingredients_sans_numbers.for_each(lambda x: f.write(x + '\n'))

In [11]:
# Split every entry at its first comma: the part before the comma is kept as
# the ingredient, everything after it is collected separately as `details`.
# str.partition does this in one step; an entry without a comma yields the
# whole string as the ingredient and '' as its detail.

ingredients_sans_details = (ingredients_sans_numbers
                                .map(lambda x: x.partition(",")[0])
                                .distinct()
                                .sorted()
                           )

details = (ingredients_sans_numbers
                                .map(lambda x: x.partition(",")[2])
                                .distinct()
                                .sorted()
                           )

# Both files are written as UTF-8 explicitly so the output does not depend on
# the platform's default encoding (the input data is read as UTF-8).
with open("all-ingredients-sans-details.txt", "w", encoding="utf-8") as f:
    ingredients_sans_details.for_each(lambda x: f.write(x + '\n'))


with open("all-details.txt", "w", encoding="utf-8") as f:
    details.for_each(lambda x: f.write(x + '\n'))

In [37]:
# Remove measurement units; the unit words are read from standard-units.txt
# (one per line).
measurements = list_from_file("standard-units.txt")

# A unit is matched when it is preceded by start-of-string, a space or a
# hyphen, optionally followed by "es" and/or "s", and then followed by ")",
# "-" or a space.
# Raw string literals are used because "\-" is an invalid escape sequence in a
# normal string literal.
# NOTE(review): the unit words are inserted into the pattern unescaped -
# confirm standard-units.txt contains no unintended regex metacharacters.
measurements_regex = re.compile(r"(^| |-)(" + "|".join(measurements) + r")(es)?(s)?[)\- ]")

# The match includes the separator on both sides of the unit, so it is replaced
# by a single space (not the empty string); otherwise the neighbouring words
# would be glued together ("large can tomatoes" -> "largetomatoes").
# Runs of spaces are then collapsed and the ends trimmed.
ingredients_sans_measurements = (ingredients_sans_details
                                .map(lambda x: measurements_regex.sub(" ", x))
                                .map(lambda x: re.sub(" +", " ", x).strip())
                                .distinct()
                                .sorted()
                               )

# UTF-8 is set explicitly so the output does not depend on the platform's
# default encoding (the input data is read as UTF-8).
with open("all-ingredients-sans-measurements.txt", "w", encoding="utf-8") as f:
    ingredients_sans_measurements.for_each(lambda x: f.write(x + '\n'))


