In [1]:
import pandas as pd
import json

To avoid problems uploading large files to GitHub, I've taken a sample of the original recipe dataset (about half of it), which should be enough for our purposes.

In [6]:
# Load the sampled recipe dataset; the path is relative to the notebook's
# working directory, so the CSV must sit next to this notebook.
recipes = pd.read_csv('foodRecSysdata_recipe_sample.csv')

In [7]:
# (rows, columns) of the loaded sample.
recipes.shape

(22388, 6)

In [8]:
# Preview the first rows to see how each column is encoded
# (e.g. ingredients are a single '^'-separated string).
recipes.head()

Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,cooking_directions,nutritions
0,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,sauerkraut drained^Granny Smith apples sliced^...,{'directions': u'Prep\n15 m\nCook\n2 h 30 m\nR...,"{u'niacin': {u'hasCompleteData': False, u'name..."
1,218939,Foolproof Rosemary Chicken Wings,https://images.media-allrecipes.com/userphotos...,chicken wings^sprigs rosemary^head garlic^oliv...,"{'directions': u""Prep\n20 m\nCook\n40 m\nReady...","{u'niacin': {u'hasCompleteData': True, u'name'..."
2,87211,Chicken Pesto Paninis,https://images.media-allrecipes.com/userphotos...,focaccia bread quartered^prepared basil pesto^...,{'directions': u'Prep\n15 m\nCook\n5 m\nReady ...,"{u'niacin': {u'hasCompleteData': True, u'name'..."
3,245714,Potato Bacon Pizza,https://images.media-allrecipes.com/userphotos...,red potatoes^strips bacon^Sauce:^heavy whippin...,{'directions': u'Prep\n20 m\nCook\n45 m\nReady...,"{u'niacin': {u'hasCompleteData': True, u'name'..."
4,218545,Latin-Inspired Spicy Cream Chicken Stew,https://images.media-allrecipes.com/userphotos...,skinless boneless chicken breast halves^diced ...,{'directions': u'Prep\n10 m\nCook\n8 h 15 m\nR...,"{u'niacin': {u'hasCompleteData': False, u'name..."


In [9]:
# Count missing values per column before any cleaning.
recipes.isna().sum()

recipe_id             0
recipe_name           0
image_url             0
ingredients           0
cooking_directions    0
nutritions            0
dtype: int64

Since we're just identifying a recipe by its ingredients and sending the user to its URL on allrecipes.com, I drop the `cooking_directions` and `nutritions` columns. If `cooking_directions` proves useful for building a recommender system, the column can be reintroduced. Recipe rating information is also available, should we add rating functionality to our app.

In [10]:
# Build an independent working frame without the two unused columns, leaving
# `recipes` untouched. Dropping first and copying afterwards avoids
# `inplace=True` and means the discarded columns are never copied.
copy = recipes.drop(columns=["cooking_directions", "nutritions"]).copy()

In [11]:
# The recipe page URL is the allrecipes base path followed by the numeric id,
# so cast the ids to strings and prepend the base path.
recipe_ids = copy["recipe_id"].astype(str)
copy["recipe_url"] = "https://www.allrecipes.com/recipe/" + recipe_ids
copy.head()

Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,recipe_url
0,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,sauerkraut drained^Granny Smith apples sliced^...,https://www.allrecipes.com/recipe/240488
1,218939,Foolproof Rosemary Chicken Wings,https://images.media-allrecipes.com/userphotos...,chicken wings^sprigs rosemary^head garlic^oliv...,https://www.allrecipes.com/recipe/218939
2,87211,Chicken Pesto Paninis,https://images.media-allrecipes.com/userphotos...,focaccia bread quartered^prepared basil pesto^...,https://www.allrecipes.com/recipe/87211
3,245714,Potato Bacon Pizza,https://images.media-allrecipes.com/userphotos...,red potatoes^strips bacon^Sauce:^heavy whippin...,https://www.allrecipes.com/recipe/245714
4,218545,Latin-Inspired Spicy Cream Chicken Stew,https://images.media-allrecipes.com/userphotos...,skinless boneless chicken breast halves^diced ...,https://www.allrecipes.com/recipe/218545


In [12]:
# The raw data separates ingredients with '^'. That character is also a regex
# anchor, so pass regex=False to force a literal replacement instead of
# relying on the pandas version's default for `regex`.
copy["ingredients"] = copy["ingredients"].str.replace('^', ', ', regex=False)
copy.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,recipe_id,recipe_name,image_url,ingredients,recipe_url
0,240488,"Pork Loin, Apples, and Sauerkraut",https://images.media-allrecipes.com/userphotos...,"sauerkraut drained, Granny Smith apples sliced...",https://www.allrecipes.com/recipe/240488
1,218939,Foolproof Rosemary Chicken Wings,https://images.media-allrecipes.com/userphotos...,"chicken wings, sprigs rosemary, head garlic, o...",https://www.allrecipes.com/recipe/218939
2,87211,Chicken Pesto Paninis,https://images.media-allrecipes.com/userphotos...,"focaccia bread quartered, prepared basil pesto...",https://www.allrecipes.com/recipe/87211
3,245714,Potato Bacon Pizza,https://images.media-allrecipes.com/userphotos...,"red potatoes, strips bacon, Sauce:, heavy whip...",https://www.allrecipes.com/recipe/245714
4,218545,Latin-Inspired Spicy Cream Chicken Stew,https://images.media-allrecipes.com/userphotos...,"skinless boneless chicken breast halves, diced...",https://www.allrecipes.com/recipe/218545


In [13]:
# Serialise the cleaned frame keyed by row index ({index: {column: value}})
# and write it out for the app to consume.
out = copy.to_json(orient="index")
with open("recipe_dataset_sample_clean.json", mode="w") as json_file:
    json_file.write(out)