In [1]:
%load_ext autoreload
%autoreload 2

from pymongo import MongoClient
import datetime
import numpy as np
import pandas as pd
import getpass
from utility_functions import *



In [2]:
mapping_from_database_names_to_geojson_ids_filename = '../website/data/database_names_to_geojson_ids.json'
name_to_geojson_content = load_JSON(mapping_from_database_names_to_geojson_ids_filename)

database_aggregated_values_filename = '../website/data/aggregatedData.json'
database_aggregated_content = load_JSON(database_aggregated_values_filename)

In [3]:
name_to_geojson_content

{'country': {'argentinian': {'display_name': 'Argentina',
   'geo_identifier': 'ARG'},
  'australian-and-new-zealander': {'display_name': 'New Zealand',
   'geo_identifier': 'NZL'},
  'austrian': {'display_name': 'Austria', 'geo_identifier': 'AUT'},
  'bangladeshi': {'display_name': 'Bangladesh', 'geo_identifier': 'BGD'},
  'belgian': {'display_name': 'Belgium', 'geo_identifier': 'BEL'},
  'brazilian': {'display_name': 'Brazil', 'geo_identifier': 'BRA'},
  'canadian': {'display_name': 'Canada', 'geo_identifier': 'CAN'},
  'chilean': {'display_name': 'Chile', 'geo_identifier': 'CHL'},
  'chili': {'display_name': 'Chile', 'geo_identifier': 'CHL'},
  'chinese': {'display_name': 'China', 'geo_identifier': 'CHN'},
  'colombian': {'display_name': 'Colombia', 'geo_identifier': 'COL'},
  'cuban': {'display_name': 'Cuba', 'geo_identifier': 'CUB'},
  'czech': {'display_name': 'Czech Republic', 'geo_identifier': 'CZE'},
  'danish': {'display_name': 'Denmark', 'geo_identifier': 'DNK'},
  'dutch': 

In [4]:
database_aggregated_content

{'global': {'Global aggregation': {'avg_cookMinutes': 41.1077453516,
   'avg_nutrition_calcium_amount': 114.9318745406,
   'avg_nutrition_caloriesFromFat_amount': 160.0322171934,
   'avg_nutrition_calories_amount': 350.2467986302,
   'avg_nutrition_carbohydrates_amount': 31.5380632211,
   'avg_nutrition_cholesterol_amount': 66.2203374766,
   'avg_nutrition_fat_amount': 17.7817337376,
   'avg_nutrition_fiber_amount': 3.1178096283,
   'avg_nutrition_folate_amount': 60.2745651289,
   'avg_nutrition_iron_amount': 2.6678069684,
   'avg_nutrition_magnesium_amount': 42.7467841021,
   'avg_nutrition_niacin_amount': 6.5819486949,
   'avg_nutrition_potassium_amount': 434.1991336157,
   'avg_nutrition_protein_amount': 16.2278095183,
   'avg_nutrition_saturatedFat_amount': 6.5396016803,
   'avg_nutrition_sodium_amount': 661.4516873115,
   'avg_nutrition_sugars_amount': 9.8466210198,
   'avg_nutrition_thiamin_amount': 0.2271286606,
   'avg_nutrition_vitaminA_amount': 1418.1540015684,
   'avg_nutrit

In [5]:
# map from one file to the other
aggregation_field_mappings = {
    'per_continent': 'continent',
    'per_subcontinent': 'subcontinent',
    'per_country': 'country',
    'per_state': 'state',
    'per_city': 'city',
}

We now add the geodata to the database.

In [6]:
for (aggregation_key, fields) in database_aggregated_content.items():
    if aggregation_key not in aggregation_field_mappings:
        # ignore
        continue
    # otherwise is an aggregation e.g. "per_country"
    # e.g. display name and geoid for each country
    equivalent_key = aggregation_field_mappings[aggregation_key]
    if equivalent_key not in name_to_geojson_content:
        continue
        
    properties = name_to_geojson_content[equivalent_key]
    for location_name in fields:
        if location_name not in properties:
            continue
        fields[location_name]['geo_identifier'] = properties[location_name]['geo_identifier']
        fields[location_name]['geo_name'] = properties[location_name]['display_name']
        fields[location_name]['database_name'] = location_name
        
        
# now make a list out of the dict
for (aggregation_key, fields) in database_aggregated_content.items():
    if aggregation_key not in aggregation_field_mappings:
        # ignore
        continue
    database_aggregated_content[aggregation_key] = list(fields.values())
    

In [7]:
database_aggregated_content

{'global': {'Global aggregation': {'avg_cookMinutes': 41.1077453516,
   'avg_nutrition_calcium_amount': 114.9318745406,
   'avg_nutrition_caloriesFromFat_amount': 160.0322171934,
   'avg_nutrition_calories_amount': 350.2467986302,
   'avg_nutrition_carbohydrates_amount': 31.5380632211,
   'avg_nutrition_cholesterol_amount': 66.2203374766,
   'avg_nutrition_fat_amount': 17.7817337376,
   'avg_nutrition_fiber_amount': 3.1178096283,
   'avg_nutrition_folate_amount': 60.2745651289,
   'avg_nutrition_iron_amount': 2.6678069684,
   'avg_nutrition_magnesium_amount': 42.7467841021,
   'avg_nutrition_niacin_amount': 6.5819486949,
   'avg_nutrition_potassium_amount': 434.1991336157,
   'avg_nutrition_protein_amount': 16.2278095183,
   'avg_nutrition_saturatedFat_amount': 6.5396016803,
   'avg_nutrition_sodium_amount': 661.4516873115,
   'avg_nutrition_sugars_amount': 9.8466210198,
   'avg_nutrition_thiamin_amount': 0.2271286606,
   'avg_nutrition_vitaminA_amount': 1418.1540015684,
   'avg_nutrit

# Getting the nutrition intake values

In [8]:
# install: pip install html5lib, pip install lxml, pip install bs4
# see http://stackoverflow.com/questions/38447738/beautifulsoup-html5lib-module-object-has-no-attribute-base
# if this doesn't run straightforwardly

reference_intakes_website = 'https://en.wikipedia.org/wiki/Dietary_Reference_Intake' # Wikipedia
intake_tables = pd.read_html(reference_intakes_website, header=0, na_values=['NE', 'ND']) # na values requires version 0.19.1
intake_df = pd.DataFrame(intake_tables[0])
intake_df = intake_df.rename(columns={'EAR': 'Estimated', 'RDA/AI': 'Recommended', 'UL[6]': 'Tolerable', 'unit': 'Unit'})
intake_df.head(10)
#intake_df

Unnamed: 0,Nutrient,Estimated,Recommended,Tolerable,Unit,"Top sources in common measures, USDA[7]"
0,Vitamin A,625.0,900.0,3000.0,µg,"turkey and chicken giblets, liver, red capsicu..."
1,Vitamin C,75.0,90.0,2000.0,mg,"guavas, oranges, grapefruits, frozen peaches,[..."
2,Vitamin D[9],10.0,15.0,100.0,µg,"fortified cereals, mushrooms, yeast, sockeye s..."
3,Vitamin K,,120.0,,µg,"kale, collards, spinach, broccoli, brussel spr..."
4,Vitamin B6,1.1,1.3,100.0,mg,"fortified cereals, chickpeas, sockeye salmon"
5,α-tocopherol (Vitamin E),12.0,15.0,1000.0,mg,"fortified cereals, tomato paste, sunflower seeds"
6,Biotin (B7),,30.0,,µg,"whole grains, almonds, peanuts, beef liver, eg..."
7,Calcium[9],800.0,1000.0,2500.0,mg,"fortified cereals, collards, almonds, condense..."
8,Chloride,,2300.0,3600.0,mg,table salt
9,Chromium,,35.0,,µg,"broccoli, turkey ham, dried apricots, tuna, pi..."


In [9]:
import re
myStr = 'Vitamin D[9]'
myStr = re.sub(r'\[.*\]', '', myStr)
myStr = myStr.replace(' ', '')
myStr

'VitaminD'

In [10]:
def simplify_name(myStr):
    myStr = re.sub(r'\[.*\]', '', myStr)
    myStr = myStr.replace(' ', '')
    myStr = myStr.lower()
    return myStr



In [11]:
additional_daily_intake_dict = {
    'Fat': 65, #g
    'SaturatedFat': 20, #g
    'Cholesterol': 300, #mg
    'protein': 56, #g
    'calories': 2000, #kcal
    'caloriesFromFat': 400, #kcal
    'carbohydrates': 130, #g
    'sugars': 30, #g
    'fiber': 38, #g
}
units = ['g', 'g', 'mg', 'g', 'kcal', 'kcal', 'g', 'g', 'g'];
additional_df = pd.DataFrame.from_dict([(key, value) for (key, value) in additional_daily_intake_dict.items()])
additional_df.columns=['Nutrient', 'Recommended']
additional_df['Unit'] = units
#additional_df

intake_df = pd.concat([intake_df, additional_df], axis=0)
intake_df = intake_df.reset_index()

intake_df['simpleName'] = intake_df['Nutrient'].map(simplify_name)
intake_df.head(3)

Unnamed: 0,index,Estimated,Nutrient,Recommended,Tolerable,"Top sources in common measures, USDA[7]",Unit,simpleName
0,0,625.0,Vitamin A,900.0,3000,"turkey and chicken giblets, liver, red capsicu...",µg,vitamina
1,1,75.0,Vitamin C,90.0,2000,"guavas, oranges, grapefruits, frozen peaches,[...",mg,vitaminc
2,2,10.0,Vitamin D[9],15.0,100,"fortified cereals, mushrooms, yeast, sockeye s...",µg,vitamind


In [12]:
# We can take this column
assert( not intake_df[['Recommended', 'Unit']].isnull().values.any() )

In [13]:
# from nutrition fields in database to information about this nutrition
intake_map = dict()

for field_key in database_aggregated_content['global']['Global aggregation']:
    if "nutrition" in field_key:
        searchObj = re.search( r'nutrition_(.*)_amount', field_key, re.M|re.I) # first occurrence only
        if not searchObj:
            print("Error processing {}".format(field_key))
            continue
        nutrition_name = searchObj.group(1)
        
        # 'Estimated', 'Recommended', 'Tolerable'
        nutrition_data = intake_df[intake_df['simpleName'].str.contains(nutrition_name.lower())][['Recommended', 'Unit']]
        nutrition_data = nutrition_data.iloc[0] # only first entry
        if len(nutrition_data) == 0:
            additional_daily_intake_map
        if len(nutrition_data) == 0:
            print("Could not map nutrition {}".format(field_key))
            continue
        intake_map[field_key] = nutrition_data.values
        

In [14]:
intake_map

{'avg_nutrition_calcium_amount': array([1000.0, 'mg'], dtype=object),
 'avg_nutrition_caloriesFromFat_amount': array([400.0, 'g'], dtype=object),
 'avg_nutrition_calories_amount': array([2000.0, 'g'], dtype=object),
 'avg_nutrition_carbohydrates_amount': array([130.0, 'g'], dtype=object),
 'avg_nutrition_cholesterol_amount': array([300.0, 'g'], dtype=object),
 'avg_nutrition_fat_amount': array([65.0, 'g'], dtype=object),
 'avg_nutrition_fiber_amount': array([38.0, 'kcal'], dtype=object),
 'avg_nutrition_folate_amount': array([400.0, 'µg'], dtype=object),
 'avg_nutrition_iron_amount': array([8.0, 'mg'], dtype=object),
 'avg_nutrition_magnesium_amount': array([420.0, 'mg'], dtype=object),
 'avg_nutrition_niacin_amount': array([16.0, 'mg'], dtype=object),
 'avg_nutrition_potassium_amount': array([4700.0, 'mg'], dtype=object),
 'avg_nutrition_protein_amount': array([56.0, 'g'], dtype=object),
 'avg_nutrition_saturatedFat_amount': array([20.0, 'kcal'], dtype=object),
 'avg_nutrition_sodium_

We now add this data to the json aggregated data.

In [15]:
intake_field_name = 'daily_recommended_intake'
database_aggregated_content[intake_field_name] = dict()
for (nutrition_key, nutr_values) in intake_map.items():
    database_aggregated_content[intake_field_name][nutrition_key] = {'Recommended': nutr_values[0], 'unit': nutr_values[1]}

In [16]:
database_aggregated_content

{'daily_recommended_intake': {'avg_nutrition_calcium_amount': {'Recommended': 1000.0,
   'unit': 'mg'},
  'avg_nutrition_caloriesFromFat_amount': {'Recommended': 400.0, 'unit': 'g'},
  'avg_nutrition_calories_amount': {'Recommended': 2000.0, 'unit': 'g'},
  'avg_nutrition_carbohydrates_amount': {'Recommended': 130.0, 'unit': 'g'},
  'avg_nutrition_cholesterol_amount': {'Recommended': 300.0, 'unit': 'g'},
  'avg_nutrition_fat_amount': {'Recommended': 65.0, 'unit': 'g'},
  'avg_nutrition_fiber_amount': {'Recommended': 38.0, 'unit': 'kcal'},
  'avg_nutrition_folate_amount': {'Recommended': 400.0, 'unit': 'µg'},
  'avg_nutrition_iron_amount': {'Recommended': 8.0, 'unit': 'mg'},
  'avg_nutrition_magnesium_amount': {'Recommended': 420.0, 'unit': 'mg'},
  'avg_nutrition_niacin_amount': {'Recommended': 16.0, 'unit': 'mg'},
  'avg_nutrition_potassium_amount': {'Recommended': 4700.0, 'unit': 'mg'},
  'avg_nutrition_protein_amount': {'Recommended': 56.0, 'unit': 'g'},
  'avg_nutrition_saturatedFa

In [17]:
database_aggregated_values_full_filename = '../website/data/fullAggregatedData.json'

write_JSON(database_aggregated_values_full_filename, database_aggregated_content)
!cat {database_aggregated_values_full_filename}

{
    "per_city": [
        {
            "avg_nutrition_sugars_amount": 14.3039433875,
            "avg_nutrition_carbohydrates_amount": 39.3388236094,
            "avg_nutrition_thiamin_amount": 0.3077651906,
            "avg_ratingCount": 411.625,
            "avg_nutrition_calories_amount": 394.5755590625,
            "avg_readyInMinutes": 118.5,
            "avg_similarRecipes_metaData_page": 1.0,
            "avg_similarRecipes_metaData_totalCount": 17.65625,
            "avg_submitter_ratingsCount": 6.296875,
            "avg_nutrition_vitaminA_amount": 1207.9711992188,
            "avg_nutrition_cholesterol_amount": 71.3078094688,
            "avg_nutrition_protein_amount": 17.0540378156,
            "avg_nutrition_vitaminB6_amount": 0.351701125,
            "avg_submitter_followingCount": 0.0,
            "nbRecipes": 64,
            "avg_reviewCount": 316.03125,
            "avg_nutrition_caloriesFromFat_amount": 172.1815514531,
            "avg_submitter_

Idea: We can use other column to recommend what to eat

# Stubs

In [18]:
dailyIntake = {
    'calcium': 
    'caloriesFromFat':
    'calories':
    'carbohydrates':
    'cholesterol':
    'fat':
    'fiber':
    'folate':
    'iron':
    'magnesium':
    'niacin':
    'potassium':
    'protein':
    'saturatedFat':
    'sodium':
    'sugars':
    'thiamin':
    'vitaminA': 3000 # ug (microgram)
    'vitaminB6':
    'vitaminC': 
}

SyntaxError: invalid syntax (<ipython-input-18-d763b8d7b9c7>, line 3)