
# Create JSONs with aggregated data from database

Queries database and gets aggregated data (possible grouped before by country).
Creates a single JSON that contains all this information.

In [32]:
%load_ext autoreload
%autoreload 2

from pymongo import MongoClient
import datetime
import numpy as np
import pandas as pd
import getpass
from aggregation_functions import * # get all functions used below to communicate with mongodb database



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Be sure you are connect on the **EPFL network** (connection at EPFL or via VPN)

Note on authentification:
* Your username-password pair has read-only credentials
* Use the admin user to insert, modify, or create indexes

In [3]:
database = 'ada-project'
user = input('MongoDB name: ') #maxmordig
password = getpass.getpass('MongoDB password: ')

MongoDB name: maxmordig
MongoDB password: ········


In [4]:
# Mongo Client and authentification
client = MongoClient('www.cocotte-minute.ovh', 27017)
db = client[database]
db.authenticate(user, password)
collection = db['recipes']

In [5]:
# Number of recipes
serverMongoVersion = client.server_info()['version']
serverMongoVersion

'3.2.10'

# Example usage of this function

In [6]:
results = group_and_get_aggregate_of_field(collection, group_by_field_address='$ada-country', 
                                aggregate_field_address='$nutrition.calories.amount', 
                                group_by_field_output_name='country', 
                                aggregate_field_output_name='avgCalories', 
                                aggregationFunction = '$avg' 
                               )

df = pd.DataFrame.from_dict(results)
df = df.set_index(['country'])
df.head(3)

Unnamed: 0_level_0,avgCalories,nbRecipes
country,Unnamed: 1_level_1,Unnamed: 2_level_1
israeli,255.634929,24
bangladeshi,355.95435,14
colombian,374.666667,13


In [7]:
results = group_and_get_aggregate_of_field(collection, group_by_field_address='get_global_aggregation', 
                                aggregate_field_address='$nutrition.calories.amount', 
                                group_by_field_output_name='DummyColumn', 
                                aggregate_field_output_name='avgCalories', 
                                aggregationFunction = '$avg' 
                               )

df1 = pd.DataFrame.from_dict(results)
df1.head(3)

Unnamed: 0,DummyColumn,avgCalories,nbRecipes
0,Global aggregation,350.246799,16242


# Several Fields

We now suppose we are interested in several fields, e.g. 'avgCalories' etc. and the goal is to merge this information into a single dataframe.

To first get an idea of what fields we have, we search for a simple recipe.

In [8]:
oneRecipe = collection.find_one({'recipeID':47564})
oneRecipe

{'_id': ObjectId('5825054ace06e50446084706'),
 'adUnit': {'adKeys': [7,
   79,
   95,
   125,
   148,
   150,
   169,
   173,
   184,
   201,
   221,
   235,
   241,
   242,
   249,
   254,
   265,
   608],
  'adZone': 'recipes',
  'contentProviderId': 451,
  'networkCode': '3865',
  'site': 'ar.ios.apps.allrecipes.recipes'},
 'ada-city': [],
 'ada-continent': ['asian'],
 'ada-country': ['indian'],
 'ada-region': [],
 'ada-subcontinent': [],
 'ada-tags': ['world-cuisine',
  'asian',
  'indian',
  'main-dishes',
  'curry',
  'vegetarian'],
 'cookMinutes': 45,
 'description': 'This is a really easy and tasty Indian dish that is sure to stir up your taste buds. Delicious baingan bharta is ready to eat with pita bread, Indian naan, or rice.',
 'directions': [{'displayValue': 'Preheat oven to 450 degrees F (230 degrees C).',
   'ordinal': 1,
   'videoTimestamp': 0},
  {'displayValue': 'Place eggplant on a medium baking sheet. Bake 20 to 30 minutes in the preheated oven, until tender. Remove

In [9]:
showFields(oneRecipe)

> video: None
> reviewCount: 309
> readyInMinutes: 60
> directions: [{'ordinal': 1, 'vid...
> ada-city: []
> footnotes: [{'ordinal': 100, 't...
> similarRecipes
> ->-recipes: [{'sourceID': 461, '...
> ->-links
> ->-->-self
> ->-->-->-href: https://apps.allreci...
> ->-metaData
> ->-->-page: 1
> ->-->-totalCount: 20
> ->-->-pagesize: 20
> submitter
> ->-favoritesCount: 0
> ->-followersCount: 0
> ->-ratingsCount: 0
> ->-handle: None
> ->-region: None
> ->-itemType: Cook
> ->-promotedBrandPixelTrackingUrl: None
> ->-name: Yakuta
> ->-madeRecipesCount: 0
> ->-country: None
> ->-city: None
> ->-userID: 0
> ->-photo
> ->-->-photoDetailUrl: None
> ->-->-rawItemType: 10
> ->-->-title: None
> ->-->-itemType: Photo
> ->-->-recipeTitle: None
> ->-->-description: None
> ->-->-urls: [{'height': 50, 'wid...
> ->-isPro: False
> ->-reciepesCount: 0
> ->-profileUrl: 
> ->-followingCount: 0
> ->-reviewsCount: 0
> ->-rawItemType: 11
> ->-brandedSourceID: 0
> ->-personalRecipeSharedCount: 0
> ada-subconti

In [111]:
# contains all parameters to pass to the aggregation function

# group bys
groupBysToQuery = [
    {
        # get global average
        'group_by_field_address': 'get_global_aggregation',
        'group_by_field_output_name': 'DummyColumn' # ignored,
    },
    {
        'group_by_field_address': '$ada-city',
        'group_by_field_output_name': 'city', # as it appears in dataframe
    },
]

# keep in sync with above groupBysToQuery
dataframeNames = ['global', 'per_city']

# what to aggregate for
fieldArgumentsToQuery = [
    {
        'aggregate_field_address': '$nutrition.calories.amount', 
        'aggregate_field_output_name': 'avg_calories', 
        'aggregationFunction': '$avg' 
    },
    {
        'aggregate_field_address': '$nutrition.vitaminA.amount', 
        'aggregate_field_output_name': 'total_vitaminA_Intake', 
        'aggregationFunction': '$sum' 
    },
]

allDfs = getAllDataframes(collection, groupBysToQuery, fieldArgumentsToQuery, dataframeNames)

Query 1/4 with arguments: {'aggregate_field_address': '$nutrition.calories.amount', 'group_by_field_address': 'get_global_aggregation', 'aggregate_field_output_name': 'avg_calories', 'aggregationFunction': '$avg', 'group_by_field_output_name': 'DummyColumn'}
Query 2/4 with arguments: {'aggregate_field_address': '$nutrition.vitaminA.amount', 'group_by_field_address': 'get_global_aggregation', 'aggregate_field_output_name': 'total_vitaminA_Intake', 'aggregationFunction': '$sum', 'group_by_field_output_name': 'DummyColumn'}
Query 3/4 with arguments: {'aggregate_field_address': '$nutrition.calories.amount', 'group_by_field_address': '$ada-city', 'aggregate_field_output_name': 'avg_calories', 'aggregationFunction': '$avg', 'group_by_field_output_name': 'city'}
Query 4/4 with arguments: {'aggregate_field_address': '$nutrition.vitaminA.amount', 'group_by_field_address': '$ada-city', 'aggregate_field_output_name': 'total_vitaminA_Intake', 'aggregationFunction': '$sum', 'group_by_field_output_n

In [112]:
allDfs['global'].head(3)

Unnamed: 0_level_0,avg_calories,nbRecipes,total_vitaminA_Intake
DummyColumn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Global aggregation,350.246799,16242,22703230.0


In [113]:
allDfs['per_city'].head(3)

Unnamed: 0_level_0,avg_calories,nbRecipes,total_vitaminA_Intake
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
atlanta,424.314499,28,85023.39307
baltimore,282.584182,40,27968.763135
boston,411.666383,12,2768.165049


In [13]:
import os

json_output_filename = '../website/data/aggregatedDataSmall.json'

os.makedirs(os.path.dirname(json_output_filename), exist_ok=True)
write_dataframes_to_json(json_output_filename, allDfs)

#loadedAllDfs = load_dataframes_from_json(json_output_filename)

In [60]:
!cat '../website/data/aggregatedData.json'

{
    "per_city": {
        "dallas": {
            "avg_calories": 281.9190327586,
            "total_vitaminA_Intake": 62139.256174,
            "nbRecipes": 29
        },
        "new-orleans": {
            "avg_calories": 445.9402834884,
            "total_vitaminA_Intake": 42673.55009,
            "nbRecipes": 46
        },
        "cincinnati": {
            "avg_calories": 424.625025,
            "total_vitaminA_Intake": 39608.794,
            "nbRecipes": 12
        },
        "washington-dc": {
            "avg_calories": 291.5897064706,
            "total_vitaminA_Intake": 9453.05414,
            "nbRecipes": 17
        },
        "indianapolis": {
            "avg_calories": 425.4122357143,
            "total_vitaminA_Intake": 12283.5292282,
            "nbRecipes": 15
        },
        "milwaukee": {
            "avg_calories": 344.2545928571,
            "total_vitaminA_Intake": 26335.21554,
            "nbRecipes": 27
        },
        "

# Get many aggregate values

We must first find which fields to pick.

In [45]:
allKeysWithDescription = get_all_field_addresses(oneRecipe, shortenLongStrings=False)
allKeysWithDescription

{'$_id': '5825054ace06e50446084706',
 '$adUnit.adKeys': '[7, 79, 95, 125, 148, 150, 169, 173, 184, 201, 221, 235, 241, 242, 249, 254, 265, 608]',
 '$adUnit.adZone': 'recipes',
 '$adUnit.contentProviderId': '451',
 '$adUnit.networkCode': '3865',
 '$adUnit.site': 'ar.ios.apps.allrecipes.recipes',
 '$ada-city': '[]',
 '$ada-continent': "['asian']",
 '$ada-country': "['indian']",
 '$ada-region': '[]',
 '$ada-subcontinent': '[]',
 '$ada-tags': "['world-cuisine', 'asian', 'indian', 'main-dishes', 'curry', 'vegetarian']",
 '$cookMinutes': '45',
 '$description': 'This is a really easy and tasty Indian dish that is sure to stir up your taste buds. Delicious baingan bharta is ready to eat with pita bread, Indian naan, or rice.',
 '$directions': "[{'ordinal': 1, 'videoTimestamp': 0, 'displayValue': 'Preheat oven to 450 degrees F (230 degrees C).'}, {'ordinal': 2, 'videoTimestamp': 0, 'displayValue': 'Place eggplant on a medium baking sheet. Bake 20 to 30 minutes in the preheated oven, until tende

In [47]:
str.isnumeric('[7, 79, 95, 125, 148]') # false
str.isnumeric('45') # true

True

In [None]:
{
        'aggregate_field_address': '$nutrition.calories.amount', 
        'aggregate_field_output_name': 'avg_calories', 
        'aggregationFunction': '$avg' 
    }

In [61]:
'$submitter.followersCount'.endswith(tuple(['percentDailyValue', 'Count']))

True

In [140]:
allKeysWithDescription

{'$_id': '5825054ace06e50446084706',
 '$adUnit.adKeys': '[7, 79, 95, 125, 148, 150, 169, 173, 184, 201, 221, 235, 241, 242, 249, 254, 265, 608]',
 '$adUnit.adZone': 'recipes',
 '$adUnit.contentProviderId': '451',
 '$adUnit.networkCode': '3865',
 '$adUnit.site': 'ar.ios.apps.allrecipes.recipes',
 '$ada-city': '[]',
 '$ada-continent': "['asian']",
 '$ada-country': "['indian']",
 '$ada-region': '[]',
 '$ada-subcontinent': '[]',
 '$ada-tags': "['world-cuisine', 'asian', 'indian', 'main-dishes', 'curry', 'vegetarian']",
 '$cookMinutes': '45',
 '$description': 'This is a really easy and tasty Indian dish that is sure to stir up your taste buds. Delicious baingan bharta is ready to eat with pita bread, Indian naan, or rice.',
 '$directions': "[{'ordinal': 1, 'videoTimestamp': 0, 'displayValue': 'Preheat oven to 450 degrees F (230 degrees C).'}, {'ordinal': 2, 'videoTimestamp': 0, 'displayValue': 'Place eggplant on a medium baking sheet. Bake 20 to 30 minutes in the preheated oven, until tende

In [173]:
def is_number(myStr):
    # str as input, returns true if is int or float
    try:
        float(myStr)
        return True
    except ValueError:
        return False
    
def make_column_name(myName):
    # creates a name inspired from myName removing leading '$' sign (for mongodb) and '.' because of Pandas
    return myName[1:].replace('.', '_') 
    
is_number(allKeysWithDescription['$nutrition.potassium.amount'])
#allKeysWithDescription['$nutrition.potassium.amount']

True

In [174]:
def get_keys_to_take_with_args(allKeysWithDescription):
    # only takes keys that are ints and make some sense
    
    acceptedKeysWithArguments = dict() # arguments for function

    for keyName, value in allKeysWithDescription.items():

        # only recognizes single integers, not lists
        if is_number(value):
            
            #if 'submitter' in keyName:
            #    print(keyName)
            
            

                
            if (keyName.startswith(tuple(['$adUnit', '$photo.']))
                or keyName.endswith(tuple(['.displayValue']))
                or (keyName in ['$similarRecipes.metaData.page'])
                or any((word in keyName) for word in ['.rawItemType', 'ID', 
                                                      'percentDailyValue'])): 
                # ignore percentDailyValue
                
                # displayValue not relevant
                continue

            aggregFunction = None
            if (keyName.endswith(tuple(['Count', '.amount']))
                or (keyName in ['$prepMinutes', '$cookMinutes', '$servings', '$readyInMinutes',
                                '$ratingAverage', '$similarRecipes.metaData.pagesize', 
                               ])):
                aggregFunction = '$avg'
                outputName = 'avg_' + make_column_name(keyName)
            else:
                print("Ignoring ok? {}: {}".format(keyName, value))
                continue

            args = {
                'aggregate_field_address': keyName, 
                'aggregate_field_output_name': outputName,
                'aggregationFunction': aggregFunction
            }
            acceptedKeysWithArguments[keyName] = args
    
    return acceptedKeysWithArguments

acceptedKeysWithArguments = get_keys_to_take_with_args(allKeysWithDescription)
acceptedKeysWithArguments
#list(acceptedKeysWithArguments.values())

{'$cookMinutes': {'aggregate_field_address': '$cookMinutes',
  'aggregate_field_output_name': 'avg_cookMinutes',
  'aggregationFunction': '$avg'},
 '$nutrition.calcium.amount': {'aggregate_field_address': '$nutrition.calcium.amount',
  'aggregate_field_output_name': 'avg_nutrition_calcium_amount',
  'aggregationFunction': '$avg'},
 '$nutrition.calories.amount': {'aggregate_field_address': '$nutrition.calories.amount',
  'aggregate_field_output_name': 'avg_nutrition_calories_amount',
  'aggregationFunction': '$avg'},
 '$nutrition.caloriesFromFat.amount': {'aggregate_field_address': '$nutrition.caloriesFromFat.amount',
  'aggregate_field_output_name': 'avg_nutrition_caloriesFromFat_amount',
  'aggregationFunction': '$avg'},
 '$nutrition.carbohydrates.amount': {'aggregate_field_address': '$nutrition.carbohydrates.amount',
  'aggregate_field_output_name': 'avg_nutrition_carbohydrates_amount',
  'aggregationFunction': '$avg'},
 '$nutrition.cholesterol.amount': {'aggregate_field_address': '$

In [202]:
# group bys
groupBysToQuery = [
    {
        # get global average
        'group_by_field_address': 'get_global_aggregation',
        'group_by_field_output_name': 'DummyColumn' # ignored,
    },
    {
        'group_by_field_address': '$ada-country',
        'group_by_field_output_name': 'country', # as it appears in dataframe
    }
    {
        'group_by_field_address': '$ada-city',
        'group_by_field_output_name': 'city', # as it appears in dataframe
    },
]

# keep in sync with above groupBys
dataframeNames = ['global', 'per_country', 'per_city']

# what to aggregate for, from valid keys extracted above
fieldArgumentsToQuery = list(acceptedKeysWithArguments.values())

allDfs = getAllDataframes(collection, groupBysToQuery, fieldArgumentsToQuery, dataframeNames)

Query 1/74 with arguments: {'aggregate_field_address': '$nutrition.caloriesFromFat.amount', 'group_by_field_address': 'get_global_aggregation', 'aggregate_field_output_name': 'avg_nutrition_caloriesFromFat_amount', 'aggregationFunction': '$avg', 'group_by_field_output_name': 'DummyColumn'}
Query 2/74 with arguments: {'aggregate_field_address': '$submitter.madeRecipesCount', 'group_by_field_address': 'get_global_aggregation', 'aggregate_field_output_name': 'avg_submitter_madeRecipesCount', 'aggregationFunction': '$avg', 'group_by_field_output_name': 'DummyColumn'}
Query 3/74 with arguments: {'aggregate_field_address': '$submitter.reciepesCount', 'group_by_field_address': 'get_global_aggregation', 'aggregate_field_output_name': 'avg_submitter_reciepesCount', 'aggregationFunction': '$avg', 'group_by_field_output_name': 'DummyColumn'}
Query 4/74 with arguments: {'aggregate_field_address': '$submitter.personalRecipeSharedCount', 'group_by_field_address': 'get_global_aggregation', 'aggregate

In [198]:
import os

json_output_filename = '../website/data/aggregatedData.json'

os.makedirs(os.path.dirname(json_output_filename), exist_ok=True)
write_dataframes_to_json(json_output_filename, allDfs)

In [199]:
!cat '../website/data/aggregatedData.json'

{
    "per_city": {
        "dallas": {
            "avg_submitter_ratingsCount": 10.3103448276,
            "avg_submitter_reciepesCount": 2.1724137931,
            "avg_nutrition_fiber_amount": 3.3479804724,
            "avg_submitter_followingCount": 0.0,
            "avg_nutrition_thiamin_amount": 0.1641491599,
            "avg_similarRecipes_metaData_pagesize": 17.2413793103,
            "avg_readyInMinutes": 113.9310344828,
            "avg_submitter_favoritesCount": 95.724137931,
            "avg_cookMinutes": 16.5172413793,
            "avg_nutrition_saturatedFat_amount": 4.5802688862,
            "avg_reviewCount": 85.7586206897,
            "avg_nutrition_potassium_amount": 347.4378434483,
            "avg_submitter_madeRecipesCount": 10.7931034483,
            "avg_nutrition_calcium_amount": 84.7029391034,
            "avg_submitter_personalRecipeSharedCount": 4.7931034483,
            "avg_nutrition_vitaminA_amount": 2142.7329715172,
            "avg_nutr

# A Safety Check for the Data

Check that the number of calories (for instance) is roughly constant per serving, i.e. the values (e.g. calories) do not need to be divided by the number of servings.

In [71]:
res = group_and_get_aggregate_of_field(
        collection, 
        group_by_field_address='$servings', 
        aggregate_field_address = '$nutrition.calories.amount', 
        group_by_field_output_name = 'servings', 
        aggregate_field_output_name = 'calories', 
        aggregationFunction = '$addToSet' 
)
res = pd.DataFrame.from_dict(res)
res

Unnamed: 0,calories,nbRecipes,servings
0,[130.5006],1,39
1,[182.1069],1,26
2,[60.56655],1,44
3,[],1,55
4,[],1,51
5,[45.91531],1,240
6,[85.97153],1,108
7,[],1,34
8,[24.79369],1,150
9,"[23.77281, 45.6426, 14.3961]",3,128


In [85]:
res['averageCalories'] = res['calories'].map(lambda x: np.mean(x))
res[res['servings'] < 20]




Unnamed: 0,calories,nbRecipes,servings,averageCalories
27,"[30.7259, 117.7393, 409.1838, 1076.825, 165.77...",66,14,287.960521
28,"[7.663048, 114.8627, 152.9864, 96.72525, 73.0,...",752,10,327.745643
33,"[378.8421, 666.5646, 610.725, 439.9805, 472.83...",599,2,494.605193
35,"[87.89926, 58.68952, 574.4297, 10.7974, 11.35,...",501,16,238.532929
36,"[845.1532, 321.1483, 805.8989, 47.271, 381.285...",306,1,392.117609
38,"[424.8909, 397.6094, 199.6972, 217.3545, 466.8...",99,7,386.462684
40,"[120.9205, 92.85124, 152.5559, 376.8631, 280.1...",170,18,256.450726
42,"[314.1674, 290.44, 19.44138, 33.27737, 524.404...",2919,8,356.31638
43,"[469.1374, 317.9422, 582.0063, 67.08359, 414.7...",3128,6,388.3871
45,"[253.275, 24.12324, 131.3087, 227.8033, 308.63...",1526,12,298.971966


We observe there exist recipes for 240 servings !

In [86]:
anotherRecipe = collection.find_one({'servings':240})
anotherRecipe

{'_id': ObjectId('5825c738b779830612cba06d'),
 'adUnit': {'adKeys': [2,
   59,
   64,
   106,
   109,
   125,
   135,
   139,
   148,
   173,
   184,
   201,
   203,
   221,
   228,
   241,
   244,
   264,
   265,
   268,
   586,
   639,
   680],
  'adZone': 'recipe',
  'networkCode': '3865',
  'site': 'ar.ios.apps.allrecipes.recipes'},
 'ada-city': [],
 'ada-continent': [],
 'ada-country': ['us-recipes', 'australian-and-new-zealander'],
 'ada-region': ['hawaii'],
 'ada-subcontinent': ['north-american'],
 'ada-tags': ['us-recipes',
  'us-recipes-by-state',
  'hawaii',
  'world-cuisine',
  'australian-and-new-zealander'],
 'cookMinutes': 90,
 'description': 'A Hawaiian chutney, excellent with pork or lamb. Also a treat with peanut butter on bread. Note: Common mangos are small and sweet even when half-ripe, not juicy.',
 'directions': [{'displayValue': 'In a large saucepan combine vinegar, white sugar, brown sugar, cinnamon, ground ginger, allspice, cloves, nutmeg, chile peppers and sal

## Check that unit is the same for all same nutrients

In [176]:
allKeysWithDescription = get_all_field_addresses(oneRecipe, shortenLongStrings=False)
keyAddressesToCheck = []
for key in allKeysWithDescription:
    if key.endswith('.unit'):
        keyAddressesToCheck.append(key)


In [197]:
df_units = pd.DataFrame(columns = ['nutrition' , 'observed units'])
for keyName in keyAddressesToCheck:

    results = group_and_get_aggregate_of_field(
        collection, 
        group_by_field_address='get_global_aggregation', 
        aggregate_field_address=keyName,
        group_by_field_output_name='DummyName', 
        aggregate_field_output_name='units', 
        aggregationFunction = '$addToSet' 
    )
    #print(results[0])
    #df2.loc[-1] = 
    df_units = df_units.append(pd.DataFrame({'nutrition': make_column_name(keyName), 'observed units': results[0]['units']}))
    #a = results
    
df_units = df_units.reset_index()
df_units

Unnamed: 0,nutrition,observed units
0,nutrition_potassium_unit,mg
0,nutrition_saturatedFat_unit,g
0,nutrition_carbohydrates_unit,g
0,nutrition_sodium_unit,mg
0,nutrition_sugars_unit,g
0,nutrition_thiamin_unit,mg
0,nutrition_vitaminA_unit,IU
0,nutrition_iron_unit,mg
0,nutrition_vitaminB6_unit,mg
0,nutrition_fat_unit,g


In [203]:
df_units.reset_index()

Unnamed: 0,index,nutrition,observed units
0,0,nutrition_potassium_unit,mg
1,0,nutrition_saturatedFat_unit,g
2,0,nutrition_carbohydrates_unit,g
3,0,nutrition_sodium_unit,mg
4,0,nutrition_sugars_unit,g
5,0,nutrition_thiamin_unit,mg
6,0,nutrition_vitaminA_unit,IU
7,0,nutrition_iron_unit,mg
8,0,nutrition_vitaminB6_unit,mg
9,0,nutrition_fat_unit,g


Hence we see that each ingredient is only given in a single unit in all recipes.

In [201]:
json_output_filename = '../website/data/nutritionAmountUnits.json'

os.makedirs(os.path.dirname(json_output_filename), exist_ok=True)
write_dataframes_to_json(json_output_filename, df_units)



ValueError: DataFrame index must be unique for orient='index'.

# Stubs

To be removed in the very end

In [None]:
# old: not currently used
def prettifyCities(cities):
    # cities is an array of arrays of cities
    # flattens this array
    return [city for citiesList in cities for city in citiesList]

In [None]:
def getAverageOfFieldPerRegion(fieldName, fieldAddress):
    # groups by continent, subcontinent, ... and computes the average
    # fieldName used in output, fieldAddress: path to access field in the collection

    results = collection.aggregate( [ 
            { '$group': 
                {
                    '_id': { 'continent': "$ada-continent", 'subcontinent': "$ada-subcontinent", 
                            'country': "$ada-country", 'region': '$ada-region', 'city': '$ada-city'},
                    # e.g. 'avgCalories': { '$avg': '$nutrition.calories.amount' },
                    fieldName: {'$avg': ('$' + fieldAddress)}, 
                    'nbRecipes': { "$sum": 1},
                }
            },
            { '$project': 
                {
                    '_id': 0,
                    # e.g. 'avgCalories': "$avgCalories",
                    fieldName: ("$" + fieldName),
                    'nbRecipes': "$nbRecipes",
                    'continent': "$_id.continent",
                    'subcontinent': "$_id.subcontinent",
                    'country': "$_id.country",
                    'region': "$_id.region",
                    'city': "$_id.city",
                }    
            }
        ]
    )
    
    return results

results = getAverageOfFieldPerRegion(fieldName='avgCalories', fieldAddress='nutrition.calories.amount')

In [24]:
#~ # group bys
#~ groupBysToQuery = [
    #~ {
        #~ # get global average
        #~ 'group_by_field_address': 'get_global_aggregation',
        #~ 'group_by_field_output_name': 'DummyColumn' # ignored,
    #~ },
    #~ {
        #~ 'group_by_field_address': '$ada-city',
        #~ 'group_by_field_output_name': 'city', # as it appears in dataframe
    #~ },
#~ ]

#~ # keep in sync with above groupBys
#~ dataframeNames = ['global', 'per_city']

#~ # what to aggregate for
#~ fieldArgumentsToQuery = [
    #~ {
        #~ 'aggregate_field_address': '$nutrition.calories.amount', 
        #~ 'aggregate_field_output_name': 'avg_calories', 
        #~ 'aggregationFunction': '$avg' 
    #~ },
    #~ {
        #~ 'aggregate_field_address': '$nutrition.vitaminA.amount', 
        #~ 'aggregate_field_output_name': 'total_vitaminA_Intake', 
        #~ 'aggregationFunction': '$sum' 
    #~ },
#~ ]



#~ for keyName, value in allKeysWithDescription.items():
    #~ if keyName == '_id':
        #~ continue
    
    #~ res = group_and_get_aggregate_of_field(collection, group_by_field_address='get_global_aggregation',
                                    #~ aggregate_field_address=keyName, 
                                    #~ group_by_field_output_name='outputField', 
                                    #~ aggregate_field_output_name=keyName, 
                                    #~ aggregationFunction = '$type' )
    
    #~ print(res)
    #~ break
    
    #~ #getAllDataframes(collection, groupBysToQuery, fieldArgumentsToQuery, dataframeNames)


# What is below is not yet done

Need to add one entry per row when there are multiple countries.

In [None]:
import itertools


for i, document in enumerate(results):
    fieldnames = ['continent', 'subcontinent', 'country', 'region', 'city']
    fieldVals = []
    for field in fieldnames:
        fieldVals.append(document[field] or ['NA'])
    combinations = list(itertools.product(*fieldVals))
    localDf = pd.DataFrame.from_records(combinations, columns=fieldnames)
    pd.DataFrame.add(localDf, axis=document.keys())
    
    remainingFields = list(set(document.keys()) - set(fieldnames))
    
    localDf.fillna(value={key: document[key] for key in remainingFields})
    
    if i >= 0:
        break
        
list(combinations)
localDf

In [None]:
# if there occurs an error "NoCursorFound", rerun the above query or set the property 'noCursorTimeout'
#df = pd.DataFrame(columns={'continent', 'country', 'avgCalories', 'nbRecipes'})


allData = []
for i, document in enumerate(results):
    #print(document)
    #df = df.append(document, ignore_index=True)
    allData.append(document)
    #if i >= 3:
    #    break

#allData
df = pd.DataFrame.from_dict(allData)

applyDataTransf = False

if applyDataTransf:
    joinList = lambda myList: ", ".join(myList)
    df['country'] = df['country'].apply(joinList)
    df['continent'] = df['continent'].apply(joinList)
    df['subcontinent'] = df['subcontinent'].apply(joinList)
    df['region'] = df['region'].apply(joinList)
    df['city'] = df['city'].apply(joinList)
    
    
df.head(3)

In [None]:
for i, do

In [None]:
df['continent'].apply(len).value_counts()

df[df['continent'].apply(len) == 2]

In [None]:
df = df.set_index(['continent', 'subcontinent', 'country'])
df.head(3)

In [None]:
df.groupby(level=[0,1])[['nbRecipes']].sum()

In [None]:
# Find an element by ID
oneRecipe = collection.find_one({'recipeID':47564})
oneRecipe

In [None]:
#started from http://stackoverflow.com/a/23282291

from bson.code import Code

# only emit the key of the field, no values
mapper = Code("""
    function() {
    
        function isObject(val) {
            if (val === null) { return false;}
            return ( (typeof val === 'function') || (typeof val === 'object') );
        }
        function getSubkeys(obj) {
            var subKeys = [];
            for (subKey in obj) {
                if (isObject(obj[subKey])) {
                    subKeys.push(subKey);
                }
                
            }
            return subKeys;
        }
        for (var key in this) { 
            subkeys = getSubkeys(this[key])
            emit(key, subkeys); 
        }
    }
""")

# keep the key, do nothing
reducer = Code("""
    function(key, values) {
        //return values.toString() + "END";
        /*
        function onlyUnique(value, index, self) { 
            return self.indexOf(value) === index;
        }
        return values.filter( onlyUnique ).toString();
        */
        return Array.from(new Set(values)).toString();
        //return new Set(values);
        
        
    }
    
    
""")

distinctThingFields = collection.map_reduce(mapper, reducer, 
    out = {'inline' : 1}, query = {'recipeID': { '$in': [47564, 98310]} }, full_response = True) #
distinctThingFields

In [None]:
collection.aggregate( [ 
        { 
            '$project': {
                '_id': 0
            }
        }
    ]
)

In [None]:
oneRecipe['nutrition']['sugars']

In [None]:
distinctThingFields

In [None]:
distinctThingFields.ok == 1.0

Resources:
* [PyMongo Tutorial](https://api.mongodb.com/python/current/tutorial.html)
* [Mongo Documentation](https://docs.mongodb.com/ecosystem/drivers/python/)