In [1]:
from pymongo import MongoClient
import datetime
import numpy as np
import pandas as pd
import getpass

Be sure you are connect on the **EPFL network** (connection at EPFL or via VPN)

Note on authentification:
* Your username-password pair has read-only credentials
* Use the admin user to insert, modify, or create indexes

In [2]:
database = 'ada-project'
user = input('MongoDB name: ') #maxmordig
password = getpass.getpass('MongoDB password: ')

MongoDB name: loicveyssiere
MongoDB password: ········


In [3]:
# Mongo Client and authentification
client = MongoClient('www.cocotte-minute.ovh', 27017)
db = client[database]
db.authenticate(user, password)
collection = db['recipes']

In [4]:
# Number of recipes
serverMongoVersion = client.server_info()['version']
serverMongoVersion

'3.2.10'

In [5]:
def getAggregateOfFieldPerCountry(fieldName, fieldAddress, aggregationFunction = '$avg'):
    # computes the average of the field by country
    # fieldName used in output dictionary for the requested quantity, fieldAddress: path to access field in the collection
    # if 'aggregateFunction', not the average is used, but this function, e.g. $sum, $avg 
    # needs to be preceded by a '$' sign !
    # (see mongodb reference for expressions to appear in '$group' statement)

    fieldAddress = '$' + fieldAddress # add $ sign, syntax in mongodb
    
    results = collection.aggregate( [ 
            { "$unwind": 
                 "$ada-country", 
            },
            #{ "$unwind": 
            #     "$ada-city", 
             # to have cities separately (because ada-city is an array of cities), i.e. "flatten" array
             # note, this way we weight these recipes by the number of cities!
             # may change this and flatten array manually later
            #},
            { '$group': 
                {
                    '_id': { 'country': "$ada-country"},
                    # e.g. 'avgCalories': { '$avg': '$nutrition.calories.amount' },
                    'avgQuantity': {aggregationFunction: fieldAddress}, 
                    'nbRecipes': { "$sum": 1},
                    'cities': { "$addToSet": "$ada-city" },
                }
            },
            { '$project': 
                {
                    '_id': 0,
                    # e.g. 'avgCalories': "$avgCalories",
                    fieldName: "$avgQuantity",
                    'nbRecipes': "$nbRecipes",
                    'country': "$_id.country",
                    'allCities': "$cities", # possibly over multiple countries
                }    
            }
        ]
    )
    
    return results

# Example usage of this function

In [6]:
results = getAggregateOfFieldPerCountry(fieldName='avgCalories', fieldAddress='nutrition.calories.amount')
results = list(results) # to keep them

In [7]:
def prettifyCities(cities):
    # cities is an array of arrays of cities
    # flattens this array
    return [city for citiesList in cities for city in citiesList]

#prettifyCities(df.iloc[6, 0])

In [11]:
df = pd.DataFrame.from_dict(results)
df = df.set_index(['country'])
df['allCities'] = df['allCities'].map(prettifyCities)
df.head(50)

Unnamed: 0_level_0,allCities,avgCalories,nbRecipes
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
israeli,[],255.634929,24
bangladeshi,[],355.95435,14
colombian,[houston],374.666667,13
dutch,[],314.347771,38
english,"[cleveland, vancouver, toronto]",346.135023,157
brazilian,[dallas],321.4505,88
polish,"[indianapolis, dallas, pittsburgh, minneapolis...",336.713056,124
korean,"[buffalo, california, los-angeles, vancouver]",372.247254,112
welsh,[],218.828295,19
scottish,[toronto],366.916204,51


# Several Fields

We now suppose we are interested in several fields, e.g. 'avgCalories' etc. and the goal is to merge this information into a single dataframe.

To first get an idea of what fields we have, we search for a simple recipe.

In [12]:
oneRecipe = collection.find_one({'recipeID':47564})
oneRecipe

{'_id': ObjectId('5825054ace06e50446084706'),
 'adUnit': {'adKeys': [7,
   79,
   95,
   125,
   148,
   150,
   169,
   173,
   184,
   201,
   221,
   235,
   241,
   242,
   249,
   254,
   265,
   608],
  'adZone': 'recipes',
  'contentProviderId': 451,
  'networkCode': '3865',
  'site': 'ar.ios.apps.allrecipes.recipes'},
 'ada-city': [],
 'ada-continent': ['asian'],
 'ada-country': ['indian'],
 'ada-region': [],
 'ada-subcontinent': [],
 'ada-tags': ['world-cuisine',
  'asian',
  'indian',
  'main-dishes',
  'curry',
  'vegetarian'],
 'cookMinutes': 45,
 'description': 'This is a really easy and tasty Indian dish that is sure to stir up your taste buds. Delicious baingan bharta is ready to eat with pita bread, Indian naan, or rice.',
 'directions': [{'displayValue': 'Preheat oven to 450 degrees F (230 degrees C).',
   'ordinal': 1,
   'videoTimestamp': 0},
  {'displayValue': 'Place eggplant on a medium baking sheet. Bake 20 to 30 minutes in the preheated oven, until tender. Remove

In [13]:
def showFields(recipeObject, rowPrefix = '> '):
    # This methods recursively displays all fields in the dict structure (possibly dict of dicts of dicts ...)
    for key, value in recipeObject.items():
        if isinstance(value, dict):
            print("{}{}".format(rowPrefix, key))
            showFields(value, rowPrefix = (rowPrefix + '->-'))
        else:
            valueToShow = "{}".format(value)
            if len(valueToShow) > 20:
                valueToShow = valueToShow[:20] + "..."
            print("{}{}: {}".format(rowPrefix, key, valueToShow))

In [14]:
showFields(oneRecipe)

> topReviews: [{'dateLastModified'...
> footnotes: [{'ordinal': 100, 't...
> isSponsored: False
> _id: 5825054ace06e5044608...
> ada-city: []
> type: recipes
> recipeID: 47564
> submitter
> ->-handle: None
> ->-madeRecipesCount: 0
> ->-isPro: False
> ->-brandedSourceID: 0
> ->-country: None
> ->-city: None
> ->-itemType: Cook
> ->-name: Yakuta
> ->-promotedBrandPixelTrackingUrl: None
> ->-region: None
> ->-favoritesCount: 0
> ->-personalRecipeSharedCount: 0
> ->-followingCount: 0
> ->-reviewsCount: 0
> ->-profileUrl: 
> ->-photo
> ->-->-urls: [{'url': 'http://ima...
> ->-->-description: None
> ->-->-recipeTitle: None
> ->-->-itemType: Photo
> ->-->-title: None
> ->-->-rawItemType: 10
> ->-->-photoDetailUrl: None
> ->-userID: 0
> ->-reciepesCount: 0
> ->-followersCount: 0
> ->-rawItemType: 11
> ->-ratingsCount: 0
> ada-country: ['indian']
> servings: 4
> ada-region: []
> ada-continent: ['asian']
> adUnit
> ->-networkCode: 3865
> ->-adKeys: [7, 79, 95, 125, 148...
> ->-contentProviderId:

In [15]:
# contains all parameters to pass to the aggregation function
fieldArgumentsToQuery = [
    # can also add other arguments, e.g.
    {'fieldName': 'avgCalories', 'fieldAddress': 'nutrition.calories.amount'},
    {'fieldName': 'avgIronIntake', 'fieldAddress': 'nutrition.vitaminA.amount'},
    #['avgIronIntake', 'nutrition.caloriesFromFat.amount'],
                ]

df = None
for i, fieldArguments in enumerate(fieldArgumentsToQuery):
    print('Query {} with arguments'.format(i+1))
    results = getAggregateOfFieldPerCountry(**fieldArguments)
    results = list(results) # to keep them
    
    localDf = pd.DataFrame.from_dict(results)
    localDf['allCities'] = localDf['allCities'].map(prettifyCities)
    
    if df is None:
        df = localDf
    else:
        addedColumnName = fieldArguments['fieldName']
        df = df.merge(localDf[['country', addedColumnName]], how='outer', on='country')
        
df = df.set_index(['country'])
df = df.sort_index()

Query 1 with arguments
Query 2 with arguments


In [17]:
df.head(30)

Unnamed: 0_level_0,allCities,avgCalories,nbRecipes,avgIronIntake
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
argentinian,[],263.49,23,1370.474883
australian-and-new-zealander,"[pittsburgh, seattle, vancouver, toronto, cali...",320.260396,548,2052.773498
austrian,[],318.212563,30,496.887805
bangladeshi,[],355.95435,14,954.603441
belgian,"[minneapolis, green-bay]",345.666367,15,2523.998303
brazilian,[dallas],321.4505,88,1237.762397
canadian,"[california, los-angeles, baltimore, seattle, ...",312.092431,1167,1377.118249
chilean,[],373.220238,32,1638.903829
chili,[],423.847957,30,1268.63334
chinese,"[seattle, california, los-angeles, vancouver, ...",337.734472,247,951.867099


# Stubs

In [184]:
def getAverageOfFieldPerRegion(fieldName, fieldAddress):
    # groups by continent, subcontinent, ... and computes the average
    # fieldName used in output, fieldAddress: path to access field in the collection

    results = collection.aggregate( [ 
            { '$group': 
                {
                    '_id': { 'continent': "$ada-continent", 'subcontinent': "$ada-subcontinent", 
                            'country': "$ada-country", 'region': '$ada-region', 'city': '$ada-city'},
                    # e.g. 'avgCalories': { '$avg': '$nutrition.calories.amount' },
                    fieldName: {'$avg': ('$' + fieldAddress)}, 
                    'nbRecipes': { "$sum": 1},
                }
            },
            { '$project': 
                {
                    '_id': 0,
                    # e.g. 'avgCalories': "$avgCalories",
                    fieldName: ("$" + fieldName),
                    'nbRecipes': "$nbRecipes",
                    'continent': "$_id.continent",
                    'subcontinent': "$_id.subcontinent",
                    'country': "$_id.country",
                    'region': "$_id.region",
                    'city': "$_id.city",
                }    
            }
        ]
    )
    
    return results

results = getAverageOfFieldPerRegion(fieldName='avgCalories', fieldAddress='nutrition.calories.amount')

# What is below is not yet done

Need to add one entry per row when there are multiple countries.

In [241]:
import itertools


for i, document in enumerate(results):
    fieldnames = ['continent', 'subcontinent', 'country', 'region', 'city']
    fieldVals = []
    for field in fieldnames:
        fieldVals.append(document[field] or ['NA'])
    combinations = list(itertools.product(*fieldVals))
    localDf = pd.DataFrame.from_records(combinations, columns=fieldnames)
    pd.DataFrame.add(localDf, axis=document.keys())
    
    remainingFields = list(set(document.keys()) - set(fieldnames))
    
    localDf.fillna(value={key: document[key] for key in remainingFields})
    
    if i >= 0:
        break
        
list(combinations)
localDf

TypeError: f() missing 1 required positional argument: 'other'

In [178]:
# if there occurs an error "NoCursorFound", rerun the above query or set the property 'noCursorTimeout'
#df = pd.DataFrame(columns={'continent', 'country', 'avgCalories', 'nbRecipes'})


allData = []
for i, document in enumerate(results):
    #print(document)
    #df = df.append(document, ignore_index=True)
    allData.append(document)
    #if i >= 3:
    #    break

#allData
df = pd.DataFrame.from_dict(allData)

applyDataTransf = False

if applyDataTransf:
    joinList = lambda myList: ", ".join(myList)
    df['country'] = df['country'].apply(joinList)
    df['continent'] = df['continent'].apply(joinList)
    df['subcontinent'] = df['subcontinent'].apply(joinList)
    df['region'] = df['region'].apply(joinList)
    df['city'] = df['city'].apply(joinList)
    
    
df.head(3)

Unnamed: 0,avgCalories,city,continent,country,nbRecipes,region,subcontinent
0,138.6014,[],[],[us-recipes],1,[delaware],[north-american]
1,619.0743,[seattle],[],[us-recipes],1,"[tennessee, washington]",[north-american]
2,72.87125,[],[],[us-recipes],1,"[washington, wisconsin]",[north-american]


In [None]:
for i, do

In [182]:
df['continent'].apply(len).value_counts()

df[df['continent'].apply(len) == 2]

Unnamed: 0,avgCalories,city,continent,country,nbRecipes,region,subcontinent
129,118.5825,[],"[asian, european]","[korean, russian]",1,[],[eastern-european]
229,443.6317,[],"[asian, european]",[],1,[],[]
249,177.8172,[],"[european, asian]",[italian],1,[],[]
343,553.4908,[],"[asian, european]",[thai],1,[],[]
371,467.3918,[],"[african, european]","[moroccan, australian-and-new-zealander]",1,[],[north-african]
381,212.681,[],"[european, asian]",[russian],2,[],[eastern-european]
394,95.19547,[],"[european, asian]","[greek, indian]",1,[],[]
428,456.912183,[],"[asian, european]","[indian, italian]",6,[],[]
465,395.5239,[],"[african, european]","[south-african, english]",1,[],[]
522,554.46635,[],"[asian, european]",[french],2,[],[]


In [154]:
df = df.set_index(['continent', 'subcontinent', 'country'])
df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,avgCalories,city,nbRecipes,region
continent,subcontinent,country,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,north-american,us-recipes,269.27,new-york-city,1,"mississippi, new-york"
,north-american,us-recipes,265.9317,,1,"oregon, wyoming"
,north-american,us-recipes,228.9727,,1,"vermont, west-virginia"


In [174]:
df.groupby(level=[0,1])[['nbRecipes']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,nbRecipes
continent,subcontinent,Unnamed: 2_level_1
,,326
,latin-american,2241
,"latin-american, north-american",278
,"latin-american, south-american",206
,"latin-american, south-american, north-american",7
,middle-eastern,269
,"middle-eastern, north-american",7
,"middle-eastern, persian",18
,north-american,4769
african,,61


In [25]:
# Find an element by ID
oneRecipe = collection.find_one({'recipeID':47564})
oneRecipe

{'_id': ObjectId('5825054ace06e50446084706'),
 'adUnit': {'adKeys': [7,
   79,
   95,
   125,
   148,
   150,
   169,
   173,
   184,
   201,
   221,
   235,
   241,
   242,
   249,
   254,
   265,
   608],
  'adZone': 'recipes',
  'contentProviderId': 451,
  'networkCode': '3865',
  'site': 'ar.ios.apps.allrecipes.recipes'},
 'cookMinutes': 45,
 'description': 'This is a really easy and tasty Indian dish that is sure to stir up your taste buds. Delicious baingan bharta is ready to eat with pita bread, Indian naan, or rice.',
 'directions': [{'displayValue': 'Preheat oven to 450 degrees F (230 degrees C).',
   'ordinal': 1,
   'videoTimestamp': 0},
  {'displayValue': 'Place eggplant on a medium baking sheet. Bake 20 to 30 minutes in the preheated oven, until tender. Remove from heat, cool, peel, and chop.',
   'ordinal': 2,
   'videoTimestamp': 0},
  {'displayValue': 'Heat oil in a medium saucepan over medium heat. Mix in cumin seeds and onion. Cook and stir until onion is tender.',
  

In [58]:
#started from http://stackoverflow.com/a/23282291

from bson.code import Code

# only emit the key of the field, no values
mapper = Code("""
    function() {
    
        function isObject(val) {
            if (val === null) { return false;}
            return ( (typeof val === 'function') || (typeof val === 'object') );
        }
        function getSubkeys(obj) {
            var subKeys = [];
            for (subKey in obj) {
                if (isObject(obj[subKey])) {
                    subKeys.push(subKey);
                }
                
            }
            return subKeys;
        }
        for (var key in this) { 
            subkeys = getSubkeys(this[key])
            emit(key, subkeys); 
        }
    }
""")

# keep the key, do nothing
reducer = Code("""
    function(key, values) {
        //return values.toString() + "END";
        /*
        function onlyUnique(value, index, self) { 
            return self.indexOf(value) === index;
        }
        return values.filter( onlyUnique ).toString();
        */
        return Array.from(new Set(values)).toString();
        //return new Set(values);
        
        
    }
    
    
""")

distinctThingFields = collection.map_reduce(mapper, reducer, 
    out = {'inline' : 1}, query = {'recipeID': { '$in': [47564, 98310]} }, full_response = True) #
distinctThingFields

{'counts': {'emit': 50, 'input': 2, 'output': 25, 'reduce': 25},
 'ok': 1.0,
 'results': [{'_id': '_id',
   'value': 'tojson,valueOf,getTimestamp,equals,tojson,valueOf,getTimestamp,equals'},
  {'_id': 'adUnit', 'value': 'adKeys,adKeys'},
  {'_id': 'cookMinutes', 'value': 'toPercentStr,zeroPad,toPercentStr,zeroPad'},
  {'_id': 'description',
   'value': 'ltrim,rtrim,includes,pad,ltrim,rtrim,includes,pad'},
  {'_id': 'directions', 'value': '0,1,2,3,0,1'},
  {'_id': 'footnotes', 'value': '0,1,'},
  {'_id': 'ingredients', 'value': '0,1,2,3,4,5,6,7,8,9,10,0,1,2,3,4,5,6,7,8'},
  {'_id': 'isSponsored', 'value': ','},
  {'_id': 'links', 'value': 'recipeUrl,self,self,recipeUrl'},
  {'_id': 'nutrition',
   'value': 'calories,magnesium,vitaminA,saturatedFat,cholesterol,sugars,carbohydrates,potassium,protein,vitaminC,niacin,sodium,folate,calcium,caloriesFromFat,vitaminB6,iron,fiber,fat,thiamin,magnesium,sodium,thiamin,iron,vitaminB6,calcium,carbohydrates,fiber,fat,sugars,protein,folate,niacin,vita

In [7]:
collection.aggregate( [ 
        { 
            '$project': {
                '_id': 0
            }
        }
    ]
)

OperationFailure: $projection requires at least one output field

In [37]:
oneRecipe['nutrition']['sugars']

{'amount': 7.522098,
 'displayValue': '7.5',
 'hasCompleteData': True,
 'name': 'Sugars',
 'percentDailyValue': '0',
 'unit': 'g'}

In [19]:
distinctThingFields

{'counts': {'emit': 403675, 'input': 16147, 'output': 25, 'reduce': 4050},
 'ok': 1.0,
 'results': [{'_id': '_id',
   'value': ',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,

In [9]:
distinctThingFields.ok == 1.0

{'counts': {'emit': 403675, 'input': 16147, 'output': 25, 'reduce': 4050},
 'ok': 1.0,
 'results': [{'_id': '_id', 'value': None},
  {'_id': 'adUnit', 'value': None},
  {'_id': 'cookMinutes', 'value': None},
  {'_id': 'description', 'value': None},
  {'_id': 'directions', 'value': None},
  {'_id': 'footnotes', 'value': None},
  {'_id': 'ingredients', 'value': None},
  {'_id': 'isSponsored', 'value': None},
  {'_id': 'links', 'value': None},
  {'_id': 'nutrition', 'value': None},
  {'_id': 'photo', 'value': None},
  {'_id': 'prepMinutes', 'value': None},
  {'_id': 'ratingAverage', 'value': None},
  {'_id': 'ratingCount', 'value': None},
  {'_id': 'readyInMinutes', 'value': None},
  {'_id': 'recipeID', 'value': None},
  {'_id': 'reviewCount', 'value': None},
  {'_id': 'servings', 'value': None},
  {'_id': 'similarRecipes', 'value': None},
  {'_id': 'sourceID', 'value': None},
  {'_id': 'submitter', 'value': None},
  {'_id': 'title', 'value': None},
  {'_id': 'topReviews', 'value': None},

Resources:
* [PyMongo Tutorial](https://api.mongodb.com/python/current/tutorial.html)
* [Mongo Documentation](https://docs.mongodb.com/ecosystem/drivers/python/)