In [203]:
from pymongo import MongoClient
import datetime
import numpy as np
import pandas as pd
import getpass
import json

import nltk
from nltk.stem import WordNetLemmatizer

Be sure you are connect on the **EPFL network** (connection at EPFL or via VPN)

Note on authentification:
* Your username-password pair has read-only credentials
* Use the admin user to insert, modify, or create indexes

In [204]:
database = 'ada-project'
user = input('MongoDB name: ')
password = getpass.getpass('MongoDB password: ')

MongoDB name: davidrivollet
MongoDB password: ········


In [205]:
# Mongo Client and authentification
client = MongoClient('www.cocotte-minute.ovh', 27017)
db = client[database]
db.authenticate(user, password)
collection = db['recipes']

## Finding ingredient labels

In [206]:
def tokenize_and_lemmatize(text):
    # Tokenize
    tokens_raw = nltk.regexp_tokenize(text, pattern='\w+')

    # Normalization
    tokens_norm = [t.lower() for t in tokens_raw]

    # Removing the numbers
    tokens_without_digits = [t for t in tokens_norm if t.isdigit() == False]
    
    wnl = WordNetLemmatizer()
    tokens_lemmatized = [wnl.lemmatize(t) for t in tokens_without_digits]
    
    return tokens_lemmatized

In [207]:
def count_occurences(text):
    tokens_ref = tokenize_and_lemmatize(text[0])
    
    df_occurences = pd.DataFrame({'occurence': np.zeros(len(tokens_ref))}, index = tokens_ref)
    df_occurences = df_occurences[~df_occurences.index.duplicated()]
    
    for sentence in text:
        for token in tokenize_and_lemmatize(sentence):
            found_token = False
            
            for token_ref in tokens_ref:
                if(token == token_ref):
                    occ = df_occurences.get_value(token, 'occurence')
                    df_occurences.set_value(token, 'occurence', occ + 1)
                    found_token = True
            
            if(found_token == False):
                tokens_ref.append(token)
                df_occurences.set_value(token, 'occurence', 1)

    return df_occurences

In [208]:
def define_ingredient_label(text):
    df_occurences = count_occurences(text)
    
    max_occ = df_occurences.max(axis=0).values[0]
    if max_occ < len(text):
        name_array = df_occurences[(df_occurences['occurence'] > max_occ - (max_occ/100*10))].index.values
    else:
        name_array = df_occurences[(df_occurences['occurence'] > len(text) - (len(text)/100*10))].index.values
    
    name = name_array[0]
    for word in name_array[1:]:
        name = name + " " + word
    
    return name

In [209]:
ingredients = collection.aggregate([
    {"$unwind": "$ingredients" }, 
    {"$group": 
        {"_id": "$ingredients.ingredientID",
         "doc_count": { "$sum": 1 },
         "doc_name": {"$addToSet": "$ingredients.displayValue"}
        }
    },
    {"$sort": { "doc_count":-1}}
    ])

df_ingredients = pd.DataFrame(columns=['id', 'name', 'occurence'])

i = 0;
for ingredient in ingredients:
    if(ingredient["_id"] != 0):
        i = i + 1;
        name = define_ingredient_label(ingredient["doc_name"])
        df_ingredients.loc[len(df_ingredients)] = (ingredient["_id"], name, ingredient["doc_count"])

        #if i == 10:
        #    break;
    
df_ingredients.drop(df_ingredients[df_ingredients['occurence'] < 10].index, inplace = True)

In [210]:
df_ingredients.head(10)

Unnamed: 0,id,name,occurence
0,16421.0,salt,8730.0
1,4342.0,garlic,5246.0
2,4397.0,onion,4947.0
3,2496.0,water,4583.0
4,1526.0,sugar,4371.0
5,16157.0,butter,4036.0
6,16317.0,egg,3449.0
7,1684.0,all purpose flour,3215.0
8,16406.0,pepper,3194.0
9,6307.0,olive oil,3127.0


In [211]:
len(df_ingredients)

1046

In [212]:
df_ingredients_duplicates = df_ingredients.groupby('name').count()
len(df_ingredients_duplicates)

861

In [213]:
df_ingredients_duplicates = df_ingredients_duplicates[df_ingredients_duplicates['occurence'] > 2]
df_ingredients_duplicates.head(10)

Unnamed: 0_level_0,id,occurence
name,Unnamed: 1_level_1,Unnamed: 2_level_1
cheese,5,5
chicken,12,12
chipotle,3,3
coconut,3,3
corn,4,4
crabmeat,3,3
cucumber,3,3
cup rice,6,6
flour tortilla,3,3
ginger,3,3


In [214]:
len(df_ingredients_duplicates)

37

## Counting number of recipes per country

In [356]:
def initialize_dataFrame(json_name):
    with open(json_name) as data_file:
        agg_data = json.load(data_file)
    
    df_countries = pd.DataFrame(columns=['country name', 'country adj', 'total recipe num'])
    df_countries.index.set_names('country tag')
    
    for country in agg_data["per_country"]:
        df_countries.set_value(country["geo_identifier"], "country name", country["geo_name"].lower())
        df_countries.set_value(country["geo_identifier"], "country adj", country["database_name"])
        df_countries.set_value(country["geo_identifier"], "total recipe num", country["nbRecipes"])
    
    return df_countries.sort_index()

In [357]:
df_countries = initialize_dataFrame('fullAggregatedData.json')

In [358]:
df_countries.head(10)

Unnamed: 0,country name,country adj,total recipe num
ARG,argentina,argentinian,23
AUT,austria,austrian,30
BEL,belgium,belgian,15
BGD,bangladesh,bangladeshi,14
BRA,brazil,brazilian,88
CAN,canada,canadian,1167
CHE,switzerland,swiss,29
CHL,chile,chili,30
CHN,china,chinese,247
COL,colombia,colombian,13


## Making a recipe request by ingredient

In [359]:
def find_ingredient_ids(name):
    found_ingredients = []
    ids = []
    for ingredient in df_ingredients['name']:
        if ingredient.find(name) != -1:
            found_ingredients.append(ingredient)
    
    for value in df_ingredients[df_ingredients['name'].isin(found_ingredients)]["id"].values:
        ids.append(value)
    print(ids)
    return ids

In [360]:
def request_recipes_by_ingredient(ingredient_ids):
    recipes = collection.find(
    {"ingredients":{
            "$elemMatch": {
                "ingredientID": {"$in":ingredient_ids}
            }
        }}, 
    )
    return recipes

In [361]:
def fill_dataFrame_ingredient(df, name):
    ingredient_ids = find_ingredient_ids(name)
    recipes = request_recipes_by_ingredient(ingredient_ids)
    
    df[name + ' recipes %'] = 0
    
    for recipe in recipes:
        for country in recipe["ada-country"]:

            country_index = "Unfound"

            found_row = df[df['country adj'] == country]
            if(len(found_row) != 0):
                country_index = found_row.index.values[0]

            else:
                found_row = df[df['country name'] == country]
                if(len(found_row) != 0):
                    country_index = found_row.index.values[0]

            if(country_index != "Unfound"):
                df.set_value(country_index, 
                             name + ' recipes %', 
                             df.get_value(country_index, name + ' recipes %') + 1)
    
    df[name + ' recipes %'] = df[name + ' recipes %']/df['total recipe num'] *100
    return 

In [362]:
fill_dataFrame_ingredient(df_countries, 'soy sauce')

[2882.0, 2958.0, 10274.0, 10275.0]


In [363]:
fill_dataFrame_ingredient(df_countries, 'chicken')

[6494.0, 16133.0, 20384.0, 13968.0, 6493.0, 5521.0, 6522.0, 5467.0, 5520.0, 16124.0, 6527.0, 6531.0, 5679.0, 21025.0, 6506.0, 6442.0, 10643.0, 6489.0, 6496.0, 13405.0, 6498.0, 5429.0, 6459.0, 6497.0, 5465.0, 6446.0, 6441.0, 6443.0, 21031.0, 6549.0]


In [364]:
fill_dataFrame_ingredient(df_countries, 'beef')

[3103.0, 5460.0, 3868.0, 5516.0, 13974.0, 4133.0, 3910.0, 4086.0, 4004.0, 5515.0, 3886.0, 22104.0, 3138.0, 9358.0, 7120.0, 3140.0, 4040.0, 4074.0, 9445.0]


In [365]:
fill_dataFrame_ingredient(df_countries, 'cheese')

[16238.0, 16215.0, 16234.0, 16223.0, 16225.0, 16243.0, 16231.0, 16246.0, 16218.0, 16241.0, 16244.0, 22106.0, 445.0, 16340.0, 16351.0, 16232.0, 20324.0, 16248.0, 443.0, 16210.0, 16229.0, 19360.0, 578.0, 19356.0, 454.0, 20611.0, 14388.0, 19362.0, 21130.0, 16212.0, 16217.0, 20474.0, 16360.0, 16242.0, 16366.0, 16226.0, 16236.0, 8270.0]


In [366]:
fill_dataFrame_ingredient(df_countries, 'egg')

[16317.0, 16319.0, 16318.0, 4339.0, 16323.0, 1709.0]


In [369]:
fill_dataFrame_ingredient(df_countries, 'chocolate')

[1338.0, 18521.0, 1336.0, 18526.0, 8610.0, 3507.0, 1429.0, 20199.0, 1373.0]


In [374]:
df_ingredients.loc[150:200]

Unnamed: 0,id,name,occurence
150,4252.0,broccoli,184.0
151,3789.0,coconut,182.0
152,10536.0,cooking spray,180.0
153,22005.0,shrimp,180.0
154,16423.0,thyme fresh,179.0
155,4339.0,eggplant,178.0
156,20303.0,cinnamon stick,177.0
157,4577.0,ounce can diced tomato green,177.0
158,18794.0,whole clove,176.0
159,18741.0,salt,176.0


In [355]:
df_countries.sort_index()

Unnamed: 0,country name,country adj,total recipe num,soy sauce recipes %,chicken recipes %,beef recipes %,cheese recipes %,egg recipes %,porc recipes %
ARG,argentina,argentinian,23,0.0,4.34783,4.34783,4.34783,39.1304,0
AUT,austria,austrian,30,0.0,0.0,3.33333,3.33333,70.0,0
BEL,belgium,belgian,15,0.0,13.3333,6.66667,13.3333,40.0,0
BGD,bangladesh,bangladeshi,14,0.0,7.14286,21.4286,0.0,14.2857,0
BRA,brazil,brazilian,88,2.27273,12.5,4.54545,18.1818,32.9545,0
CAN,canada,canadian,1167,6.34105,11.9966,5.74122,21.0797,29.3916,0
CHE,switzerland,swiss,29,0.0,3.44828,17.2414,37.931,34.4828,0
CHL,chile,chili,30,0.0,13.3333,43.3333,16.6667,0.0,0
CHN,china,chinese,247,65.1822,41.7004,2.02429,1.21457,31.5789,0
COL,colombia,colombian,13,0.0,46.1538,23.0769,15.3846,7.69231,0
