In [1]:
from pymongo import MongoClient
import datetime
import numpy as np
import pandas as pd
import getpass
import json

import nltk
from nltk.stem import WordNetLemmatizer

Make sure you are connected to the **EPFL network** (on campus or via VPN).

Note on authentication:
* Your username-password pair has read-only credentials
* Use the admin user to insert, modify, or create indexes

In [4]:
# Name of the MongoDB database; the connection cell below opens it with `client[database]`.
database = 'ada-project'
# Credentials are prompted for interactively instead of being hard-coded in the notebook source.
user = input('MongoDB name: ')
# getpass does not echo the typed password.
password = getpass.getpass('MongoDB password: ')

MongoDB name: davidrivollet
MongoDB password: ········


In [5]:
# Mongo client and authentication.
# The credentials are handed to MongoClient directly: `Database.authenticate`
# was deprecated in PyMongo 3.x and removed in PyMongo 4.0.
client = MongoClient(
    'www.cocotte-minute.ovh',
    27017,
    username=user,
    password=password,
    authSource=database,  # authenticate against the same database as before
)
db = client[database]
# MongoClient connects lazily; issue a cheap command now so that wrong
# credentials or an unreachable server fail in this cell rather than later.
db.command('ping')
collection = db['recipes']

# Construct ingredient DataFrame

## Find ingredient labels

In [6]:
def tokenize_and_lemmatize(text):
    """Split `text` into lower-cased, lemmatized word tokens without pure numbers.

    Parameters
    ----------
    text : str
        Free text, e.g. the display value of one ingredient.

    Returns
    -------
    list of str
        The tokens in their order of appearance: matched with the regular
        expression ``\\w+``, lower-cased, stripped of tokens made only of
        digits, then passed through ``WordNetLemmatizer.lemmatize``.
    """
    # Raw string: '\w' inside a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on recent Python versions).
    tokens_raw = nltk.regexp_tokenize(text, pattern=r'\w+')

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token)
        for token in (raw.lower() for raw in tokens_raw)
        if not token.isdigit()  # drop quantities such as "2" or "250"
    ]

In [7]:
def count_occurences(text, tokenizer=None):
    """Count how many times each token appears across a collection of sentences.

    Compared with the previous implementation this version
    * counts a token repeated inside the first sentence once per appearance
      (it used to be counted once per appearance *times* its number of
      repetitions in the first sentence),
    * returns an empty frame for an empty `text` instead of raising IndexError,
    * no longer relies on ``DataFrame.get_value`` / ``set_value`` (removed in
      pandas 1.0) and does a single pass with a dictionary.

    Parameters
    ----------
    text : sequence of str
        The sentences to analyse.
    tokenizer : callable, optional
        Function turning one sentence into a list of tokens. Defaults to
        `tokenize_and_lemmatize`.

    Returns
    -------
    pandas.DataFrame
        One row per distinct token (the index, in order of first appearance)
        with a single float column ``'occurence'`` holding the count.
    """
    if tokenizer is None:
        tokenizer = tokenize_and_lemmatize

    # Plain dicts keep insertion order, which preserves the
    # "order of first appearance" of the tokens in the resulting index.
    counts = {}
    for sentence in text:
        for token in tokenizer(sentence):
            counts[token] = counts.get(token, 0) + 1

    return pd.DataFrame(
        {'occurence': list(counts.values())},
        index=list(counts.keys()),
        dtype=float,  # same dtype as the former np.zeros-initialised column
    )

In [8]:
def define_ingredient_label(text):
    """Build a label for an ingredient from the most frequent tokens of its names.

    Parameters
    ----------
    text : sequence of str
        The different display strings recorded for one ingredient.

    Returns
    -------
    str
        The tokens whose count exceeds 90% of a reference value, joined with
        single spaces in their order of first appearance. The reference is the
        highest token count, capped at ``len(text)``. An empty string is
        returned when there is nothing to label (empty `text` or no token at
        all) instead of raising IndexError.
    """
    n_sentences = len(text)
    if n_sentences == 0:
        return ""

    df_occurences = count_occurences(text)
    if df_occurences.empty:
        return ""

    max_occ = df_occurences['occurence'].max()
    # A token can appear more than once per sentence, so the maximum may
    # exceed the number of sentences; the reference is capped accordingly.
    reference = max_occ if max_occ < n_sentences else n_sentences
    # Same arithmetic as before: keep tokens within 10% of the reference.
    threshold = reference - (reference / 100 * 10)

    frequent_tokens = df_occurences[df_occurences['occurence'] > threshold].index
    return " ".join(frequent_tokens)

In [9]:
# One result document per ingredient ID: the number of unwound ingredient
# entries that reference it and the set of distinct display strings used for
# it, most referenced first.
ingredients = collection.aggregate([
    {"$unwind": "$ingredients" }, 
    {"$group": 
        {"_id": "$ingredients.ingredientID",
         "doc_count": { "$sum": 1 },
         "doc_name": {"$addToSet": "$ingredients.displayValue"}
        }
    },
    {"$sort": { "doc_count":-1}}
    ])

# Ingredients referenced fewer times than this are discarded.
MIN_OCCURENCES = 10

df_ingredients = pd.DataFrame(columns=['id', 'name', 'occurence'])

# NOTE(review): rows are still appended one at a time. Building the frame in
# one go would be faster but changes the resulting column dtypes (the ids are
# displayed as floats below), which later cells may rely on; confirm before
# optimising.
for ingredient in ingredients:
    # ID 0 is skipped. NOTE(review): presumably a placeholder for entries
    # without a recognised ingredient; confirm.
    if ingredient["_id"] != 0:
        name = define_ingredient_label(ingredient["doc_name"])
        df_ingredients.loc[len(df_ingredients)] = (ingredient["_id"], name, ingredient["doc_count"])

# Rebind instead of dropping in place; the unused counter `i` was removed.
rare_rows = df_ingredients[df_ingredients['occurence'] < MIN_OCCURENCES]
df_ingredients = df_ingredients.drop(rare_rows.index)

In [10]:
df_ingredients.head(10)

Unnamed: 0,id,name,occurence
0,16421.0,salt,8730.0
1,4342.0,garlic,5246.0
2,4397.0,onion,4947.0
3,2496.0,water,4583.0
4,1526.0,sugar,4371.0
5,16157.0,butter,4036.0
6,16317.0,egg,3449.0
7,1684.0,all purpose flour,3215.0
8,16406.0,pepper,3194.0
9,6307.0,olive oil,3127.0


In [11]:
len(df_ingredients)

1046

In [12]:
df_ingredients_duplicates = df_ingredients.groupby('name').count()
len(df_ingredients_duplicates)

861

In [13]:
# Keep only the names shared by more than two ingredient rows (three or more);
# names that appear exactly twice are duplicates too but are filtered out here.
df_ingredients_duplicates = df_ingredients_duplicates[df_ingredients_duplicates['occurence'] > 2]
df_ingredients_duplicates.head(10)

Unnamed: 0_level_0,id,occurence
name,Unnamed: 1_level_1,Unnamed: 2_level_1
cheese,5,5
chicken,12,12
chipotle,3,3
coconut,3,3
corn,4,4
crabmeat,3,3
cucumber,3,3
cup rice,6,6
flour tortilla,3,3
ginger,3,3


In [14]:
len(df_ingredients_duplicates)

37

## Find ingredient type

In [15]:
# Default category: the literal string 'None' (not the None object). Rows that
# still hold it after the category cells below are counted as untyped.
df_ingredients['type'] = 'None'

In [16]:
def find_ingredient_ids(df, name):
    """Return the ids of every ingredient whose name contains `name`.

    The match is a plain, case-sensitive substring test (no regular
    expression), so e.g. ``'oil'`` matches ``'olive oil'`` but a short keyword
    also matches any longer word that contains it.

    Parameters
    ----------
    df : pandas.DataFrame
        Ingredient table with at least the columns ``'name'`` and ``'id'``.
    name : str
        Substring to look for in the ``'name'`` column.

    Returns
    -------
    list
        The matching ``'id'`` values in row order, as native Python scalars
        (``Series.tolist``). Numpy integer scalars, which the previous loop
        over ``.values`` produced, cannot be encoded by BSON when the list is
        used in a MongoDB query.
    """
    # regex=False keeps the literal semantics of str.find; na=False treats a
    # missing name as "no match" instead of failing.
    matches = df['name'].str.contains(name, regex=False, na=False)
    return df.loc[matches, 'id'].tolist()

In [17]:
def fill_ingredient_type(df, type_name, ingredient_names):
    """Set the 'type' of every ingredient matching one of `ingredient_names`.

    `df` is modified in place: each row whose 'id' is returned by
    `find_ingredient_ids` for at least one of the keywords gets `type_name`
    in its 'type' column, replacing whatever type it had before.
    Returns None.
    """
    matched_ids = [
        ingredient_id
        for keyword in ingredient_names
        for ingredient_id in find_ingredient_ids(df, keyword)
    ]
    is_matched = df['id'].isin(matched_ids)
    df.loc[is_matched, 'type'] = type_name

#### Meat

In [18]:
# Keywords identifying meat. They are matched as substrings of the ingredient
# name (find_ingredient_ids uses str.find), so short keywords such as 'ham',
# 'dog' or 'lard' also match any longer word that contains them.
meat_names = ['chicken', 'turkey', 'beef', 'pork', 'veal', 'lamb', 'bacon', 'sausage', 'ham',
             'prosciutto', 'steak', 'dog', 'chorizo', 'lard', 'salami', 'pancetta']
fill_ingredient_type(df_ingredients, 'meat', meat_names)

#### Fish

In [19]:
# Keywords identifying fish and seafood (substring match on the ingredient name).
# NOTE(review): 'sea' also matches names such as 'seasoning', and 'fish' alone
# already covers 'swordfish'. Rows typed here can be re-typed by later cells,
# because fill_ingredient_type overwrites the previous type.
fish_names = ['fish', 'cod', 'salmon', 'swordfish', 'shrimp', 'sea', 'crabmeat', 'anchovy', 
              'clam', 'tuna', 'prawn']
fill_ingredient_type(df_ingredients, 'fish and seafood', fish_names)

#### Egg

In [20]:
# Single keyword; being matched as a substring, it would also match a name
# such as 'eggplant'.
egg_names = ['egg']
fill_ingredient_type(df_ingredients, 'egg', egg_names)

#### Dairy

In [21]:
# Keywords identifying dairy products (substring match on the ingredient name).
# This cell runs after the meat, fish and egg cells and fill_ingredient_type
# overwrites any previous type, so a name matching several categories keeps
# the last one applied.
dairy_names = ['milk', 'cream', 'cheese', 'yogurt', 'butter', 'margarine', 'mozzarella', 'queso']
fill_ingredient_type(df_ingredients, 'dairy', dairy_names)

#### Vegetables

In [22]:
# Keywords identifying vegetables (substring match on the ingredient name).
# NOTE(review): 'pea' is a substring of e.g. 'peanut' and 'pear', and 'pepper'
# covers both the vegetable and the ground spice (the plain 'pepper' row is
# shown as 'vegetables' in the preview further down); confirm this is intended.
vegetables_names = ['garlic', 'onion', 'carrot', 'tomato', 'celery', 'mushroom', 'zucchini',
                    'olive', 'cabbage', 'spinach', 'cucumber', 'lettuce', 'broccoli', 'vegetable',
                    'pepper', 'pea', 'caper', 'asparagus', 'cauliflower', 'chipotle', 'chestnut',
                    'artichoke', 'shallot', 'leek', 'pickle', 'chile', 'choy', 'kale', 'pumpkin',
                    'squash', 'radish', 'salad']
fill_ingredient_type(df_ingredients, 'vegetables', vegetables_names)

#### Fruit

In [23]:
# Keywords identifying fruit (substring match on the ingredient name).
# NOTE(review): 'sherry' and 'guacamole' are not fruits as such; presumably
# listed here on purpose, confirm. Short keywords such as 'fig' and 'date'
# match any name that contains them.
fruit_names = ['lemon', 'lime', 'avocado', 'raisin', 'orange', 'apple', 'banana', 'mango',
               'coconut', 'sherry', 'strawberry', 'pineapple', 'blueberry', 'papaya', 'fig',
               'cherry', 'date', 'cranberry', 'raspberry', 'prune', 'apricot', 'blackberry',
               'guacamole']
fill_ingredient_type(df_ingredients, 'fruit', fruit_names)

#### Starchy

In [24]:
# Keywords for the 'starchy' group, which here also lists nuts, seeds, beans,
# 'tofu' and 'paste'.
# NOTE(review): with substring matching, 'nut' also hits 'coconut' (a fruit
# keyword in the previous cell, so such rows are re-typed 'starchy' here) and
# 'nutmeg', and 'oat' hits 'goat'; confirm the resulting types.
starchy_names = ['potato', 'rice', 'bread', 'cornstarch', 'corn', 'walnut', 'almond', 'bean',
                 'flour', 'sesame', 'pecan', 'polenta', 'noodle', 'lentil', 'spaghetti', 'pasta',
                 'macaroni', 'tortilla', 'tofu', 'linguine', 'oat', 'paste', 'seed', 'cashew',
                 'ravioli', 'gnocchi', 'crust', 'nut', 'baguette', 'quinoa', 'couscous']
fill_ingredient_type(df_ingredients, 'starchy', starchy_names)

#### Condiment

In [25]:
# Keywords identifying condiments (substring match on the ingredient name);
# 'soy sauce' is therefore already covered by 'sauce'.
# NOTE(review): broad keywords such as 'oil', 'mix' and 'sauce' match many
# names and override the types set by the earlier cells; verify the result.
condiment_names = ['salt', 'oil', 'soy sauce', 'vinegar', 'mayonnaise', 'ketchup', 'mustard',
                   'sauce', 'salsa', 'seasoning', 'mix']
fill_ingredient_type(df_ingredients, 'condiment', condiment_names)

#### Spices

In [26]:
# Keywords identifying spices (substring match on the ingredient name).
# The type label is spelled 'spicies'; the same spelling is used again later
# in the notebook when the per-country column is built, so both must stay in
# sync. 'powder' matches every name that contains it.
spicies_names = ['curry', 'cinnamon', 'cumin', 'ginger', 'vanilla', 'paprika', 'powder', 'nutmeg',
                'cocoa', 'allspice', 'masala', 'clove', 'cardamom', 'turmeric', 'saffron']
fill_ingredient_type(df_ingredients, 'spicies', spicies_names)

#### Herbs

In [27]:
# Keywords identifying herbs (substring match on the ingredient name).
# NOTE(review): 'sage' is a substring of 'sausage' (a meat keyword above), so
# any name containing 'sausage' is re-typed 'herbs' by this cell; 'leaf' and
# 'mint' are similarly broad. Confirm or tighten the matching.
herbs_names = ['cilantro', 'basil','parsley', 'oregano', 'mint', 'thyme', 'coriander', 'rosemary',
              'leaf', 'chive', 'sage']
fill_ingredient_type(df_ingredients, 'herbs', herbs_names)

#### Sugared Ingredients

In [28]:
# Keywords identifying sweet ingredients (substring match on the ingredient
# name). Applied after the previous categories, so it overrides them whenever
# a name matches both.
sugared_names = ['sugar', 'honey', 'maple syrup', 'chocolate']
fill_ingredient_type(df_ingredients, 'sugared', sugared_names)

#### Alcohol

In [29]:
# Keywords identifying alcohol (substring match on the ingredient name). This
# is the last category applied, so it wins every conflict with earlier ones.
# NOTE(review): 'rum' is a substring of e.g. 'drumstick' and 'wine' of
# 'wine vinegar'; confirm these names are not mis-typed as alcohol.
alcohol_names = ['wine', 'liqueur', 'pisco', 'beer', 'tequila', 'rum', 'brandy', 'triple sec',
                 'vodka', 'sake', 'whiskey', 'bourbon']
fill_ingredient_type(df_ingredients, 'alcohol', alcohol_names)

In [30]:
len(df_ingredients[df_ingredients['type'] == 'None'])

199

In [31]:
len(df_ingredients)

1046

In [32]:
df_ingredients.head(10)

Unnamed: 0,id,name,occurence,type
0,16421.0,salt,8730.0,condiment
1,4342.0,garlic,5246.0,vegetables
2,4397.0,onion,4947.0,vegetables
3,2496.0,water,4583.0,
4,1526.0,sugar,4371.0,sugared
5,16157.0,butter,4036.0,dairy
6,16317.0,egg,3449.0,egg
7,1684.0,all purpose flour,3215.0,starchy
8,16406.0,pepper,3194.0,vegetables
9,6307.0,olive oil,3127.0,condiment


# Counting number of recipes per country

In [33]:
def initialize_dataFrame_countries(json_name):
    """Load the per-country recipe counts from an aggregated JSON file.

    Parameters
    ----------
    json_name : str
        Path to a JSON file whose top-level ``"per_country"`` entry is a list
        of objects with the keys ``geo_identifier``, ``geo_name``,
        ``database_name`` and ``nbRecipes``.

    Returns
    -------
    pandas.DataFrame
        One row per entry with those four columns, sorted by
        ``geo_identifier`` in ascending order. The index labels are the
        positions of the entries in the file.

    Raises
    ------
    KeyError
        If ``"per_country"`` or one of the four keys is missing.
    """
    columns = ['geo_identifier', 'geo_name', 'database_name', 'nbRecipes']

    with open(json_name) as data_file:
        agg_data = json.load(data_file)

    # Build the frame in one go: appending rows one at a time with .loc is
    # quadratic and lets pandas coerce the column dtypes along the way.
    rows = [[country[column] for column in columns]
            for country in agg_data["per_country"]]
    df_countries = pd.DataFrame(rows, columns=columns)

    return df_countries.sort_values(by='geo_identifier', ascending=True)

In [34]:
df_countries = initialize_dataFrame_countries('fullAggregatedData.json')

# The 'chili' entry is excluded. NOTE(review): presumably because it is not a
# country; confirm. `.copy()` makes the filtered frame independent so the
# assignments below do not write to a slice of the original.
df_countries = df_countries[df_countries['database_name'] != 'chili'].copy()

# An entry whose geo_identifier is the literal 'invalid' is relabelled as
# Israel; only the first such row is touched, as before.
invalid_rows = df_countries[df_countries["geo_identifier"] == 'invalid']
if len(invalid_rows) != 0:
    ISR_index = invalid_rows.index.values[0]
    # `.at` replaces DataFrame.set_value, which was removed in pandas 1.0.
    df_countries.at[ISR_index, 'geo_identifier'] = 'ISR'
    df_countries.at[ISR_index, 'geo_name'] = 'Israel'
    # The identifier changed, so re-sort; the result is now kept (it used to
    # be computed and discarded).
    df_countries = df_countries.sort_values(by='geo_identifier', ascending=True)

df_countries.head(10)

Unnamed: 0,geo_identifier,geo_name,database_name,nbRecipes
45,ARG,Argentina,argentinian,23.0
10,AUT,Austria,austrian,30.0
24,BEL,Belgium,belgian,15.0
22,BGD,Bangladesh,bangladeshi,14.0
0,BRA,Brazil,brazilian,88.0
31,CAN,Canada,canadian,1167.0
8,CHE,Switzerland,swiss,29.0
9,CHL,Chile,chilean,32.0
15,CHN,China,chinese,247.0
47,COL,Colombia,colombian,13.0


# Fill Countries DataFrames

In [36]:
def request_recipes_by_ingredient(ingredient_ids):
    """Query the recipes that contain at least one of the given ingredients.

    A recipe matches when one element of its "ingredients" array has an
    "ingredientID" listed in `ingredient_ids`. Returns whatever
    ``collection.find`` returns for that filter.
    """
    ingredient_filter = {"ingredientID": {"$in": ingredient_ids}}
    query = {"ingredients": {"$elemMatch": ingredient_filter}}
    return collection.find(query)

In [37]:
def fill_dataFrame_countries_ingredient_by_ids(df, ingredient_ids, column_label):
    """Add to `df` the ratio of recipes using the given ingredients, per country.

    The recipes returned by `request_recipes_by_ingredient(ingredient_ids)`
    are counted once for every entry of their "ada-country" field that
    matches a row of `df`, and the count is divided by the row's 'nbRecipes'.

    Parameters
    ----------
    df : pandas.DataFrame
        Country table with the columns 'database_name', 'geo_name' and
        'nbRecipes'. It is modified in place: column `column_label` is
        created or overwritten.
    ingredient_ids : list
        Ingredient ids forwarded to the MongoDB query.
    column_label : str
        Name of the column receiving the ratios.

    Returns
    -------
    None

    Notes
    -----
    A country label is looked up in 'database_name' first and in 'geo_name'
    only if that fails; the first matching row is used. Labels matching no
    row are ignored.
    """
    recipes = request_recipes_by_ingredient(ingredient_ids)

    # Resolve every label to its row once, instead of filtering the whole
    # frame for each country of each recipe. setdefault keeps the first row
    # per label and gives 'database_name' precedence over 'geo_name'.
    row_by_country = {}
    for key_column in ('database_name', 'geo_name'):
        for row_index, label in zip(df.index, df[key_column]):
            row_by_country.setdefault(label, row_index)

    counts = dict.fromkeys(df.index, 0)
    for recipe in recipes:
        # NOTE(review): "ada-country" is assumed to be an iterable of country
        # labels present on every recipe; confirm against the collection.
        for country in recipe["ada-country"]:
            # None (not a string sentinel) marks "no such country", so an
            # index label of 0 is handled correctly.
            row_index = row_by_country.get(country)
            if row_index is not None:
                counts[row_index] += 1

    # Plain assignment replaces DataFrame.get_value / set_value, which were
    # removed in pandas 1.0.
    df[column_label] = [counts[row_index] for row_index in df.index]
    df[column_label] = df[column_label] / df['nbRecipes']
    return

## Countries and food types

In [38]:
def find_type_ids(df, type_name):
    """Return, as a list, the 'id' of every row of `df` whose 'type' equals `type_name`."""
    has_type = df['type'] == type_name
    return df.loc[has_type, 'id'].tolist()

In [39]:
def fill_dataFrame_countries_ingredient_type(df, df_ingredients, type_name):
    """Add to `df` a column named `type_name` for one ingredient type.

    The ids of the ingredients of that type are looked up in `df_ingredients`,
    printed, and, unless there are none, passed to
    `fill_dataFrame_countries_ingredient_by_ids` together with `type_name` as
    the column label. Returns None.
    """
    ids_of_type = find_type_ids(df_ingredients, type_name)
    print(ids_of_type)
    if not ids_of_type:
        # Nothing of this type: leave `df` untouched.
        return
    fill_dataFrame_countries_ingredient_by_ids(df, ids_of_type, type_name)

In [40]:
df_countries_and_types = df_countries.copy()

In [41]:
fill_dataFrame_countries_ingredient_type(df_countries_and_types, df_ingredients, 'meat')

[6494.0, 16133.0, 3103.0, 5375.0, 5460.0, 20384.0, 13968.0, 4176.0, 6493.0, 5521.0, 5267.0, 3868.0, 6522.0, 5444.0, 5516.0, 4183.0, 5520.0, 5320.0, 5278.0, 16124.0, 13974.0, 3922.0, 4133.0, 20572.0, 6527.0, 4175.0, 6531.0, 2180.0, 3910.0, 5821.0, 7184.0, 6281.0, 1944.0, 21271.0, 5679.0, 5831.0, 5869.0, 4086.0, 21025.0, 6598.0, 6506.0, 5346.0, 5376.0, 6442.0, 20671.0, 10643.0, 3024.0, 6489.0, 6496.0, 4004.0, 4306.0, 20881.0, 5515.0, 3886.0, 13405.0, 5829.0, 4181.0, 5824.0, 6498.0, 5429.0, 5338.0, 3358.0, 3000.0, 4118.0, 22104.0, 3138.0, 18907.0, 5306.0, 5324.0, 6595.0, 2980.0, 6459.0, 6497.0, 5465.0, 26706.0, 3093.0, 2976.0, 20676.0, 9358.0, 6446.0, 2988.0, 7120.0, 5263.0, 6441.0, 4040.0, 20965.0, 6443.0, 9289.0, 1866.0, 21965.0, 4074.0, 20944.0, 21031.0, 20448.0, 9445.0, 6549.0]


In [42]:
fill_dataFrame_countries_ingredient_type(df_countries_and_types, df_ingredients, 'fish and seafood')

[22005.0, 2664.0, 2750.0, 2656.0, 2533.0, 2520.0, 2666.0, 22004.0, 2672.0, 2698.0, 2748.0, 2675.0, 2740.0, 2676.0, 2658.0, 2642.0, 8746.0, 2756.0, 20888.0, 20771.0, 2595.0, 22067.0, 2605.0, 22002.0, 22006.0, 7040.0, 2667.0, 2609.0, 22019.0, 22020.0, 22010.0, 2536.0, 2628.0, 2603.0]


In [43]:
fill_dataFrame_countries_ingredient_type(df_countries_and_types, df_ingredients, 'egg')

[16317.0, 16319.0, 16318.0, 4339.0, 16323.0]


In [44]:
fill_dataFrame_countries_ingredient_type(df_countries_and_types, df_ingredients, 'dairy')

[16157.0, 16278.0, 16238.0, 16258.0, 16261.0, 16215.0, 16234.0, 16223.0, 6311.0, 16225.0, 16243.0, 16158.0, 16287.0, 16231.0, 16347.0, 16276.0, 16294.0, 16246.0, 16218.0, 16241.0, 5467.0, 16244.0, 22106.0, 445.0, 16209.0, 16340.0, 2360.0, 16232.0, 20324.0, 16248.0, 443.0, 21796.0, 16210.0, 16229.0, 19360.0, 16260.0, 19361.0, 578.0, 19356.0, 16284.0, 454.0, 8287.0, 20611.0, 14388.0, 19362.0, 16256.0, 21130.0, 16257.0, 16212.0, 16217.0, 16259.0, 16312.0, 20474.0, 257.0, 16311.0, 16360.0, 6328.0, 2879.0, 16242.0, 16366.0, 16289.0, 16226.0, 16236.0, 16282.0, 8270.0]


In [45]:
fill_dataFrame_countries_ingredient_type(df_countries_and_types, df_ingredients, 'vegetables')

[4342.0, 4397.0, 16406.0, 4405.0, 4572.0, 4432.0, 16407.0, 4279.0, 4292.0, 4786.0, 20244.0, 4378.0, 3725.0, 20269.0, 4529.0, 10498.0, 5133.0, 4267.0, 4516.0, 20453.0, 4335.0, 4657.0, 4663.0, 4574.0, 16408.0, 4372.0, 20415.0, 4252.0, 4577.0, 4421.0, 23274.0, 4664.0, 20270.0, 10895.0, 3698.0, 4520.0, 16427.0, 4380.0, 3723.0, 4575.0, 3709.0, 5489.0, 4366.0, 4784.0, 4207.0, 12341.0, 4286.0, 4371.0, 20476.0, 7822.0, 4336.0, 5855.0, 4634.0, 5159.0, 3702.0, 4273.0, 3641.0, 126.0, 4381.0, 4399.0, 3687.0, 20447.0, 4357.0, 16428.0, 10214.0, 4412.0, 4522.0, 5587.0, 12338.0, 5174.0, 4275.0, 20496.0, 18699.0, 20833.0, 4603.0, 4578.0, 10896.0, 10104.0, 20595.0, 4808.0, 2825.0, 4638.0, 4419.0, 7834.0, 4490.0, 10107.0, 7263.0, 4496.0, 9797.0, 13963.0, 4571.0, 12390.0, 3724.0, 10502.0, 3707.0, 4269.0, 12339.0, 4416.0, 22439.0, 27343.0, 4383.0, 20240.0, 4370.0, 3669.0, 26772.0, 4497.0, 5161.0, 23337.0, 7835.0, 4426.0, 4254.0, 2845.0, 20704.0, 12306.0, 20752.0, 5462.0, 2823.0, 20738.0, 16411.0, 20590.0, 

In [46]:
fill_dataFrame_countries_ingredient_type(df_countries_and_types, df_ingredients, 'fruit')

[5107.0, 5112.0, 5106.0, 5111.0, 5012.0, 5219.0, 5140.0, 5110.0, 4978.0, 5145.0, 15720.0, 5121.0, 5015.0, 5233.0, 5135.0, 5195.0, 3718.0, 5218.0, 5020.0, 5188.0, 5190.0, 18845.0, 20455.0, 5051.0, 11039.0, 10987.0, 5044.0, 11137.0, 5222.0, 1593.0, 20355.0, 22117.0, 5212.0, 5007.0, 5017.0, 4991.0, 11188.0, 3494.0, 7441.0, 11225.0, 8127.0, 18427.0, 5143.0, 3566.0, 1506.0, 20209.0, 20516.0, 20227.0, 5148.0, 5036.0, 11185.0, 3562.0, 5235.0, 3722.0, 11083.0, 3539.0, 5058.0, 18416.0, 20832.0, 3563.0, 9858.0, 5154.0, 1512.0, 5053.0]


In [47]:
fill_dataFrame_countries_ingredient_type(df_countries_and_types, df_ingredients, 'starchy')

[1684.0, 4442.0, 1636.0, 3640.0, 1650.0, 3819.0, 3795.0, 3752.0, 2779.0, 3810.0, 3767.0, 2351.0, 4314.0, 2819.0, 3789.0, 2377.0, 2858.0, 2795.0, 2104.0, 1632.0, 1686.0, 1718.0, 4311.0, 23383.0, 21785.0, 19113.0, 4237.0, 4556.0, 20316.0, 4317.0, 3814.0, 1683.0, 20468.0, 17660.0, 4607.0, 20194.0, 6057.0, 2829.0, 1709.0, 2863.0, 1320.0, 10863.0, 1651.0, 1685.0, 20726.0, 16381.0, 1533.0, 16394.0, 2073.0, 1701.0, 2940.0, 3853.0, 1728.0, 2850.0, 2805.0, 3788.0, 20287.0, 2960.0, 17673.0, 14118.0, 4537.0, 1767.0, 22082.0, 16378.0, 16351.0, 2352.0, 16383.0, 10866.0, 1687.0, 20414.0, 4315.0, 2084.0, 19258.0, 17500.0, 10041.0, 1644.0, 1648.0, 20467.0, 16409.0, 2077.0, 2812.0, 1665.0, 3797.0, 17617.0, 17677.0, 17625.0, 20486.0, 20318.0, 1631.0, 2857.0, 3758.0, 2959.0, 1637.0, 2803.0, 1658.0, 3787.0, 6818.0, 20315.0, 1654.0, 21039.0, 10429.0, 3804.0, 2769.0, 3768.0, 20518.0, 10046.0, 3792.0, 1277.0, 21010.0, 4649.0, 19347.0, 20466.0, 4511.0, 1656.0, 20522.0, 1660.0, 2793.0, 6756.0, 17675.0, 1532.0,

In [48]:
fill_dataFrame_countries_ingredient_type(df_countries_and_types, df_ingredients, 'condiment')

[16421.0, 6307.0, 6305.0, 2882.0, 20482.0, 7842.0, 9725.0, 6294.0, 6309.0, 4582.0, 6379.0, 7428.0, 5592.0, 5588.0, 3686.0, 18866.0, 16339.0, 6420.0, 20245.0, 18868.0, 18752.0, 5597.0, 18740.0, 16422.0, 5803.0, 18765.0, 18930.0, 18873.0, 18805.0, 18723.0, 18741.0, 16420.0, 5593.0, 6304.0, 16400.0, 5596.0, 16169.0, 12057.0, 2958.0, 20656.0, 5595.0, 5547.0, 20235.0, 18738.0, 5482.0, 13596.0, 16410.0, 3703.0, 20558.0, 13611.0, 4994.0, 2231.0, 13591.0, 11444.0, 20475.0, 13705.0, 9722.0, 7431.0, 18901.0, 7819.0, 20495.0, 18739.0, 18783.0, 13608.0, 6422.0, 11440.0, 2155.0, 19065.0, 7767.0, 20687.0, 7444.0, 10274.0, 18870.0, 20853.0, 25687.0, 5397.0, 5630.0, 20808.0, 6378.0, 2150.0, 6306.0, 24905.0, 7452.0, 10275.0, 7818.0, 7429.0, 20692.0, 18779.0, 17600.0, 20241.0, 11561.0, 4385.0, 21027.0, 7432.0, 5045.0, 7308.0, 19273.0, 24865.0, 18952.0, 20478.0, 20583.0, 3512.0, 20862.0, 20337.0, 2114.0, 20696.0, 18921.0, 5552.0, 18756.0, 21076.0]


In [49]:
fill_dataFrame_countries_ingredient_type(df_countries_and_types, df_ingredients, 'spicies')

[16424.0, 20551.0, 16386.0, 16396.0, 4343.0, 16385.0, 16404.0, 2356.0, 16401.0, 16418.0, 16397.0, 16391.0, 16402.0, 16387.0, 1416.0, 7785.0, 16390.0, 16377.0, 20303.0, 18794.0, 20473.0, 16382.0, 18684.0, 16413.0, 20469.0, 18748.0, 1442.0, 20678.0, 20860.0, 21196.0, 1350.0, 1417.0, 16348.0, 20631.0, 20461.0, 21211.0, 20345.0, 27038.0, 20679.0, 18680.0, 18865.0, 2216.0, 8868.0, 18849.0, 21212.0, 2357.0, 7980.0, 21263.0, 25308.0, 13201.0, 7305.0]


In [50]:
fill_dataFrame_countries_ingredient_type(df_countries_and_types, df_ingredients, 'herbs')

[3717.0, 4409.0, 16403.0, 16380.0, 16159.0, 16417.0, 18681.0, 16405.0, 18844.0, 18860.0, 16379.0, 16423.0, 5838.0, 16429.0, 16412.0, 20699.0, 18766.0, 16389.0, 5861.0, 4302.0, 16414.0, 18771.0, 18767.0, 20654.0, 21886.0, 4373.0, 5871.0, 21507.0, 20825.0, 16388.0, 20632.0, 18682.0, 18810.0, 21536.0, 5873.0, 18706.0, 25944.0, 5863.0, 23273.0, 9319.0, 18768.0]


In [51]:
fill_dataFrame_countries_ingredient_type(df_countries_and_types, df_ingredients, 'sugared')

[1526.0, 1525.0, 1527.0, 1502.0, 1338.0, 1536.0, 18521.0, 20359.0, 1336.0, 18526.0, 12150.0, 22056.0, 8610.0, 7812.0, 3507.0, 20619.0, 1429.0, 20708.0, 20199.0, 1373.0, 8558.0]


In [52]:
fill_dataFrame_countries_ingredient_type(df_countries_and_types, df_ingredients, 'alcohol')

[3474.0, 2112.0, 3472.0, 18888.0, 3452.0, 20314.0, 3467.0, 18875.0, 2363.0, 20190.0, 8317.0, 3468.0, 22318.0, 15709.0, 8336.0, 3469.0, 20234.0, 18111.0, 23388.0, 20426.0, 18874.0, 3471.0, 15738.0, 20205.0, 20224.0, 15723.0, 2503.0, 20667.0, 20492.0, 20225.0, 20421.0, 8310.0, 18953.0, 20670.0, 20201.0, 3453.0, 20356.0, 15698.0, 24419.0, 8311.0]


In [53]:
df_countries_and_types.head(10)

Unnamed: 0,geo_identifier,geo_name,database_name,nbRecipes,meat,fish and seafood,egg,dairy,vegetables,fruit,starchy,condiment,spicies,herbs,sugared,alcohol
45,ARG,Argentina,argentinian,23.0,0.173913,0.0,0.391304,0.391304,0.608696,0.347826,0.434783,0.826087,0.608696,0.434783,0.173913,0.304348
10,AUT,Austria,austrian,30.0,0.066667,0.0,0.7,0.8,0.1,0.433333,0.9,0.7,0.6,0.066667,0.766667,0.4
24,BEL,Belgium,belgian,15.0,0.466667,0.0,0.4,0.733333,0.466667,0.333333,0.933333,0.866667,0.466667,0.4,0.4,0.266667
22,BGD,Bangladesh,bangladeshi,14.0,0.357143,0.357143,0.142857,0.142857,1.0,0.071429,0.857143,1.0,1.0,0.785714,0.214286,0.0
0,BRA,Brazil,brazilian,88.0,0.261364,0.045455,0.329545,0.534091,0.465909,0.318182,0.534091,0.693182,0.295455,0.329545,0.386364,0.102273
31,CAN,Canada,canadian,1167.0,0.296487,0.040274,0.289632,0.579263,0.568123,0.32048,0.630677,0.75407,0.482434,0.231362,0.481577,0.133676
8,CHE,Switzerland,swiss,29.0,0.241379,0.0,0.310345,0.689655,0.586207,0.241379,0.931034,0.793103,0.37931,0.137931,0.344828,0.275862
9,CHL,Chile,chilean,32.0,0.4375,0.09375,0.40625,0.5625,0.5,0.4375,0.5625,0.625,0.53125,0.28125,0.28125,0.375
15,CHN,China,chinese,247.0,0.643725,0.11336,0.315789,0.093117,0.813765,0.210526,0.740891,0.898785,0.502024,0.105263,0.546559,0.182186
47,COL,Colombia,colombian,13.0,0.615385,0.0,0.076923,0.307692,0.846154,0.230769,0.615385,1.0,0.615385,0.461538,0.076923,0.076923


## Countries and Ingredients

In [54]:
def fill_dataFrame_ingredient(df, df_ingredients, ingredient_name):
    """Add to `df` a column named `ingredient_name` for one ingredient keyword.

    The ids of the ingredients whose name contains `ingredient_name` are
    looked up in `df_ingredients` with `find_ingredient_ids`, printed, and,
    unless there are none, passed to
    `fill_dataFrame_countries_ingredient_by_ids` with `ingredient_name` as the
    column label. `df` is modified in place; returns None.
    """
    ingredient_ids = find_ingredient_ids(df_ingredients, ingredient_name)
    print(ingredient_ids)
    if len(ingredient_ids) != 0:
        fill_dataFrame_countries_ingredient_by_ids(df, ingredient_ids, ingredient_name)
    return


# The cells below call this function as `fill_dataFrame_countries_ingredient`,
# a name that was never defined (NameError on a fresh kernel). Both names are
# kept so existing callers of either one keep working.
fill_dataFrame_countries_ingredient = fill_dataFrame_ingredient

In [55]:
df_countries_and_ingredients = df_countries.copy()

In [56]:
fill_dataFrame_countries_ingredient(df_countries_and_ingredients, df_ingredients, 'chicken')

[6494.0, 16133.0, 20384.0, 13968.0, 6493.0, 5521.0, 6522.0, 5467.0, 5520.0, 16124.0, 6527.0, 6531.0, 5679.0, 21025.0, 6506.0, 6442.0, 10643.0, 6489.0, 6496.0, 13405.0, 6498.0, 5429.0, 6459.0, 6497.0, 5465.0, 6446.0, 6441.0, 6443.0, 21031.0, 6549.0]


In [57]:
fill_dataFrame_countries_ingredient(df_countries_and_ingredients, df_ingredients, 'beef')

[3103.0, 5460.0, 3868.0, 5516.0, 13974.0, 4133.0, 3910.0, 4086.0, 4004.0, 5515.0, 3886.0, 22104.0, 3138.0, 9358.0, 7120.0, 3140.0, 4040.0, 4074.0, 9445.0]


In [58]:
fill_dataFrame_countries_ingredient(df_countries_and_ingredients, df_ingredients, 'pork')

[4176.0, 4183.0, 5320.0, 5278.0, 4175.0, 5346.0, 4181.0, 20632.0, 5338.0, 5397.0, 5306.0, 5324.0, 20676.0, 5263.0, 9289.0, 2773.0]


In [59]:
fill_dataFrame_countries_ingredient(df_countries_and_ingredients, df_ingredients, 'rice')

[1650.0, 18868.0, 1651.0, 1728.0, 19258.0, 8336.0, 1648.0, 1665.0, 17617.0, 17625.0, 1658.0, 1654.0, 18870.0, 1656.0, 1660.0, 1690.0, 1649.0, 1657.0]


In [60]:
fill_dataFrame_countries_ingredient(df_countries_and_ingredients, df_ingredients, 'soy sauce')

[2882.0, 2958.0, 10274.0, 10275.0]


In [61]:
fill_dataFrame_countries_ingredient(df_countries_and_ingredients, df_ingredients, 'mozzarella')

[16234.0, 16232.0, 19361.0]


In [62]:
fill_dataFrame_countries_ingredient(df_countries_and_ingredients, df_ingredients, 'garlic')

[4342.0, 16396.0, 18740.0, 20486.0, 12390.0, 20752.0, 20590.0]


In [63]:
fill_dataFrame_countries_ingredient(df_countries_and_ingredients, df_ingredients, 'butter')

[16157.0, 16339.0, 16287.0, 2858.0, 16209.0, 4537.0, 2857.0, 10848.0]


In [64]:
fill_dataFrame_countries_ingredient(df_countries_and_ingredients, df_ingredients, 'oil')

[6307.0, 6305.0, 20482.0, 9725.0, 6309.0, 6379.0, 6420.0, 6304.0, 3703.0, 20558.0, 9722.0, 6422.0, 25687.0, 6378.0, 6306.0, 24905.0, 21027.0, 20696.0]


In [65]:
fill_dataFrame_countries_ingredient(df_countries_and_ingredients, df_ingredients, 'shrimp')

[22005.0, 2664.0, 2666.0, 22004.0, 22067.0, 22002.0, 22006.0, 2667.0, 22019.0, 22020.0, 22010.0]


In [66]:
df_countries_and_ingredients.head(10)

Unnamed: 0,geo_identifier,geo_name,database_name,nbRecipes,chicken,beef,pork,rice,soy sauce,mozzarella,garlic,butter,oil,shrimp
45,ARG,Argentina,argentinian,23.0,0.043478,0.043478,0.0,0.0,0.0,0.0,0.434783,0.26087,0.521739,0.0
10,AUT,Austria,austrian,30.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.7,0.1,0.0
24,BEL,Belgium,belgian,15.0,0.133333,0.066667,0.066667,0.0,0.0,0.0,0.133333,0.6,0.2,0.0
22,BGD,Bangladesh,bangladeshi,14.0,0.071429,0.214286,0.0,0.071429,0.0,0.0,0.785714,0.142857,0.857143,0.071429
0,BRA,Brazil,brazilian,88.0,0.125,0.045455,0.045455,0.079545,0.022727,0.034091,0.329545,0.227273,0.477273,0.011364
31,CAN,Canada,canadian,1167.0,0.119966,0.057412,0.037704,0.042845,0.06341,0.023136,0.277635,0.351328,0.292202,0.012853
8,CHE,Switzerland,swiss,29.0,0.034483,0.172414,0.0,0.0,0.0,0.0,0.206897,0.448276,0.172414,0.0
9,CHL,Chile,chilean,32.0,0.125,0.15625,0.0625,0.03125,0.0,0.0,0.28125,0.3125,0.3125,0.0625
15,CHN,China,chinese,247.0,0.417004,0.020243,0.242915,0.279352,0.651822,0.0,0.473684,0.044534,0.724696,0.089069
47,COL,Colombia,colombian,13.0,0.461538,0.230769,0.076923,0.0,0.0,0.076923,0.538462,0.076923,0.769231,0.0


# Counting number of recipes per US state

In [None]:
# NOTE(review): this cell redefines `initialize_dataFrame_countries` with the
# same body as the definition earlier in the notebook and still reads the
# "per_country" key, although this section is about US states. Presumably a
# placeholder to be adapted (the next cell says INPROGRESS); adapt or remove.
def initialize_dataFrame_countries(json_name):
    """Load the "per_country" entries of a JSON file into a DataFrame.

    Returns a frame with the columns 'geo_identifier', 'geo_name',
    'database_name' and 'nbRecipes', sorted by 'geo_identifier' ascending.
    """
    with open(json_name) as data_file:
        agg_data = json.load(data_file)
    
    df_countries = pd.DataFrame(columns=['geo_identifier', 'geo_name', 'database_name', 'nbRecipes'])
    
    # Rows are appended one at a time at position len(df_countries).
    for country in agg_data["per_country"]:
        df_countries.loc[len(df_countries)] = pd.Series([country["geo_identifier"], country["geo_name"], country["database_name"], country["nbRecipes"]], 
               index=['geo_identifier', 'geo_name', 'database_name', 'nbRecipes'])
  
    return df_countries.sort_values(by='geo_identifier', ascending=True)

In [None]:
# INPROGRESS

# JSONs generation

### JSON food type

In [None]:
# Serialise every row of the per-country table with Series.to_json().
# NOTE(review): to_json() returns a str, so this is a list of JSON *strings*,
# not of dictionaries. If the goal is a nested JSON document, something like
# DataFrame.to_dict(orient='records') is probably what is wanted; confirm.
coutries_json = [
    df_countries_and_types.iloc[position].to_json()
    for position in range(len(df_countries_and_types))
]

In [None]:
# Wrap the rows under a "per_country" key, the same top-level key that
# initialize_dataFrame_countries reads from its input file.
type_json = {"per_country": coutries_json}

In [None]:
line = df_countries_and_types.iloc[0].to_json()
line

In [None]:
line

In [None]:
# NOTE(review): `line` is assigned from `.to_json()` in an earlier cell, which
# returns a str when no path is given, so `line.name` raises AttributeError.
# The other entries look like tutorial placeholders; TODO replace this cell
# with the real fields or delete it.
dic={"geo_name": line.name, 'name': 'mkyong.com', 'messages': ['msg 1', 'msg 2', 'msg 3']}

In [None]:
def create_line_dict(line):
    """Placeholder: presumably meant to turn one table row into a dictionary.

    The cell originally held only the `def` line, which is a SyntaxError and
    stops "Run All". Until the conversion is written, calling the function
    fails loudly instead.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("create_line_dict is not implemented yet")

In [None]:
df_countries_and_ingredients.to_json()