In [1]:
from pymongo import MongoClient
import datetime
import numpy as np
import pandas as pd
import getpass

import nltk
from nltk.stem import WordNetLemmatizer

Make sure you are connected to the **EPFL network** (on campus or via VPN).

Note on authentication:
* Your username-password pair has read-only credentials
* Use the admin user to insert, modify, or create indexes

In [2]:
# Name of the MongoDB database used for the rest of the notebook.
database = 'ada-project'
# Credentials are asked for interactively instead of being hard-coded;
# getpass reads the password without echoing it.
user = input('MongoDB name: ')
password = getpass.getpass('MongoDB password: ')

MongoDB name: davidrivollet
MongoDB password: ········


In [3]:
# Mongo client and authentication.
# The credentials are handed to the client itself: `Database.authenticate()`
# was deprecated in PyMongo 3.x and removed in PyMongo 4.0. `authSource` is
# the database the user is defined on, i.e. the one `authenticate()` used to
# be called on.
# NOTE: MongoClient connects lazily, so wrong credentials are reported by the
# first query that reaches the server rather than by this cell.
client = MongoClient(
    'www.cocotte-minute.ovh',
    27017,
    username=user,
    password=password,
    authSource=database,
)
db = client[database]
collection = db['recipes']

## Finding ingredient labels

In [4]:
def tokenize_and_lemmatize(text):
    """Split ``text`` into lower-cased, lemmatized word tokens.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        Tokens in their order of appearance. Tokens made only of digits are
        dropped; every remaining token is passed through NLTK's
        ``WordNetLemmatizer``.
    """
    # Raw string: '\w' inside a plain literal is an invalid escape sequence,
    # which recent Python versions warn about. The pattern itself is unchanged.
    tokens_raw = nltk.regexp_tokenize(text, pattern=r'\w+')

    # Normalize the case first, then filter out the numbers and lemmatize.
    tokens_norm = (raw.lower() for raw in tokens_raw)

    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(t) for t in tokens_norm if not t.isdigit()]

In [5]:
def count_occurences(text):
    """Count how often each lemmatized token appears across ``text``.

    Every element of ``text`` is run through ``tokenize_and_lemmatize`` and
    each resulting token contributes exactly one to its own count.

    Parameters
    ----------
    text : iterable of str
        The strings to analyse. May be empty.

    Returns
    -------
    pandas.DataFrame
        A single float column ``'occurence'`` indexed by token, with rows in
        order of first appearance. The frame is empty when no token is found.
    """
    # A dict gives constant-time lookups; the separate list records the order
    # of first appearance explicitly so the row order does not depend on the
    # dict implementation.
    counts = {}
    tokens_in_order = []

    for sentence in text:
        for token in tokenize_and_lemmatize(sentence):
            if token not in counts:
                counts[token] = 0
                tokens_in_order.append(token)
            counts[token] += 1

    # Explicit float dtype so that an empty result still has a numeric column.
    occurences = np.array([counts[token] for token in tokens_in_order], dtype=float)
    return pd.DataFrame({'occurence': occurences}, index=tokens_in_order)

In [6]:
def define_ingredient_label(text, tolerance_percent=10):
    """Build a label from the most frequent tokens found in ``text``.

    Token counts come from ``count_occurences``. The reference count is the
    smaller of the highest token count and ``len(text)``. Every token whose
    count is strictly greater than the reference minus ``tolerance_percent``
    percent of it is kept, and the kept tokens are joined with single spaces
    in the row order of the frame returned by ``count_occurences``.

    Parameters
    ----------
    text : sequence of str
        The strings describing one ingredient.
    tolerance_percent : int or float, optional
        Allowed gap below the reference count, in percent. Defaults to 10.

    Returns
    -------
    str
        The label, or an empty string when ``text`` yields no token at all.
    """
    df_occurences = count_occurences(text)

    # Without any token there is nothing to select from; return an empty
    # label instead of failing on an empty selection.
    if df_occurences.empty:
        return ''

    max_occ = df_occurences['occurence'].max()
    reference = min(max_occ, len(text))
    threshold = reference - (reference / 100 * tolerance_percent)

    name_array = df_occurences[df_occurences['occurence'] > threshold].index.values
    return " ".join(name_array)

In [7]:
# One document per ingredient ID: how many times the ID appears across the
# unwound recipe ingredients, and the set of display strings seen for it.
# Results are sorted by decreasing count.
ingredients = collection.aggregate([
    {"$unwind": "$ingredients"},
    {"$group": {
        "_id": "$ingredients.ingredientID",
        "doc_count": {"$sum": 1},
        "doc_name": {"$addToSet": "$ingredients.displayValue"},
    }},
    {"$sort": {"doc_count": -1}},
])

# Ingredients counted fewer times than this are left out of the table.
MIN_OCCURENCE = 10

# Collect the rows first and build the frame in one go: inserting into a
# DataFrame row by row reallocates on every insertion and turns the integer
# columns into floats.
ingredient_rows = []
for ingredient in ingredients:
    # ID 0 is skipped (presumably a placeholder for unidentified ingredients
    # -- TODO confirm against the data source).
    if ingredient["_id"] != 0:
        name = define_ingredient_label(ingredient["doc_name"])
        ingredient_rows.append((ingredient["_id"], name, ingredient["doc_count"]))

df_ingredients = pd.DataFrame(ingredient_rows, columns=['id', 'name', 'occurence'])

# Filter after construction so the remaining rows keep their original
# positional labels (their rank in the sorted aggregation) as index.
df_ingredients = df_ingredients[df_ingredients['occurence'] >= MIN_OCCURENCE]

In [8]:
# Display the ingredient table (columns: id, name, occurence).
df_ingredients

Unnamed: 0,id,name,occurence
0,16421.0,salt,8730.0
1,4342.0,garlic,5246.0
2,4397.0,onion,4947.0
3,2496.0,water,4583.0
4,1526.0,sugar,4371.0
5,16157.0,butter,4036.0
6,16317.0,egg,3449.0
7,1684.0,all purpose flour,3215.0
8,16406.0,pepper,3194.0
9,6307.0,olive oil,3127.0


In [12]:
# Number of rows in the ingredient table.
df_ingredients.shape[0]

1046

In [14]:
# One row per distinct label; each remaining column holds the number of
# non-null entries of df_ingredients that share this label.
df_ingredients_duplicates = df_ingredients.groupby('name').count()
# Number of distinct labels.
df_ingredients_duplicates.shape[0]

861

In [15]:
# Keep only the labels whose 'occurence' count is greater than 2.
df_ingredients_duplicates = df_ingredients_duplicates.loc[
    lambda counts: counts['occurence'] > 2
]
df_ingredients_duplicates

Unnamed: 0_level_0,id,occurence
name,Unnamed: 1_level_1,Unnamed: 2_level_1
cheese,5,5
chicken,12,12
chipotle,3,3
coconut,3,3
corn,4,4
crabmeat,3,3
cucumber,3,3
cup rice,6,6
flour tortilla,3,3
ginger,3,3


In [16]:
# Number of labels left after the filter.
df_ingredients_duplicates.shape[0]

37

## Counting number of recipes per country

In [64]:
# Empty per-country table, indexed by country name.
df_countries = pd.DataFrame(columns=['total recipe num'])
# `Index.set_names()` returns a renamed copy and leaves the frame untouched
# unless its result is assigned back, so the name is set on the index directly.
df_countries.index.name = 'country name'
df_countries.index

Index([], dtype='object', name='country name')

## Making a recipe request by ingredient

In [17]:
# All ingredient rows whose label is exactly 'soy sauce'.
df_ingredients.loc[df_ingredients['name'] == 'soy sauce']

Unnamed: 0,id,name,occurence
17,2882.0,soy sauce,1253.0
233,2958.0,soy sauce,103.0
658,10274.0,soy sauce,23.0


In [51]:
# Recipes with at least one ingredient whose ID is one of the listed values.
soy_sauce_recipes = collection.find({
    "ingredients": {
        "$elemMatch": {"ingredientID": {"$in": [2882, 2958, 10274]}}
    }
})

# Walk the whole cursor: afterwards `i` is the number of recipes returned
# (0 if there were none) and `recipe` is the last one yielded.
i = 0
for i, recipe in enumerate(soy_sauce_recipes, start=1):
    pass

In [52]:
# `recipe` is the variable left bound by the most recent `for recipe in ...`
# loop, i.e. the last document that loop yielded.
recipe['title']

'Ground Pork Cake with Salty Egg (Haam Daan Ju Yoke Beng)'