In [1]:
import pandas as pd


# **Dataset in the kitchen**


### Creation of an ingredient data set by *Olivier Burgaud* (EURECOM 2019), supervised by *Pr. M. Filipone*.

I begin my project by studying past projects, and I saw that there was a lack of consistent and widespread dataset to train the algorithm of recipes. 

Here, I wanted to create a dataset, from one of the widest open source website: ***Wikipedia***. I think it was a good place to find list and set of various and different ingredients. 

The task was not easy because there is no standard format for ***Wikipedia page***, indeed we can find Table-like page, Alphabetical or ramdomly ordered list. Besides, the content of these tables was not always consistent, for instances we can find hyper text link as "classical" text or sentences instead of the ingredient name.
I choose to scrap ***Wikipedia page*** with BeautifulSoup library and then I implemented few cleaning function.

I create a dictionary with the scrapped data.

In [1]:
import requests
from bs4 import BeautifulSoup
import bs4
import time
import numpy as np
import re
import time



In [2]:
page_citrus = 'https://en.wikipedia.org/wiki/List_of_citrus_fruits'
page_salads = 'https://en.wikipedia.org/wiki/List_of_leaf_vegetables'
page_spices = 'https://en.wikipedia.org/wiki/List_of_culinary_herbs_and_spices'
page_fruit = 'https://simple.wikipedia.org/wiki/List_of_fruits'
page_herbs = 'https://simple.wikipedia.org/wiki/List_of_herbs'
page_vegetable = 'https://simple.wikipedia.org/wiki/List_of_vegetables'



In [34]:

#A function that gets the URL of the page to be scraped
#,gets the html content and uses BeautifulSoup to parse html content

def make_soup(link):
    get_page = requests.get(link)
    html = get_page.content
    soup = BeautifulSoup(html, 'html.parser')
    return  soup


#####This function create a list with all the link of the foods in a wikipedia Page and it begins the data cleaning.
def make_link_list(wiki_page_to_scrap):
    start_time = time.time()
    link_table = []
    soup = make_soup(wiki_page_to_scrap)
    table = soup.find('table',{'class':'wikitable'})
    
            ### This first loop is used to scrap Wiki table Data.
    if isinstance(table , bs4.element.Tag):        
    
        table_cells = []
        table = soup.find('table',{'class':'wikitable'})
        for row in table.find_all("tr"):
            cells = row.find_all(['th' , 'td'])
            table_cells.append(cells)
        
        ### This loop is used to locate the "Common name" column index in our table cells
        indices = []
        for j in table_cells:
            for i, elem in enumerate(j):
                elem = str(elem)
                if 'name' in elem:
                    indices.append(i)
        indice = indices[0]
        
        ### Here we implement a loop to keep only the string of the Common name column.        
        for cell in table_cells[2:]:            
            if (len(cell) < indice) == True : ## It is the condition if we have a blank cells i.e there is no common name.
                pass
             
            else:    
                link_table.append(cell[indice].text)
            
        ### We discard all '\n' tag at the END of the lines.
        ### 
        for link in range(len(link_table)):
            link_table[link] = link_table[link].strip('\n')
            
        
            
        print('cpu time for the table schema = {:.4f} sec.'.format(time.time() - start_time))
            
            ### Here is when the Wiki page is just an Alphabetical List.    
    elif (len(soup.find_all('div' , {'class':'div-col'}))>0) == True : 
        for row in soup.find_all('div' , {'class':'div-col'}):
            
            for col in row.find_all('li'):
                species = col.text
                ###We just keep the common name of the species, because only the common name is used in recipes.
                only_common_species = re.sub("[\(\[].*?[\)\]]", "", species) 
                only_common_species , sep , tail = only_common_species.partition(',')
                link_table.append(only_common_species)
                
                ###Cleaning of the list, we remove all the occurence of string begining by List.
        for word in link_table[:]:
            if (word.find('List') != -1) or (word.find('Healthline') != -1) :
                link_table.remove(word)
            
            
        print('cpu time for the  Alphabetical list schema = {:.4f} sec.'.format(time.time() - start_time))  
        
        ### For the list pattern without alphabetical list.
    elif (len(soup.find_all('div' , {'class' : 'mw-parser-output'}))> 0 ) == True:
        for row in soup.find_all('div' , {'class' : 'mw-parser-output'}):
             for col in row.find_all('li'):
                    species = col.text
        ###We just keep the common name of the species, because only the common name is used in recipes.
                    only_common_species = re.sub("[\(\[].*?[\)\]]", "", species) 
                    only_common_species , sep , tail = only_common_species.partition(',')
                    link_table.append(only_common_species)
                    
                ###Cleaning of the list, we remove all the occurence of string begining by List.
        for word in link_table[:]:
            if (word.find('List') != -1) or (word.find('Healthline') != -1):
                link_table.remove(word)
        
        
        print('cpu time for the list schema = {:.4f} sec.'.format(time.time() - start_time))            

    return link_table  





In [35]:
list_citrus = make_link_list(page_citrus)
number_citrus = len(list_citrus)
print('Number of citrus' , len(list_citrus))


list_salad = make_link_list(page_salads)
number_salad = len(list_salad)
print('Number of salads' , len(list_salad))


list_spices = make_link_list(page_spices)
number_spices = len(list_spices)
print('Number of spices' , len(list_spices))


list_fruit = make_link_list(page_fruit)
number_fruit = len(list_fruit)
print('Number of fruits', len(list_fruit))


list_herbs = make_link_list(page_herbs)
number_herbs = len(list_herbs)
print('Number of herbs', len(list_herbs))


list_vegetable = make_link_list(page_vegetable)
number_vegetable = len(list_vegetable)
print('Number of vegetables' , number_vegetable)



print('Total number' , number_citrus + number_fruit + number_herbs + number_salad + number_spices + number_vegetable )

cpu time for the table schema = 0.2371 sec.
Number of citrus 50
cpu time for the table schema = 1.0170 sec.
Number of salads 438
cpu time for the  Alphabetical list schema = 0.3597 sec.
Number of spices 203
cpu time for the  Alphabetical list schema = 0.2171 sec.
Number of fruits 114
cpu time for the  Alphabetical list schema = 0.1978 sec.
Number of herbs 49
cpu time for the list schema = 0.2500 sec.
Number of vegetables 131
Total number 985


In [37]:
### Cleaning function of the dictionary
### Few common error in the categories:
    #html tag as "\n"
    
def cleaner(list_of_ingre):
    spliter_list = []
    cleaned_list = []
    for i in list_of_ingre:
        if '\n' in i : 
            spliter_list.append(i.split('\n'))        
        else:
            cleaned_list.append(i.capitalize())
    clean_ingre_list = list(np.hstack(spliter_list))
    for ingre in clean_ingre_list:
        ingre = ingre.capitalize()
        cleaned_list.append(ingre)
    cleaned_list = list(set(cleaned_list))
    cleaned_list = list(filter(None , cleaned_list))
    cleaned_list.sort()
    return(cleaned_list)   


print(cleaner(list_salad))
print(list_citrus)

ValueError: need at least one array to concatenate

In [10]:
# Let's create a dictionary with the different categories of vegetables. 

### This function is the constructor of the dictionnary, when we want to add a category and a list (which can be empty)to the dict.
def add_cat_to_dict(list_of_ingr , category , food_dict , existency = True):
    CAT_NAME = category.upper()
    food_dict.update({CAT_NAME : list_of_ingr})
    return food_dict
    
###This function permits to the user to add an element in a category, I thought that the user will add ingredient
###one by one, so he should put one tuple (category, ingredient) as input:
#def add_instance
    
def add_ingre_to_dict(ingredient , category , food_dict):
    CTG = category.upper()
    ingre = ingredient.capitalize()
    print(ingre)
    #First we check if the category exist.
    if CTG in food_dict:        
        if ingre in food_dict[CTG]:
            print("This ingredient is already in the category.")
        else:
            food_dict[CTG].append(ingre)
            food_dict = food_dict[CTG].sort()
    else :
        print('This category does not exist, you can create a new one with the function add_cat_to_dict.')
    return food_dict
        
dic = dict()        
dico = add_cat_to_dict(list_citrus , 'citrus' , dic)
add_ingre_to_dict('citron vert' , 'citrus' , dic)
print(dic)


Citron vert
This ingredient is already in the category.
{'CITRUS': ['Balady citronIsrael citron', 'Bergamot orange', 'Bitter orangeSeville orangeSour orangeBigarade orangeMarmalade orange', 'Blood orange', "Buddha's handBushukanFingered citron", 'CalamondinCalamansi', 'Cam sành', 'Citron', 'Citron vert', 'Clementine', 'Corsican citron', 'Desert lime', 'Etrog', 'Finger lime', 'First LadyAnadomikan', 'Florentine citron', 'Grapefruit', 'Greek citron', 'HyuganatsuKonatsuTosakonatsuNew Summer Orange', 'Kabosu', 'Kaffir lime', 'Key lime', 'Kinnow', 'Kiyomi', 'Kumquat', 'Lemon', 'Lime', 'Mandarin orangeMandarinMandarine', 'Mangshanyegan', 'Meyer lemon', 'Moroccan citron', 'Myrtle-leaved orange tree', 'OrangeSweet orange', 'OroblancoSweetie', 'Papeda', 'Persian limeTahiti limeBearss lime', 'PomeloPummeloPommeloShaddock', 'Ponderosa lemon', 'RangpurLemandarin', 'Round limeAustralian limeAustralian round lime', 'SatsumaCold hardy mandarinSatsuma mandarinSatsuma orangeChristmas orangeTangerine', 

True

In [24]:
f = ['' , 'Olivier\n']
for word in f[:]:
    
    word = word.strip('\n')
            #if (word.find('List') != -1) or (word.find('Healthline') != -1):
             #   f.remove(word)
                
print(f)

['', 'Olivier\n']


In [23]:
a='PastiS\nRicard'
a= a.split('\n')
print(a)
a.append('alcool')
c = a[2].capitalize()
a[2] = c
print(c)
a.sort()
print(a)
#b = a.append('Alcool')
#print(b)

['PastiS', 'Ricard']
Alcool
['Alcool', 'PastiS', 'Ricard']


In [7]:
liste_test = ['PastiS\nRicard' , 'EURECOM' , 'OlivierIng']

h = len(liste_test)

for i in range(h):
    if '\n' in liste_test[i]:
        temp = liste_test[i].split('\n')
        print('oui' , temp)
    else:
        print('propre')

oui ['PastiS', 'Ricard']
propre
propre


In [34]:
dico=dict()
print(dico)
dico.update({'Olivier' : [185 , 22]})
print (dico)
dico['Olivier'].append('Thomas')
print(dico)
dico.update({'Alcool' : 'PAstis'})
print(dico)
dico['Alcool'].append('biere')

{}
{'Olivier': [185, 22]}
{'Olivier': [185, 22, 'Thomas']}
{'Olivier': [185, 22, 'Thomas'], 'Alcool': 'PAstis'}


AttributeError: 'str' object has no attribute 'append'