In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from datetime import datetime

import json
import pickle

# Our generated code
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%load_ext autoreload
%autoreload 2
    
from libs import exploring as explore
from libs import visualising as visualize
from libs import cleansing as cleanse

In [15]:
# Import data
open_food_facts_csv_file = "./data/en.openfoodfacts.org.products.csv"

# Load list of columns (external file) that are loaded into pyspark
data = []
with open("./data/cleanse/columns_to_import.txt", "r") as json_data:
    columns_to_import = json.load(json_data)
    columns_to_import


food_facts_pd = pd.read_csv(open_food_facts_csv_file,
                            delimiter="\t",
                            usecols=columns_to_import.keys(),
                            dtype=columns_to_import,
                            index_col='code')

# Explore the data

## Display number of NaN entries per column

In [None]:
null_entries = pd.DataFrame({'columns' : food_facts_pd.columns,
                             'not nan_values' : [food_facts_pd[c].count() for c in food_facts_pd]
                            })

# Plot NaNs counts
null_entries.set_index('columns').plot(kind='barh', figsize=(10, 10))
plt.title("Not null values count in each column")
plt.show()

We see that there are a lot of null values in each column. Just dropping each row would result in a great loss of data, so before we do so, we apply some filtering to the columns.

After applying these changes, I will drop the remaining rows with NaN's to keep the data in the rest of the features more accurate.

In [None]:
# Next lets look at the data types:

In [7]:
food_facts_pd.dtypes

created_t                     object
created_datetime              object
product_name                  object
quantity                      object
packaging                     object
brands                        object
categories                    object
categories_tags               object
categories_en                 object
origins                       object
origins_tags                  object
manufacturing_places          object
manufacturing_places_tags     object
labels                        object
purchase_places               object
stores                        object
countries_en                  object
main_category                 object
energy_100g                  float64
carbon-footprint_100g        float64
nutrition-score-fr_100g       object
nutrition-score-uk_100g       object
dtype: object

Another thing that we are not really keen of are the language indicators, so we are going to remove those abbreviations.

In [32]:
def remove_language_indicator(row_str):
    tags = [tag if len(tag.split(':'))==1 else tag.split(':')[1] for tag in row_str.split(',')]
    return ",".join(tags)

In [33]:
food_facts_pd.categories_en = food_facts_pd.categories_en.apply(remove_language_indicator)
food_facts_pd.main_category = food_facts_pd.main_category.apply(remove_language_indicator)



The next issue we are going to tackel are redudant columns. Especially here, these are similarly named columns ending with "_en", "_tags". We are handling this, by only importing columns that end with "_en" if we have the choice.

In [34]:
food_facts_pd = food_facts_pd.dropna()
food_facts_pd

Unnamed: 0_level_0,created_t,created_datetime,product_name,quantity,packaging,brands,categories_en,origins,origins_tags,manufacturing_places,manufacturing_places_tags,labels,purchase_places,stores,countries_en,main_category,energy_100g,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2000000072530,1533075976,2018-07-31T22:26:16Z,Nestle Aguitas Manzana,300ml,botella de plastico,nestle,agua-embotellada,Cd. de Mexico,cd-de-mexico,Cd. de Mexico,cd-de-mexico,ECOCE,walmart,"walmart,bodega aurrera,sams club",Mexico,es:agua-embotellada,41.3,0.0,0,0
2000000074609,1536152379,2018-09-05T12:59:39Z,Terrine de Chevreuil,180 gr,"Conserve,bocal,Verre,180gr","AJM,SARL AJM","Meat-based products,Terrine,Terrines de chevre...","France,Saveur Occitanie","france,saveur-occitanie","Bagard,Gard,Occitanie,France,30140","bagard,gard,occitanie,france,30140","Fait Maison,Carbon-footprint","Bagard,Gard,Occitanie,France","SARL AJM,AJM",France,en:meat-based-products,1653.0,0.05,21,21
2000000074660,1536214006,2018-09-06T06:06:46Z,TERRINE AUX CHÂTAIGNES,180 gr,"Conserve,Bocal,Verre,Verrine,180gr","AJM,Sarl AJM","Plant-based foods and beverages,Plant-based fo...","France,Saveur Occitanie","france,saveur-occitanie","France,Occitanie,Gard,Bagard,30,30140","france,occitanie,gard,bagard,30,30140",Fait Maison,"Bagard,Gard,Occitanie,FRANCE,30140","AJM,Sarl AJM",France,en:plant-based-foods-and-beverages,1653.0,0.05,21,21
3222471125113,1407346433,2014-08-06T17:33:53Z,Lardons Nature (2 barquettes),200 g (2 x 100 g),"Frais,Barquette,Plastique,Opercule,Film plasti...","Casino,Groupe Casino","Meats,Prepared meats,Fresh foods,Pork,Charcute...",Union Européenne,union-europeenne,Société Aubret (Filiale Groupe d'aucy) - ZI Ro...,societe-aubret-filiale-groupe-d-aucy-zi-route-...,"Empreinte carbone,Point Vert,Qualité supérieur...","Saint-Just-d'Avray,France,Carnac","Vival,Casino",France,en:meats,1016.0,945.0,20,20
3222471125120,1337519656,2012-05-20T13:14:16Z,Lardons Fumés (2 barquettes),200 g (2 x 100 g),"Frais,Barquette,Plastique,Opercule,Film plasti...","Casino,Groupe Casino","Meats,Prepared meats,Fresh foods,Pork,Charcute...",Union Européenne,union-europeenne,Société Aubret (Filiale Groupe d'aucy) - ZI Ro...,societe-aubret-filiale-groupe-d-aucy-zi-route-...,"Point Vert,Découennés,Eco-Emballages,Sans Cart...",France,Géant Casino,France,en:meats,1025.0,945.0,20,20
3222471483602,1367347875,2013-04-30T18:51:15Z,Lait Écrémé,1 L,"Brique,Carton,Bouchon","Casino,Groupe Casino","Dairies,Milks,Homogenized milks,UHT Milks,Skim...",France,france,Société Laitière des Volcans d'Auvergne (SLVA)...,societe-laitiere-des-volcans-d-auvergne-slva-t...,"FSC,Source de Calcium,Origine Française","Angers,France",Géant,France,en:dairies,134.0,175.0,0,0
3222472185338,1422190105,2015-01-25T12:48:25Z,Riz long grain Camargue,1 kg,Carton,Casino,"Plant-based foods and beverages,Plant-based fo...",France,france,France,france,IGP,France,Casino,France,en:plant-based-foods-and-beverages,1534.0,545.0,-1,-1
3222472775195,1368034607,2013-05-08T17:36:47Z,Oeufs frais datés du jour de ponte (x 6),6 (calibre Très Gros =&gt; + 73 g),"Boîte,Plastique,boîte",Casino,"Farming products,Bird eggs,Eggs",France,france,"L' Oeuf des Deux Moulins - Le Val d'Evre,49600...","l-oeuf-des-deux-moulins-le-val-d-evre,49600-le...",Indice environnemental 8.4 %,"Angers,France",Géant,France,en:farming-products,594.0,8.4,-1,-1
3222472921530,1352641097,2012-11-11T13:38:17Z,Emmental Français Est-Central IGP au lait cru ...,250 g,"Frais,Sachet plastique,Sous atmosphère protect...","Casino,Groupe Casino","Dairies,Fermented foods,Fermented milk product...",France,france,Entremont Alliance - EA (Filiale Sodiaal Inter...,entremont-alliance-ea-filiale-sodiaal-internat...,"IGP,Point Vert,Label Rouge,Au lait cru,Certifi...","Clichy,France",Casino,France,en:dairies,1660.0,1275.0,11,16
3222473631186,1404827674,2014-07-08T13:54:34Z,"Lasagnes au saumon, Surgelées",300 g (1 portion),"Surgelé,Barquette,Carton,Barquette micro-ondab...","Casino,Groupe Casino","Frozen foods,Meals,Meals with fish,Pasta dishe...",Saumon (Salmo salar),saumon-salmo-salar,Luxembourg,luxembourg,"Point Vert,Eco-Emballages,Elaboré au Luxembour...","Lyon,France",Casino,France,en:frozen-foods,537.0,430.0,2,2


## Unitize tags
Many parts of the data are categorizations based on tags. However, those tags are in a variety of languages and string formattings, so in order to use them we attempt to group tags that hint to the same property and map them to a common indicator. 

Every column of the data set requires special treatment, as follows:

### Countries tags

In [None]:
food_facts_pd = food_facts_pd.dropna(subset=['product_name', 'countries_en', 'stores'])

In [None]:
food_facts_pd = food_facts_pd.fillna("")

Note :  
- purchase_places and countries_en are the same though "countries_en" is more complete
-  manufacturing_places and origins are different

In [None]:
countries = pd.read_csv("./data/country_lookup.csv")[['name', 'cca2', 'alias', 'Forced']]

In [None]:
#example
copy_purchases_places = food_facts_pd[['purchase_places']].iloc[:2000, :]
copy_purchases_places = copy_purchases_places.replace('', "Unknown", regex=True)
copy_purchases_places['Filtered'] = copy_purchases_places.purchase_places.apply(lambda x: cleanse.country_name_filter(x, countries))
copy_purchases_places

### Labels tags

In [None]:
# Unitze labels
with open('./data/cleanse/taxonomies.json', 'r') as json_data:
    labels_lookup = cleanse.to_lookup(json.load(json_data))
food_facts_pd.labels = food_facts_pd.labels.apply(lambda x: [labels_lookup[z] for z in x.split(',')])

### Store labels tags

In [None]:
# Unitize store labels
with open('./data/cleanse/stores_lookup.json', 'r') as json_data:
    stores_lookup = cleanse.to_lookup(json.load(json_data))
food_facts_pd.stores = food_facts_pd.stores.fillna("").apply(lambda x: [stores_lookup[z] for z in x.split(',')])

### Food category tags

# Write clean data frame to CSV file

In [None]:
# Generate a dataframe that extracts all information required by the web crawler
if 1==0: # skip cell
    products = food_facts_pd

    products.to_pickle("./web_crawler/products_pd.pickle")

In [None]:
# Write to CSV file
clean_data_file_name = "./data/openfoodfacts_clean.csv"
food_facts_pd.to_csv(clean_data_file_name, sep=',', encoding='utf-8')