In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from datetime import datetime

import json
import pickle

# Our generated code
# Make the parent of the current working directory importable.
import os
import sys
# os.path.split(cwd)[0] is the directory that contains the current working directory.
# presumably this is the project root holding the `libs` package imported below — TODO confirm
nb_dir = os.path.split(os.getcwd())[0]
# Append only once so re-running the cell does not add duplicate sys.path entries.
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%load_ext autoreload
%autoreload 2
    
import libs.exploring as explore
import libs.visualising as visualize
import libs.cleansing as cleanse

ModuleNotFoundError: No module named 'findspark'

# Explore raw data

Packages I had to install:
    - pip install pyspark_dist_explore
    - pip install plotly
and then restart the Kernel.

In [None]:
# Import data
# Path of the tab-separated data file read below.
open_food_facts_csv_file = "./data/openfoodfacts_clean.csv"

# Load only the columns named by the keys of `columns_to_import`, and use the
# `code` column as the index of the resulting frame.
# NOTE(review): `columns_to_import` is not defined in any cell visible here —
# confirm where it comes from, otherwise this cell fails on a fresh kernel.
food_facts_pd = pd.read_csv(open_food_facts_csv_file,
                            delimiter="\t",
                            usecols=columns_to_import.keys(),
                            index_col='code')

In [None]:
#pd.to_datetime(food_facts_pd.created_datetime,format="yyyy-MM-dd'T'HH:mm:ssZ")

# Analyse data

## Production / manufacture impact

### Global distribution of food producers

#### Which are the dominant global food producers and manufacturers?

Where are those products bought?

In [None]:
# Plot distribution of stores where items were bought
# (delegates to the project helper, passing the whole frame and the `stores` column name)
visualize.plot_occurences_of_distinct_values(food_facts_pd, 'stores')

#### How is this distribution impacted when we consider neutral and large carbon footprint products? 

In [None]:
#dataset carbon footprint coming from Eaternity

### Case study: Palm oil

#### Can we observe any trend in the number of products containing this oil (assuming a strong dependence between the date the product was added to the database and the date the product was invented)?

In [None]:
# Extract the products whose `categories` text mentions "palm".
# na=False: products without any category are treated as non-matches; without
#           it the mask contains NaN and boolean indexing raises an error.
# case=False: also match capitalised spellings such as "Palm".
# NOTE(review): a plain substring match on "palm" is broader than palm oil;
# refine the filter before drawing conclusions from this selection.
food_facts_pd[food_facts_pd.categories.str.contains("palm", case=False, na=False)]


Looking at the dataset, we have to find a way to differentiate palm from palm oil according to the associated words in "categories". It seems that "Produit à tartiner" goes with palm oil, but there are other products like "coeur de palmier" that have nothing to do with palm oil.

    Solutions :
    -word2vec

#### Which countries use palm oil in production?

## Good nutrition impact

### Highly nutritional products

#### Has there been a surge in highly graded products in the UK / France over the past years?

In [None]:
# Creation timestamp and French nutrition score, restricted to the products
# that have both values.
# dropna() returns a new frame rather than modifying in place, so its result
# is assigned; otherwise `nutrition_fr` would keep the incomplete rows.
nutrition_fr = food_facts_pd[['created_t', 'nutrition-score-fr_100g']].dropna()
nutrition_fr

#### What are those products made of?
What is the composition? Do they contain many additives?  Where are these products sold? 

#### Where do these products come from and where are they manufactured?

#### Where are those products sold?

### Carbon footprint of nutritionally high-graded products
Common sense would suggest that most nutritionally high-graded products are organic (plants, fruit, vegetables, …) and are therefore not manufactured, thus having a small footprint.

#### Do expensive and polluting products score worse on the nutrition grade?

#### Can we establish a meaningful correlation between these products and the carbon footprint or an estimated price (using another dataset or creating our own with web scraping)?

In [None]:
# Scatter plot of carbon footprint against energy content (both per 100 g),
# with every point coloured by its carbon footprint.
fig, ax = plt.subplots()

# Cast both plotted columns to float (in place on the shared frame) so they
# are numeric when handed to the plotting call.
food_facts_pd['energy_100g'] = food_facts_pd['energy_100g'].astype('float')
food_facts_pd['carbon-footprint_100g'] = food_facts_pd['carbon-footprint_100g'].astype('float')

food_facts_pd.plot.scatter(x='energy_100g',
                           y='carbon-footprint_100g',
                           c='carbon-footprint_100g',
                           colormap='coolwarm',
                           ax=ax)
ax.set_facecolor('black')

# Label through the Axes object instead of the pyplot state machine: the
# figure also holds a colour-bar axes, so addressing `ax` explicitly removes
# any doubt about which axes receives the labels.
# NOTE(review): the unit in the x label is kept as written — confirm that
# `energy_100g` really is expressed in kcal in this dataset.
ax.set_xlabel('Energy per 100g [kcal]')
ax.set_ylabel('Carbon footprint per 100g')
ax.set_title('Carbon footprint vs. energy content')
plt.show()

In [None]:
# Energy content against carbon footprint, drawn by the project helper with
# `labels` passed as the cluster column. Rows without a `labels` value are
# dropped before the frame is handed over.
labelled_products = food_facts_pd.dropna(subset=['labels'])
visualize.plot_cluster_by_tags(
    df=labelled_products,
    plot2D_features=["carbon-footprint_100g", "energy_100g"],
    cluster="labels",
)

#### Is there a general correlation between high carbon footprint and price? 

### Plot distribution of availability of products in countries

In [None]:
from collections import Counter


def _as_country_list(value):
    """Return the countries stored in one `countries_en` cell as a list.

    Accepted cell contents:
      * list / tuple / set  -> returned as a list, items untouched
      * str                 -> either a Python list literal ("['A', 'B']")
                               or a comma-separated string ("A,B")
      * anything else       -> [] (e.g. NaN / None for missing values)

    Iterating a raw string directly would yield single characters, and
    iterating NaN raises a TypeError, hence this normalisation step.
    """
    import ast  # local import keeps this cell self-contained

    if isinstance(value, (list, tuple, set, frozenset)):
        return list(value)
    if not isinstance(value, str):
        return []

    text = value.strip()
    if text.startswith('[') and text.endswith(']'):
        # Possibly a list that was written to CSV through its repr.
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple, set)):
            return [str(item) for item in parsed]
    # Fallback: plain comma-separated names; empty fragments are ignored.
    return [name.strip() for name in text.split(',') if name.strip()]


# Count, for every country, the number of products listing it.
# A single pass over the column replaces the row-wise iterrows() loop and the
# per-country scan of the whole frame. set(...) makes sure a product that
# names the same country twice is counted only once for that country.
country_count = dict(Counter(
    country
    for countries in food_facts_pd.countries_en
    for country in set(_as_country_list(countries))
))

# All distinct countries (kept under its original name for later cells).
countries_set = set(country_count)

In [None]:
# Two-column table (Country, Value) built from the country -> count mapping,
# with the country names converted to upper case.
country_count_pd = (
    pd.DataFrame(list(country_count.items()), columns=['Country', 'Value'])
    .assign(Country=lambda counts: counts['Country'].str.upper())
)
country_count_pd

In [None]:
import folium

# GeoJSON file with the country outlines drawn by the choropleth
country_geo = './data/world-countries.json'

# The map object is named `world_map` rather than `map`, so the built-in
# map() stays usable in the rest of the notebook.
# 'cartodbpositron' is a light base map bundled with folium; it replaces
# 'Mapbox Bright', which recent folium releases no longer provide
# (NOTE(review): confirm against the installed folium version).
world_map = folium.Map(location=[0, 0], tiles='cartodbpositron', zoom_start=1.5)

# A choropleth binds a pandas DataFrame to GeoJSON geometries.
# folium.Choropleth takes the same arguments as the deprecated
# Map.choropleth() method and is attached to the map with add_to().
# NOTE(review): `key_on='feature.id'` must hold the same values as the
# 'Country' column (upper-cased country names) — confirm what the `id` field
# of the GeoJSON features contains, otherwise no country gets coloured.
folium.Choropleth(geo_data=country_geo,
                  data=country_count_pd,
                  columns=['Country', 'Value'],
                  fill_color='YlGnBu',
                  key_on='feature.id',
                  fill_opacity=0.7,
                  line_opacity=0.2,
                  ).add_to(world_map)
world_map