In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

import json
import pickle

# Our generated code: put the parent of the current working directory on the
# import path (once) so that project-local packages can be imported.
import os
import sys

nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)
    
%load_ext autoreload
%autoreload 2
    
import libs.exploring as explore
import libs.visualising as visualize
import libs.cleansing as cleanse

# Explore data using PySpark

Packages I had to install:
    - pip install pyspark_dist_explore
    - pip install plotly

After installing them, restart the kernel.

In [None]:
# Import data
open_food_facts_csv_file = "./data/en.openfoodfacts.org.products.csv"

# Load the column -> dtype mapping (external JSON file). Its keys select the
# CSV columns to read and the mapping itself is passed on as `dtype`.
with open("columns_to_import.txt", "r") as json_data:
    columns_to_import = json.load(json_data)

food_facts_pd = pd.read_csv(open_food_facts_csv_file,
                            delimiter="\t",
                            # pass an explicit list rather than a dict_keys view
                            usecols=list(columns_to_import),
                            dtype=columns_to_import,
                            index_col='code')

#### Find NaN columns

In [None]:
# Number of populated (non-null) cells per column.
# NOTE: despite its name, the 'nan_values' column holds the NON-null counts.
non_null_per_column = food_facts_pd.count()
null_entries = pd.DataFrame({'columns' : non_null_per_column.index,
                             'nan_values' : non_null_per_column.values})

# Horizontal bar chart of the non-null counts, one bar per column
ax = null_entries.set_index('columns').plot(kind='barh', figsize=(10, 10))
ax.set_title("Not null values count in each column")
plt.show()

This number is way too low. We need to find a way to deal with incomplete data or a way to complete it.

# Data cleansing
Guys, the country tags in this dataset are a mess: they differ in language, typesetting, everything. I am trying to clean them up, but I could not find an automated way to assign the correct country tags. 
This is why I've started a list of correct mappings in the countries_replacement.json file. It is still incomplete, and I need your help to complete it. Take a look at it; I think it is self-explanatory.

### Unitize tags


In [None]:
# Keep only the rows in which all of the required columns are populated
required_columns = ['product_name', 'countries_en', 'stores']
food_facts_pd = food_facts_pd.dropna(subset=required_columns)

In [None]:
# Replace every remaining missing value with an empty string, then display the frame.
# NOTE(review): no column subset is given, so this applies to all columns —
# confirm that non-text columns are meant to receive "" as well.
food_facts_pd = food_facts_pd.fillna(value="")
food_facts_pd

In [None]:
# Show the location-related columns side by side for the first 100 rows
location_columns = ['origins', 'manufacturing_places', 'purchase_places', 'countries_en']
food_facts_pd[location_columns].head(100)

In [None]:
# Scratch frame holding `countries_en` for the first 100 rows.
# `.copy()` makes it independent of `food_facts_pd`, so reassigning its column
# later neither touches the main frame nor triggers SettingWithCopyWarning.
copydf = food_facts_pd[['countries_en']].iloc[:100, :].copy()
copydf

Note:  
- `purchase_places` and `countries_en` contain the same information, but `countries_en` is more complete.
- `manufacturing_places` and `origins` are different.

In [None]:
# Unify country names: build the lookup from country_lookup.json, then
# translate every comma-separated entry of `countries_en` through it.
with open('country_lookup.json', 'r') as lookup_file:
    raw_country_lookup = json.load(lookup_file)
countries_lookup = cleanse.to_lookup(raw_country_lookup)


def _canonical_countries(raw_countries):
    """Split one `countries_en` cell on ',' and look up each part."""
    return [countries_lookup[part] for part in raw_countries.split(',')]


food_facts_pd.countries_en = food_facts_pd.countries_en.apply(_canonical_countries)


In [None]:
# Build the country reference table: canonical `name` plus a searchable `alias`.
countries = pd.read_csv('./data/countries.csv')
countries = countries[['name', 'translations']].copy()

# alias = full name followed by its translations. Missing translations are
# treated as empty text so that the alias never becomes NaN (a NaN alias
# would break boolean `str.contains` masks built on this column).
countries['alias'] = countries['name'] + ' ' + countries['translations'].fillna('')

# Keep only the part of the name before the first comma. `.str[0]` always
# yields a single Series, whereas `expand=True` returns a DataFrame that
# cannot be assigned to one column once a name really contains a comma.
countries['name'] = countries['name'].str.split(",", n=1).str[0]

countries = countries[['name', 'alias']]

In [None]:
# Spot check: every reference row whose alias mentions "ireland" (any case)
ireland_mask = countries['alias'].str.contains('ireland', case=False)
countries[ireland_mask]

In [None]:
def country_name_filter(name, countries):
    """Resolve a raw country tag to a canonical country name.

    Parameters
    ----------
    name : str
        Raw country tag to look up.
    countries : pandas.DataFrame
        Reference table with an ``alias`` column (text that is searched) and a
        ``name`` column (canonical name). The value returned is taken from the
        first column of the matching row.

    Returns
    -------
    str
        The canonical name when exactly one alias contains the tag, or when
        several do and at least one canonical name starts with the tag.
        In every other case (no match, ambiguous match, empty tag) the tag
        itself is returned unchanged.
    """
    import re  # local import: only needed to escape the tag for `str.match`

    # An empty tag is a substring of every alias, so it cannot be resolved.
    if not name:
        return name

    # Literal (non-regex), case-insensitive substring search; NaN aliases never match.
    alias_hits = countries['alias'].str.contains(name, case=False, regex=False, na=False)
    country_set = countries[alias_hits]

    if country_set.shape[0] == 1:
        return country_set.iloc[0, 0]

    if not country_set.empty:
        # Several candidates: prefer those whose canonical name starts with the tag.
        name_hits = country_set['name'].str.match(re.escape(name), case=False, na=False)
        sub_country_set = country_set[name_hits]
        if not sub_country_set.empty:
            return sub_country_set.iloc[0, 0]

    # No match, or an ambiguous one: keep the raw tag instead of returning None.
    return name

In [None]:
def _resolve_country_tags(raw_tags):
    """Map every country tag of one `countries_en` cell to its canonical name.

    Accepts either the raw comma-separated string or an already split list of
    tags. Iterating over the raw string directly would look up single
    characters, so strings are split on ',' first. Surrounding whitespace is
    stripped and empty tags are skipped.
    """
    tags = raw_tags.split(',') if isinstance(raw_tags, str) else raw_tags
    return [country_name_filter(tag.strip(), countries) for tag in tags if tag.strip()]


# `assign` builds a new frame instead of writing into a possible slice.
copydf = copydf.assign(countries_en=copydf.countries_en.apply(_resolve_country_tags))
copydf

In [None]:
# Unify labels: build the lookup from taxonomies.json, then translate every
# comma-separated entry of `labels` through it.
with open('taxonomies.json', 'r') as taxonomy_file:
    labels_lookup = cleanse.to_lookup(json.load(taxonomy_file))


def _canonical_labels(raw_labels):
    """Split one `labels` cell on ',' and look up each part."""
    return [labels_lookup[label] for label in raw_labels.split(',')]


food_facts_pd.labels = food_facts_pd.labels.apply(_canonical_labels)

In [None]:
# Unify store names: build the lookup from stores_lookup.json, then translate
# every comma-separated entry of `stores` through it.
with open('stores_lookup.json', 'r') as stores_file:
    stores_lookup = cleanse.to_lookup(json.load(stores_file))


def _canonical_stores(raw_stores):
    """Split one `stores` cell on ',' and look up each part."""
    return [stores_lookup[store] for store in raw_stores.split(',')]


# Missing values become "" before splitting
stores_as_text = food_facts_pd.stores.fillna("")
food_facts_pd.stores = stores_as_text.apply(_canonical_stores)

# Visualize data

In [None]:
# Export the product table for the web crawler as a pickle file.
# The guard below is always true, so the export runs; set it to False to skip this cell.
EXPORT_PRODUCTS_FOR_CRAWLER = True
if EXPORT_PRODUCTS_FOR_CRAWLER:
    products = food_facts_pd  # same object as food_facts_pd, not a copy
    products.to_pickle("./web_crawler/products_pd.pickle")
    

In [None]:
# Rows without a `labels` value are dropped before being handed to the plot helper
labelled_products = food_facts_pd.dropna(subset=['labels'])
visualize.plot_cluster_by_tags(df=labelled_products,
                               cluster="labels",
                               plot2D_features=["carbon-footprint_100g", "energy_100g"])

### Plot distribution of stores where items were bought

In [None]:
# Delegate to the project plotting helper for the 'stores' column.
# NOTE(review): presumably plots how often each distinct store occurs — confirm in libs/visualising.
visualize.plot_occurences_of_distinct_values(food_facts_pd, 'stores')

### Plot distribution of availability of products in countries

In [None]:
# Count, for every country, the number of rows whose `countries_en` list
# contains it. A single pass over the column replaces the former `iterrows`
# scan followed by one full-column scan per distinct country.
country_count = dict()
for product_countries in food_facts_pd.countries_en:
    # set(): a country listed twice in the same row still counts once
    for country in set(product_countries):
        country_count[country] = country_count.get(country, 0) + 1

# All distinct countries found in the dataframe
countries_set = set(country_count)

In [None]:
# Two-column frame (Country, Value) built from the per-country counts, with
# the country labels upper-cased.
country_count_pd = (
    pd.DataFrame(list(country_count.items()), columns=['Country', 'Value'])
    .assign(Country=lambda frame: frame['Country'].str.upper())
)
country_count_pd

In [None]:
import folium

## Country coordinates for plotting
country_geo = './data/world-countries.json'

# Named `world_map` rather than `map` so that the Python builtin `map()` is
# not shadowed for the rest of the notebook.
# NOTE(review): the 'Mapbox Bright' tile set and `Map.choropleth` are no longer
# shipped with current folium releases — confirm against the installed version.
world_map = folium.Map(location=[0, 0], tiles='CartoDB positron', zoom_start=1.5)

# A choropleth binds a pandas DataFrame to JSON geometries: `columns` names the
# key and value columns of `data`, `key_on` the matching GeoJSON property.
folium.Choropleth(geo_data=country_geo,
                  data=country_count_pd,
                  columns=['Country', 'Value'],
                  key_on='feature.id',
                  fill_color='YlGnBu',
                  fill_opacity=0.7,
                  line_opacity=0.2,
                  ).add_to(world_map)
world_map

### Plot carbon footprint against energy content (scatter plot)

In [None]:
fig, ax = plt.subplots()

# Coerce to numbers with `pd.to_numeric` rather than `astype('float')`:
# cells that cannot be parsed (e.g. the "" produced by an earlier
# `fillna("")`) become NaN instead of raising a ValueError, and NaN points are
# simply not drawn.
food_facts_pd['energy_100g'] = pd.to_numeric(food_facts_pd['energy_100g'], errors='coerce')
food_facts_pd['carbon-footprint_100g'] = pd.to_numeric(food_facts_pd['carbon-footprint_100g'], errors='coerce')

# Energy on x, carbon footprint on y; the footprint also drives the colour
food_facts_pd.plot.scatter(x='energy_100g',
                           y='carbon-footprint_100g',
                           c='carbon-footprint_100g',
                           colormap='coolwarm',
                           ax=ax)
ax.set_facecolor('black')

# NOTE(review): the label states kcal — confirm the unit of `energy_100g`.
ax.set_xlabel('Energy per 100g [kcal]')
plt.show()

### Availability of products per country

# Use the API

Run the following command to install the OpenFoodFacts API:
    pip install git+https://github.com/openfoodfacts/openfoodfacts-python