In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

import json
import pickle

import findspark
import os

findspark.init()

import pyspark
import pyspark.sql.functions as func

from pyspark.sql import *
from pyspark import SparkContext

In [None]:
# Reuse the running SparkContext when this cell is executed again: calling the
# SparkContext() constructor a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

# Explore data using PySpark

Packages I had to install:
    - pip install pyspark_dist_explore
    - pip install plotly

Restart the kernel after installing them.

In [None]:
# Location of the tab-separated Open Food Facts export
open_food_facts_csv_file = "./data/en.openfoodfacts.org.products.csv"

# Names of the columns worth keeping, one per line of the text file
with open("columns_to_import.txt", "r") as f:
    columns_to_import = [str(line.strip()) for line in f]

# Read the export using its header row; rows Spark cannot parse are dropped
reader = spark.read.option("header", "true")
reader = reader.option("mode", "DROPMALFORMED").option("delimiter", "\t")
food_facts = reader.csv(open_food_facts_csv_file)

# Restrict the frame to the listed columns, keeping the file's column order
wanted_columns = [c for c in food_facts.columns if c in columns_to_import]
food_facts = food_facts.select(wanted_columns)

In [None]:
# Show the name and type of every column kept in food_facts.
food_facts.printSchema()

In [None]:
# Set to False to skip the Spark -> pandas conversion and reuse the pickle
# written by a previous run (the old `if 1==1: # skip cell` guard always ran).
RECOMPUTE_PRODUCTS = True
PRODUCTS_PICKLE = "./data/products_pd.pickle"

if RECOMPUTE_PRODUCTS:
    products = food_facts.select(['code', 'product_name', 'countries_en', 'brands', 'stores'])
    # Keep only the products for which both a brand and a store are filled in
    products_pd = (products
                   .filter(products.brands.isNotNull())
                   .filter(products.stores.isNotNull())
                   .toPandas())

    products_pd.to_pickle(PRODUCTS_PICKLE)
else:
    # Only load a pickle produced by this notebook: unpickling an untrusted
    # file can execute arbitrary code.
    products_pd = pd.read_pickle(PRODUCTS_PICKLE)
    


In [None]:
    # Preview the first 50 rows of the filtered products table.
    products_pd.head(50)

#### Find NaN columns

In [None]:
def count_not_null(c, nan_as_null=False):
    """Build an aggregate expression counting the non-null values of column `c`.

    The per-row predicate is cast from boolean to integer (False -> 0,
    True -> 1) and summed, so the sum is the number of rows that pass.

    Parameters
    ----------
    c : str
        Name of the column to inspect; also used as the alias of the result.
    nan_as_null : bool, default False
        When True, rows whose value is NaN are excluded from the count as well.

    Returns
    -------
    A Spark column expression to be passed to `DataFrame.agg`.
    """
    column = func.col(c)
    # With nan_as_null disabled the second operand is a constant True, which
    # leaves the isNotNull test as the only effective condition.
    nan_guard = ~func.isnan(c) if nan_as_null else func.lit(True)
    is_present = column.isNotNull() & nan_guard
    return func.sum(is_present.cast("integer")).alias(c)

In [None]:
# Despite its name, null_entries holds the number of NON-null values of each
# column: count_not_null sums the rows where the column is present.
null_entries = food_facts.agg(*[count_not_null(c) for c in food_facts.columns]).toPandas()

# One horizontal bar per column; the transposed frame has a single data column,
# so the legend carries no information and is hidden.
fig, ax = plt.subplots(figsize=(10, 10))
null_entries.T.plot(kind='barh', ax=ax, legend=False)
# The former title ("NaNs count") described the opposite of what is plotted.
ax.set_title("Non-null values count in each column")
ax.set_xlabel("Number of non-null entries")
plt.show()

### Distribution in every column

In [None]:
 if 1==0: #skip cell, computationally demanding
     # Draw, for every column, a bar chart of its most frequent values.
     from collections import Counter

     names = food_facts.schema.names
     n_cols = 3
     n_rows = int(np.ceil(len(names) / n_cols))
     fig = plt.figure(figsize=(12, 48))
     for i, name in enumerate(names):
         print("{}-{}".format(i,  name))
         # Count the raw values (row[0]) rather than the Row wrappers returned
         # by collect().
         value_counts = Counter(row[0] for row in food_facts.select(name).collect())
         # Bar charts take no `bins` argument; showing only the 100 most
         # frequent values keeps the figure readable.
         # NOTE(review): 100 mirrors the original `bins=100` -- confirm intent.
         top_values = value_counts.most_common(100)
         if not top_values:
             continue  # nothing to draw for an empty column
         # add_subplot takes positional (rows, cols, index) with a 1-based index
         axis = fig.add_subplot(n_rows, n_cols, i + 1)
         df = pd.DataFrame(top_values, columns=['value', 'count']).set_index('value')
         df.plot.bar(ax=axis, legend=False, title=name)

### Plot distribution of availability of products in countries

In [None]:
def count_words(df, colonne = 'categories_en'):
    """Return the distinct comma-separated values found in one column.

    Each entry of ``df[colonne]`` is split on ',' and every piece is collected
    once. Missing entries (None / NaN) are ignored. The number of distinct
    values is also printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to scan.
    colonne : str, default 'categories_en'
        Name of the column holding comma-separated values.

    Returns
    -------
    list
        The distinct values, in arbitrary order.
    """
    list_words = set()
    # Drop missing entries *before* the string conversion: astype('str') turns
    # NaN/None into the literal strings 'nan'/'None', which used to be reported
    # as categories (and made the old isinstance(entry, float) guard dead code).
    for entry in df[colonne].dropna().astype('str'):
        list_words.update(entry.split(','))
    print("Nb of categories in '{}': {}".format(colonne, len(list_words)))
    return list(list_words)

In [None]:
# Bring the 'countries_en' column into pandas as a one-column DataFrame.
countries_column = food_facts.select('countries_en').toPandas()
# Distinct comma-separated values found in that column (count_words also prints how many).
list_countries = count_words(countries_column, 'countries_en')

In [None]:
# Display the full list of distinct country values found above.
list_countries

If you are bored, you can complete the file `country_replacement.json` :D

In [None]:
# Mapping from raw country tags to the country names that should replace them.
# JSON is UTF-8 by specification; stating the encoding keeps non-ASCII names
# from depending on the platform's default locale.
with open('country_replacement.json', 'r', encoding='utf-8') as json_data:
    country_replacement = json.load(json_data)

In [None]:
# Swap raw country tags for the names given in country_replacement
split_countries = countries_column['countries_en'].str.split(',')
for index, countries in split_countries.items():
    # Rows without a usable list of countries (None / float) are left untouched
    if countries is None or isinstance(countries, float):
        continue
    # Only rewrite the cell when at least one of its tags has a replacement
    if not any(s in country_replacement for s in countries):
        continue
    country_name = [country_replacement.get(s, s) for s in countries]
    countries_column.loc[index, 'countries_en'] = ','.join(country_name)

In [None]:
# Count the number of time each country appear in the dataframe
country_count = dict()
for country in list(list_countries):
    country_count[country] = countries_column['countries_en'].str.contains(country).sum()

In [None]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Offline plotting has to be initialised in the notebook before iplot() can
# render inline; connected=False embeds plotly.js instead of loading it from a CDN.
init_notebook_mode(connected=False)

# Choropleth trace: one entry per country name, coloured by its product count.
# The colour scale is deliberately non-linear so that countries with few
# products remain distinguishable from one another.
data = dict(
    type='choropleth',
    locations=list(country_count.keys()),
    locationmode='country names',
    z=list(country_count.values()),
    text=list(country_count.keys()),
    colorbar={'title': 'Product nb.'},
    colorscale=[[0.00, 'rgb(204,255,229)'], [0.01, 'rgb(51,160,44)'],
                [0.02, 'rgb(102,178,255)'], [0.03, 'rgb(166,206,227)'],
                [0.05, 'rgb(31,120,180)'], [0.10, 'rgb(251,154,153)'],
                [0.20, 'rgb(255,255,0)'], [1, 'rgb(227,26,28)']],
)

layout = dict(
    title='Availability of products per country',
    geo=dict(showframe=True, projection={'type': 'mercator'}),
)

choromap = go.Figure(data=[data], layout=layout)
# A stray `i` expression used to follow this call and raised NameError on a
# fresh kernel; it has been removed.
iplot(choromap, validate=False)

### Plot carbon footprint against energy content (scatter plot)

In [None]:
# Pull the two columns into pandas and discard rows missing either value
carbon_columns = ["energy_100g", "carbon-footprint_100g"]
food_facts_carbon = food_facts.select(carbon_columns).toPandas().dropna()

# Cast both columns to float so they can be plotted as numbers
for column in carbon_columns:
    food_facts_carbon[column] = food_facts_carbon[column].astype('float')

In [None]:
# Scatter plot of carbon footprint against energy content; the points are
# coloured by their carbon footprint as well.
fig, ax = plt.subplots()
food_facts_carbon.plot.scatter(x='energy_100g',
                               y='carbon-footprint_100g',
                               c='carbon-footprint_100g',
                               colormap='coolwarm',
                               ax=ax)
# Dark background so that the pale middle of the 'coolwarm' map stays visible
ax.set_facecolor('black')

# Open Food Facts stores `*_100g` energy values in kJ, not kcal.
ax.set_xlabel('Energy per 100g [kJ]')
ax.set_ylabel('Carbon footprint per 100g')
ax.set_title('Carbon footprint vs. energy content')
plt.show()

### Availability of products per country

# Use the API

Run the following command to install the OpenFoodFacts Python API client:
    pip install git+https://github.com/openfoodfacts/openfoodfacts-python