# Openfoodfacts : analyse exploratoire des données

In [None]:
%matplotlib inline

import os
import zipfile
import urllib

import matplotlib.pyplot as plt

import numpy as np

import pandas as pd
from pandas.plotting import scatter_matrix

FOOD_PATH = os.path.join("datasets", "openfoodfacts")
FOOD_TRANSFORMED_PATH_FILE = os.path.join(FOOD_PATH, "fr.openfoodfacts.org.products_transformed.csv")

import seaborn as sns
sns.set()

pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows",1000)



# Import des données

In [None]:
import pandas as pd

def load_food_data(csv_path=FOOD_TRANSFORMED_PATH_FILE):
    """Read the transformed OpenFoodFacts CSV file into a DataFrame.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read; defaults to FOOD_TRANSFORMED_PATH_FILE.

    Returns
    -------
    pandas.DataFrame
        The parsed file (comma-separated, first row used as header, UTF-8).
    """
    read_options = {
        'sep': ',',
        'header': 0,
        'encoding': 'utf-8',
        'low_memory': False,
    }
    return pd.read_csv(csv_path, **read_options)



In [None]:
food = load_food_data()

In [None]:
food.head()

In [None]:
food.info()

In [None]:
scoring_features = ['nutrition_scoring', 'no_ingredients_scoring',
       'additives_nocive_scoring', 'energy_100g_scoring', 'salt_100g_scoring',
       'sugars_100g_scoring', 'saturated-fat_100g_scoring',
       'fiber_100g_scoring', 'proteins_100g_scoring', 'bio_scoring']

quantity_features = ['energy_100g', 'sugars_100g', 'salt_100g', 'saturated-fat_100g', 'fiber_100g', 'proteins_100g' ]

In [None]:
# 75th percentile of the number of products per main category
food['main_category_fr'].value_counts().quantile(0.75)

In [None]:
food['main_category_fr'].value_counts()[:100]

# Analyse univariée des features de scoring

In [None]:
food['additives_nocive_scoring'].value_counts()

In [None]:
food.info()

In [None]:
food.describe()

In [None]:
food[scoring_features].hist(bins=5, figsize=(20,15))

## À partir du jeu de données, pour les valeurs qui sont renseignées, on voit que :
* La majorité des produits n'ont pas les additifs nocifs identifiés à l'étape de data clean
 => Ce résultat n'est pas définitif car il faudrait peupler la liste d'additifs nocifs de façon plus exhaustive ce qui est hors périmètre du projet

# Analyse univariée de features quantitatives

In [None]:
def log_convert(df, features_list_toconvert):
    """Add a base-10 logarithm column to ``df`` for each requested feature.

    For every name in ``features_list_toconvert`` a column ``<name>_log`` is
    written into ``df`` (the frame is modified in place). Rows whose value is
    missing or not strictly positive get NaN in the new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source columns; receives the new ``_log`` columns.
    features_list_toconvert : iterable of str
        Names of the columns to convert.

    Returns
    -------
    list of str
        Names of the created columns, in input order.
    """
    features_list_converted = []
    for feature_name in features_list_toconvert:
        log_name = feature_name + '_log'
        values = df[feature_name]
        # Mask non-positive entries instead of filtering the whole frame:
        # this avoids copying every column for each feature and keeps the
        # result aligned row-for-row even when the index has duplicate labels.
        df[log_name] = np.log10(values.where(values > 0))
        features_list_converted.append(log_name)

    return features_list_converted

In [None]:
food[quantity_features].hist(bins=50)

In [None]:
features_list_log = log_convert(food, quantity_features)

In [None]:
features_list_log

In [None]:
food.describe()

In [None]:
food[['energy_100g_log']].hist(bins=10)

In [None]:
# Distribution of the log10 energy values, with a red vertical marker at log10(2345).
plt.figure(figsize=(16, 10))
plt.axvline(np.log10(2345), 0, 1, color='red', label='à gauche de la barre rouge : scoring énergie > 1\nà droite de la barre rouge   : scoring énergie = 1')

# Only rows that have a log value are plotted.
energy_log_values = food['energy_100g_log'].dropna()
sns.distplot(energy_log_values, kde=True, label='Densité de probabilité', axlabel='energie pour 100g (échelle logarithmique: 1 = 10, 2=100, 3=1000, ...)')

# A single legend call, issued once every labelled artist exists.
plt.legend()

In [None]:
# Distribution of the log10 salt values, with a red vertical marker at log10(1.575).
plt.figure(figsize=(16, 10))
plt.axvline(np.log10(1.575), 0, 1, color='red', label='à gauche de la barre rouge : scoring sel > 1\nà droite de la barre rouge   : scoring sel = 1')

# Only rows that have a log value are plotted.
salt_log_values = food['salt_100g_log'].dropna()
sns.distplot(salt_log_values, kde=True, label='Densité de probabilité', axlabel='sel pour 100g (échelle logarithmique: -3=0.001, ..., 1 = 10, 2=100, 3=1000, ...)')

# A single legend call, issued once every labelled artist exists.
plt.legend()

In [None]:
# Pairwise correlations between the numeric (and boolean) columns only.
# The frame is expected to also hold text columns (categories, tags, URLs);
# since pandas 2.0 DataFrame.corr() no longer drops those silently and raises
# instead, so the numeric subset is selected explicitly.
corr_matrix = food.select_dtypes(include=['number', 'bool']).corr()

In [None]:
type(corr_matrix)

In [None]:
corr_matrix[quantity_features].loc[quantity_features]

In [None]:
corr_matrix[scoring_features].loc[scoring_features]

In [None]:
food.columns

In [None]:
attributes_to_analyze = ['energy_100g', 'sugars_100g', 'salt_100g']

#attributes_to_analyze = ['energy_100g', 'sugars_100g', 'salt_100g', 'saturated-fat_100g',
#       'fiber_100g', 'proteins_100g']
scatter_matrix(food[attributes_to_analyze])

In [None]:
scatter_matrix(food[features_list_log], figsize=(16,10))

In [None]:
scoring_features = ['nutrition_scoring', 'no_ingredients_scoring',
       'additives_nocive_scoring', 'energy_100g_scoring', 'salt_100g_scoring',
       'sugars_100g_scoring', 'saturated-fat_100g_scoring',
       'fiber_100g_scoring', 'proteins_100g_scoring', 'bio_scoring']

scatter_matrix(food[['nutrition_scoring', 'additives_nocive_scoring', 'sugars_100g_scoring']])

In [None]:
#sns.pairplot(food[scoring_features], height=2.5);

# Réduction dimensionnelle

In [None]:
food.columns


[      'energy_100g', 'sugars_100g', 'salt_100g', 'saturated-fat_100g',
       'fiber_100g', 'proteins_100g', 'ingredients_from_palm_oil_n',
       'pnns_groups_2', 'pnns_groups_1', 'labels_tags', 'countries_tags',
       'additives_tags', 'additives_n', 'ingredients_text', 'image_url',
       'nutrition_scoring', 'no_ingredients', 'no_ingredients_scoring',
       'additives_nocive_scoring', 'energy_100g_scoring', 'salt_100g_scoring',
       'sugars_100g_scoring', 'saturated-fat_100g_scoring',
       'fiber_100g_scoring', 'proteins_100g_scoring', 'bio_scoring']


[      'nutrition_scoring', 'no_ingredients_scoring',
       'additives_nocive_scoring', 'energy_100g_scoring', 'salt_100g_scoring',
       'sugars_100g_scoring', 'saturated-fat_100g_scoring',
       'fiber_100g_scoring', 'proteins_100g_scoring', 'bio_scoring']

In [None]:
# Keep only the rows where both scoring columns are filled in.
# dropna(subset=...) removes the incomplete rows; indexing `food[...]` with a
# boolean DataFrame would only replace the non-matching cells by NaN and keep
# every row.
food.dropna(subset=['nutrition_scoring', 'no_ingredients_scoring'])

In [None]:
from sklearn import decomposition
from sklearn import preprocessing

# Import `PCA` from `sklearn.decomposition`
from sklearn.decomposition import PCA

# Two-component PCA model
pca = PCA(n_components=2)

# Working sample
data = food

# Columns fed to the PCA; rows with any missing score are dropped
pca_columns = ['nutrition_scoring', 'no_ingredients_scoring',
               'additives_nocive_scoring', 'energy_100g_scoring', 'salt_100g_scoring',
               'sugars_100g_scoring', 'saturated-fat_100g_scoring',
               'fiber_100g_scoring', 'proteins_100g_scoring', 'bio_scoring']
data_pca = food[pca_columns].dropna()

X = data_pca.values
features = data_pca.columns

# Centre and scale each column before the decomposition
std_scale = preprocessing.StandardScaler()
X_scaled = std_scale.fit_transform(X)

# Project the scaled data on the principal components (ndarray output)
reduced_data = pca.fit_transform(X_scaled)

# Shape of the projection, then the projected coordinates themselves
print(reduced_data.shape)
print(reduced_data)

In [None]:
type(reduced_data)

In [None]:
data_pca[['nutrition_scoring']].values

In [None]:
data.columns

In [None]:
type(data_pca[['nutrition_scoring']].values)

In [None]:
data_pca[['nutrition_scoring']].values.shape

In [None]:
data_pca[['nutrition_scoring']].values[:,0].shape

In [None]:
data_pca['nutrition_scoring'].value_counts()

In [None]:

# Colour every projected product by its nutrition score.
# A colorbar documents the colour scale; `label=` on scatter expects one string
# for the whole point collection, so passing the score array there only put the
# array's text representation in the legend.
nutrition_scores = data_pca['nutrition_scoring'].values
scatter_points = plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=nutrition_scores, cmap='viridis')
plt.colorbar(scatter_points, label='nutrition_scoring')

In [None]:

# One scatter series per distinct nutrition score, so each gets its own legend entry.
nutrition_scores = data_pca['nutrition_scoring'].values
for score in np.unique(nutrition_scores):
    in_group = nutrition_scores == score
    plt.scatter(reduced_data[in_group, 0], reduced_data[in_group, 1], label='Nutrition score: ' + str(score))

plt.legend()
plt.show()

In [None]:
#plt.scatter(reduced_data[:,0], reduced_data[:,1], c=X_scaled[X_scaled['nutrition_scoring'].notnull()]['nutrition_scoring'], cmap = 'viridis')

In [None]:
#plt.scatter(reduced_data[:,0], reduced_data[:,1], c=food['nutrition_scoring'].notnull(), cmap = 'viridis')

In [None]:
#plt.scatter(reduced_data[:,0], reduced_data[:,1], c=food['bio_scoring'].notnull(), cmap = 'viridis')

In [None]:
# Uncoloured projection on the first two components.
# No `cmap` here: without a `c` argument matplotlib ignores it and emits a warning.
plt.scatter(reduced_data[:, 0], reduced_data[:, 1])

In [None]:
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram

def display_circles(pcs, n_comp, pca, axis_ranks, labels=None, label_rotation=0, lims=None):
    """Draw one correlation circle per requested pair of principal components.

    Parameters
    ----------
    pcs : 2-D array
        Indexed as ``pcs[component, variable]``.
    n_comp : int
        Number of computed components; a pair whose second rank is not below
        this value is skipped.
    pca : object
        Fitted model exposing ``explained_variance_ratio_`` (used for axis titles).
    axis_ranks : iterable of (int, int)
        Zero-based component index pairs, one figure per pair.
    labels : sequence, optional
        Variable names written at the tip of each vector.
    label_rotation : float, optional
        Rotation applied to the label text.
    lims : tuple, optional
        Explicit ``(xmin, xmax, ymin, ymax)`` plot limits.
    """
    for d1, d2 in axis_ranks:
        # Skip planes that involve a component that was not computed
        if d2 >= n_comp:
            continue

        fig, ax = plt.subplots(figsize=(16,16))

        few_variables = pcs.shape[1] < 30

        # Plot limits: explicit ones win, otherwise the unit square for small
        # variable counts, otherwise the extent of the components themselves
        if lims is not None:
            xmin, xmax, ymin, ymax = lims
        elif few_variables:
            xmin, xmax, ymin, ymax = -1, 1, -1, 1
        else:
            xmin, xmax = min(pcs[d1,:]), max(pcs[d1,:])
            ymin, ymax = min(pcs[d2,:]), max(pcs[d2,:])

        # Vectors: arrows when there are fewer than 30 variables, plain faint
        # segments (no arrow head) otherwise
        if few_variables:
            origin = np.zeros(pcs.shape[1])
            ax.quiver(origin, np.zeros(pcs.shape[1]),
                      pcs[d1,:], pcs[d2,:],
                      angles='xy', scale_units='xy', scale=1, color="grey")
        else:
            segments = [[[0,0],[x,y]] for x,y in pcs[[d1,d2]].T]
            ax.add_collection(LineCollection(segments, axes=ax, alpha=.1, color='black'))

        # Variable names, only for the tips that fall inside the plot limits
        if labels is not None:
            for i, (x, y) in enumerate(pcs[[d1,d2]].T):
                if xmin <= x <= xmax and ymin <= y <= ymax:
                    ax.text(x, y, labels[i], fontsize='14', ha='center', va='center',
                            rotation=label_rotation, color="blue", alpha=0.5)

        # Unit circle
        ax.add_artist(plt.Circle((0,0), 1, facecolor='none', edgecolor='b'))

        ax.set_xlim(xmin, xmax)
        ax.set_ylim(ymin, ymax)

        # Horizontal and vertical axes through the origin
        ax.plot([-1, 1], [0, 0], color='grey', ls='--')
        ax.plot([0, 0], [-1, 1], color='grey', ls='--')

        # Axis names with the percentage of explained inertia
        ax.set_xlabel('F{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
        ax.set_ylabel('F{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))

        ax.set_title("Cercle des corrélations (F{} et F{})".format(d1+1, d2+1))
        plt.show(block=False)
        
def display_factorial_planes(X_projected, n_comp, pca, axis_ranks, labels=None, alpha=1, illustrative_var=None):
    """Scatter the projected individuals on each requested factorial plane.

    Parameters
    ----------
    X_projected : 2-D array
        Indexed as ``X_projected[individual, component]``.
    n_comp : int
        Number of computed components; a pair whose second rank is not below
        this value is skipped.
    pca : object
        Fitted model exposing ``explained_variance_ratio_`` (used for axis titles).
    axis_ranks : iterable of (int, int)
        Zero-based component index pairs, one figure per pair.
    labels : sequence, optional
        One text label per individual, drawn at the point position.
    alpha : float, optional
        Opacity of the scatter points.
    illustrative_var : array-like, optional
        One value per individual; points are grouped (and given a legend
        entry) by distinct value.
    """
    # Convert once, not once per plane
    if illustrative_var is not None:
        illustrative_var = np.array(illustrative_var)

    for d1, d2 in axis_ranks:
        # Skip planes that involve a component that was not computed
        if d2 >= n_comp:
            continue

        plt.figure(figsize=(7,6))

        # Points, optionally split into one series per illustrative value
        if illustrative_var is None:
            plt.scatter(X_projected[:, d1], X_projected[:, d2], alpha=alpha)
        else:
            for value in np.unique(illustrative_var):
                selected = illustrative_var == value
                plt.scatter(X_projected[selected, d1], X_projected[selected, d2], alpha=alpha, label=value)
            plt.legend()

        # Per-point text labels
        if labels is not None:
            for i, (x, y) in enumerate(X_projected[:, [d1,d2]]):
                plt.text(x, y, labels[i],
                         fontsize='14', ha='center', va='center')

        # Symmetric limits with a 10% margin around the farthest point
        boundary = np.max(np.abs(X_projected[:, [d1,d2]])) * 1.1
        plt.xlim([-boundary,boundary])
        plt.ylim([-boundary,boundary])

        # Axes through the origin; axhline/axvline span the whole plot whatever
        # the limits are, unlike fixed-length segments
        plt.axhline(0, color='grey', ls='--')
        plt.axvline(0, color='grey', ls='--')

        # Axis names with the percentage of explained inertia
        plt.xlabel('F{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
        plt.ylabel('F{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))

        plt.title("Projection des individus (sur F{} et F{})".format(d1+1, d2+1))
        plt.show(block=False)

def display_scree_plot(pca):
    """Plot the scree diagram of a fitted PCA.

    Bars show the percentage of inertia explained by each axis; the red line
    shows the cumulative percentage. ``pca`` must expose
    ``explained_variance_ratio_``.
    """
    inertia_pct = pca.explained_variance_ratio_ * 100
    # Axis ranks start at 1
    ranks = np.arange(1, len(inertia_pct) + 1)

    plt.bar(ranks, inertia_pct)
    plt.plot(ranks, inertia_pct.cumsum(), c="red", marker='o')

    plt.xlabel("rang de l'axe d'inertie")
    plt.ylabel("pourcentage d'inertie")
    plt.title("Eboulis des valeurs propres")
    plt.show(block=False)

def plot_dendrogram(Z, names):
    """Draw a left-oriented hierarchical clustering dendrogram.

    Parameters
    ----------
    Z : array
        Linkage matrix, passed as-is to ``scipy.cluster.hierarchy.dendrogram``.
    names : sequence
        Leaf labels, passed as the ``labels`` argument of ``dendrogram``.
    """
    plt.figure(figsize=(10,25))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('distance')
    dendrogram(
        Z,
        labels = names,
        orientation = "left",
    )
    # Show the figure as part of the call, not when the function is defined
    plt.show()

In [None]:

from sklearn import decomposition
from sklearn import preprocessing

# Number of principal components to compute
n_comp = 6

# Working sample
data = food

# Columns taken into account by the PCA. Rows with any missing score are
# dropped (filling with the column mean would be the usual alternative).
pca_columns = ['nutrition_scoring', 'no_ingredients_scoring',
               'additives_nocive_scoring', 'energy_100g_scoring', 'salt_100g_scoring',
               'sugars_100g_scoring', 'saturated-fat_100g_scoring',
               'fiber_100g_scoring', 'proteins_100g_scoring', 'bio_scoring']
data_pca = food[pca_columns].dropna()

X = data_pca.values
features = data_pca.columns

# Centre and scale each column
std_scale = preprocessing.StandardScaler()
X_scaled = std_scale.fit_transform(X)

# Principal components
pca = decomposition.PCA(n_components=n_comp).fit(X_scaled)

# Scree plot
display_scree_plot(pca)

# Correlation circles on the first three factorial planes
pcs = pca.components_
plt.rcParams["figure.figsize"] = [16,9]
factorial_planes = [(0,1),(2,3),(4,5)]
display_circles(pcs, n_comp, pca, factorial_planes, labels = np.array(features))

# Projection of the individuals on the same planes, without per-point labels
X_projected = pca.transform(X_scaled)
display_factorial_planes(X_projected, n_comp, pca, factorial_planes)

plt.show()



Représenter le lien entre le scoring bio et le nombre d'ingrédients