F. Finding correlations

You will try to find correlations between the different data fields. Your goal is to identify
the variables which most affect the nutritional score and provide some insight into
which factors cause both a low or a high nutritional score.

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from matplotlib import pyplot as plt
%matplotlib inline
#Importing and cleaning the data
#The full dataset has 356027 entries -> using subset 100'000
df = pd.read_csv('en.openfoodfacts.org.products.tsv',nrows=100000,low_memory=False,sep='\t')

In [2]:
# Data Cleaning einfügen
# -> many columns are empty, therefore not used anymore -> delete
#Thresold min 100
df.dropna(thresh=100, axis=1,inplace=True)
dfNum=df.select_dtypes(include=['number'])
dd=dfNum.loc[:,['salt_100g', 'zinc_100g','fat_100g','taurine_100g', 'magnesium_100g' ,'phosphorus_100g','calcium_100g','potassium_100g','folates_100g' ]]
filter0 = dd > 100
filter_any=filter0.any(axis=1)
outliers = df.loc[filter_any, :]
df = df.drop(outliers.index, axis=0)

In [3]:
dfNum.columns.values

array(['code', 'created_t', 'last_modified_t', 'additives_n',
       'ingredients_from_palm_oil_n',
       'ingredients_that_may_be_from_palm_oil_n', 'energy_100g',
       'energy-from-fat_100g', 'fat_100g', 'saturated-fat_100g',
       'monounsaturated-fat_100g', 'polyunsaturated-fat_100g',
       'trans-fat_100g', 'cholesterol_100g', 'carbohydrates_100g',
       'sugars_100g', 'fiber_100g', 'proteins_100g', 'salt_100g',
       'sodium_100g', 'vitamin-a_100g', 'vitamin-d_100g', 'vitamin-k_100g',
       'vitamin-c_100g', 'vitamin-b1_100g', 'vitamin-b2_100g',
       'vitamin-pp_100g', 'vitamin-b6_100g', 'vitamin-b9_100g',
       'folates_100g', 'vitamin-b12_100g', 'pantothenic-acid_100g',
       'potassium_100g', 'calcium_100g', 'phosphorus_100g', 'iron_100g',
       'magnesium_100g', 'zinc_100g', 'copper_100g', 'manganese_100g',
       'selenium_100g', 'nutrition-score-fr_100g',
       'nutrition-score-uk_100g'], dtype=object)

In [4]:
# You will try to find correlations between the different data fields.
#dataAllVariables=dfNum.loc[:,'Happiness Score':'Job Satisfaction']
dataCorr=dfNum.corr(method='pearson')
dataCorr

Unnamed: 0,code,created_t,last_modified_t,additives_n,ingredients_from_palm_oil_n,ingredients_that_may_be_from_palm_oil_n,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,...,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,selenium_100g,nutrition-score-fr_100g,nutrition-score-uk_100g
code,1.0,-0.017225,-0.028796,0.023752,-0.009464,-0.030877,0.015063,-0.068326,0.002389,-0.002174,...,-0.003823,0.024781,0.001587,-0.020282,0.032247,-0.000654,0.058099,0.046153,0.025568,0.02651
created_t,-0.017225,1.0,0.749923,0.008245,-0.091132,-0.006444,-0.010659,0.035227,-0.01006,-0.006501,...,-0.010767,-0.021458,-0.027437,0.00471,-0.122955,-0.003675,-0.103114,0.007701,0.002697,0.005235
last_modified_t,-0.028796,0.749923,1.0,0.000972,-0.009612,-0.005945,-0.00947,0.07222,-0.008274,-0.006417,...,-0.007255,0.023264,-0.015701,0.003674,0.000578,-0.002374,-0.157237,0.00641,0.006619,0.004595
additives_n,0.023752,0.008245,0.000972,1.0,0.021221,0.248353,-0.007548,-0.13302,-0.145716,-0.060827,...,-0.004588,-0.092787,-0.002188,-0.024916,-0.006729,-0.019831,-0.02008,0.078975,0.158599,0.158609
ingredients_from_palm_oil_n,-0.009464,-0.091132,-0.009612,0.021221,1.0,0.039742,0.024202,0.063815,0.015555,0.015886,...,-0.000355,,-5.3e-05,,,,,,0.019201,0.020103
ingredients_that_may_be_from_palm_oil_n,-0.030877,-0.006444,-0.005945,0.248353,0.039742,1.0,-0.052584,0.213824,-0.01144,0.0244,...,-0.002909,0.002171,-0.001947,-0.002828,0.00244,0.002022,-0.012273,-0.004985,0.030392,0.030263
energy_100g,0.015063,-0.010659,-0.00947,-0.007548,0.024202,-0.052584,1.0,0.767088,0.763613,0.55284,...,0.010234,0.387184,0.001244,0.024777,0.000376,0.002921,0.120494,0.053855,0.625453,0.628017
energy-from-fat_100g,-0.068326,0.035227,0.07222,-0.13302,0.063815,0.213824,0.767088,1.0,0.98997,0.715668,...,0.042795,0.614601,-0.134324,0.814873,-0.281297,0.881853,0.740208,-0.654363,0.567408,0.591788
fat_100g,0.002389,-0.01006,-0.008274,-0.145716,0.015555,-0.01144,0.763613,0.98997,1.0,0.676556,...,0.015452,0.330562,-0.002764,0.034626,-0.024319,0.028732,0.157946,0.020924,0.580479,0.582409
saturated-fat_100g,-0.002174,-0.006501,-0.006417,-0.060827,0.015886,0.0244,0.55284,0.715668,0.676556,1.0,...,0.030788,0.24313,-0.003826,0.012771,-0.02456,-0.001608,0.092756,-0.017192,0.668992,0.670244


In [34]:
# Absolute Correlations >0.7
df1=dataCorr[abs(dataCorr)>0.1]
df1

Unnamed: 0,code,created_t,last_modified_t,additives_n,ingredients_from_palm_oil_n,ingredients_that_may_be_from_palm_oil_n,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,...,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,selenium_100g,nutrition-score-fr_100g,nutrition-score-uk_100g
code,1.0,,,,,,,,,,...,,,,,,,,,,
created_t,,1.0,0.749923,,,,,,,,...,,,,,-0.122955,,-0.103114,,,
last_modified_t,,0.749923,1.0,,,,,,,,...,,,,,,,-0.157237,,,
additives_n,,,,1.0,,0.248353,,-0.13302,-0.145716,,...,,,,,,,,,0.158599,0.158609
ingredients_from_palm_oil_n,,,,,1.0,,,,,,...,,,,,,,,,,
ingredients_that_may_be_from_palm_oil_n,,,,0.248353,,1.0,,0.213824,,,...,,,,,,,,,,
energy_100g,,,,,,,1.0,0.767088,0.763613,0.55284,...,,0.387184,,,,,0.120494,,0.625453,0.628017
energy-from-fat_100g,,,,-0.13302,,0.213824,0.767088,1.0,0.98997,0.715668,...,,0.614601,-0.134324,0.814873,-0.281297,0.881853,0.740208,-0.654363,0.567408,0.591788
fat_100g,,,,-0.145716,,,0.763613,0.98997,1.0,0.676556,...,,0.330562,,,,,0.157946,,0.580479,0.582409
saturated-fat_100g,,,,,,,0.55284,0.715668,0.676556,1.0,...,,0.24313,,,,,,,0.668992,0.670244


In [35]:
#Drop NaN
print('# NaN = ',df1.isnull().sum().sum())
print(df1.shape)
s = df1.unstack()
df2=s.dropna(axis=0)
print()
print(df2.shape)
print(df2.head(15))
print()
print('# NaN = ',df2.isnull().sum().sum())

# NaN =  1344
(43, 43)

(505,)
code             code                                       1.000000
created_t        created_t                                  1.000000
                 last_modified_t                            0.749923
                 vitamin-d_100g                            -0.262449
                 vitamin-b6_100g                           -0.107490
                 zinc_100g                                 -0.122955
                 manganese_100g                            -0.103114
last_modified_t  created_t                                  0.749923
                 last_modified_t                            1.000000
                 vitamin-d_100g                            -0.209207
                 manganese_100g                            -0.157237
additives_n      additives_n                                1.000000
                 ingredients_that_may_be_from_palm_oil_n    0.248353
                 energy-from-fat_100g                      -0.133020
   

In [36]:
# Result of the strongest correlations
df2=df2[abs(df2)<0.999]
df2

created_t                                last_modified_t                            0.749923
                                         vitamin-d_100g                            -0.262449
                                         vitamin-b6_100g                           -0.107490
                                         zinc_100g                                 -0.122955
                                         manganese_100g                            -0.103114
last_modified_t                          created_t                                  0.749923
                                         vitamin-d_100g                            -0.209207
                                         manganese_100g                            -0.157237
additives_n                              ingredients_that_may_be_from_palm_oil_n    0.248353
                                         energy-from-fat_100g                      -0.133020
                                         fat_100g                     

In [49]:
# identify the variables which most affect the nutritional score 
# -> Sorted list
df3=df2['nutrition-score-uk_100g']
df3.sort_values(ascending=False, kind="quicksort").head(9)

nutrition-score-fr_100g     0.998607
saturated-fat_100g          0.670244
energy_100g                 0.628017
energy-from-fat_100g        0.591788
fat_100g                    0.582409
sugars_100g                 0.446731
vitamin-d_100g              0.369966
monounsaturated-fat_100g    0.346780
polyunsaturated-fat_100g    0.305259
dtype: float64

In [51]:
df3=df2['nutrition-score-fr_100g'].head(9)
df3.sort_values(ascending=False, kind="quicksort").head(9)

saturated-fat_100g          0.668992
energy_100g                 0.625453
fat_100g                    0.580479
energy-from-fat_100g        0.567408
sugars_100g                 0.446986
monounsaturated-fat_100g    0.345610
polyunsaturated-fat_100g    0.303831
carbohydrates_100g          0.224740
additives_n                 0.158599
dtype: float64

In [52]:
#provide some insight into which factors cause both a low or a high nutritional score.
# positive Values cause a high nutritional score, like
s=df2['nutrition-score-uk_100g']
df3=s[s>0]
df3.sort_values(ascending=False, kind="quicksort").head(9)

nutrition-score-fr_100g     0.998607
saturated-fat_100g          0.670244
energy_100g                 0.628017
energy-from-fat_100g        0.591788
fat_100g                    0.582409
sugars_100g                 0.446731
vitamin-d_100g              0.369966
monounsaturated-fat_100g    0.346780
polyunsaturated-fat_100g    0.305259
dtype: float64

In [55]:
# negative Values cause a low nutritional score, like
s=df2['nutrition-score-uk_100g']
df3=s[s<0].head(9)
df3.sort_values(ascending=True, kind="quicksort").head(9)

vitamin-k_100g   -0.360809
fiber_100g       -0.202255
dtype: float64