# IMPORT BACK THE SESSION

In [17]:
# Installation in Colab environment only
!pip install python-stdnum

# importation
import json
import re
import warnings

import pandas as pd
import numpy as np
from google.colab import files
from stdnum.exceptions import *
from stdnum.util import *
from stdnum.ean import *
import scipy.stats as st
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from ipywidgets import interact, interact_manual

Collecting python-stdnum
[?25l  Downloading https://files.pythonhosted.org/packages/6b/57/5d1ee250a9c4e780d57d7dcc1b9553c4e386c759ed3087f813965c4534d2/python_stdnum-1.13-py2.py3-none-any.whl (839kB)
[K     |▍                               | 10kB 26.2MB/s eta 0:00:01[K     |▉                               | 20kB 2.2MB/s eta 0:00:01[K     |█▏                              | 30kB 2.8MB/s eta 0:00:01[K     |█▋                              | 40kB 2.1MB/s eta 0:00:01[K     |██                              | 51kB 2.3MB/s eta 0:00:01[K     |██▍                             | 61kB 2.7MB/s eta 0:00:01[K     |██▊                             | 71kB 3.0MB/s eta 0:00:01[K     |███▏                            | 81kB 3.2MB/s eta 0:00:01[K     |███▌                            | 92kB 3.6MB/s eta 0:00:01[K     |████                            | 102kB 3.4MB/s eta 0:00:01[K     |████▎                           | 112kB 3.4MB/s eta 0:00:01[K     |████▊                           | 122kB

In [7]:
files.upload() # OFF_up_to_1_3.csv

{}

In [0]:
# Read the JSON text stored in 'd_types.txt' and parse it into `d_types`
# (passed as `dtype=` to pd.read_csv in the next cell).
# The context manager closes the file handle even if reading fails.
with open('d_types.txt', 'r') as f:
    dict_str = f.read()
d_types = json.loads(dict_str)

In [0]:
dfcopy = pd.read_csv('/content/OFF_up_to_1_3.csv',
                     dtype=d_types, low_memory=False, encoding ='utf-8')

In [0]:
# print(dfcopy.info(memory_usage='deep'))

  ### 1.4 Dropping inoperable rows

We need to get rid of inoperable rows. We will define operability of a row as follows:

- condition 1: the product can be identified, at least by a unique combination of name, brand and quantity ('product_name', 'quantity', 'brands'), OR by a valid and unique barcode ('code')

AND
- condition 2: at least one numerical information is provided i.e. one of the features 'XXX_100g' or one type of nutritional score

#### *Condition 1*:

Let's first check the 'code' column for completion and validity of the data:


In [23]:
# Checking nb of null values in 'code' column
dfcopy['code'].isna().sum()

0

No null value in the 'code' column.

We will now create a new column 'code_val' indicating if the barcode is valid.

In [24]:
# Verification of barcodes (EAN-13, EAN-8 and UPC (12-digit) format)
# Creation of a new column : barcode valid or not
# DataFrame.insert raises ValueError when the column already exists, so any
# 'code_val' left by a previous run of this cell is removed first.
if 'code_val' in dfcopy.columns:
    dfcopy.drop(columns=['code_val'], inplace=True)
dfcopy.insert(1,'code_val', dfcopy['code'].apply(is_valid))

nb_valid = dfcopy['code_val'].sum()
nb_invalid = (~dfcopy['code_val']).sum()
print("nb of valid codes: ", nb_valid)
print("nb of invalid codes: ", nb_invalid)
print("pctage of valid codes: ", nb_valid*100/dfcopy.shape[0], "%")

# Figures obtained on an earlier run, kept for comparison:
#   nb of valid codes:  1190822
#   nb of invalid codes:  23153
#   pctage of valid codes:  98.09279433266748 %

nb of valid codes:  1192439
nb of invalid codes:  23212
pctage of valid codes:  98.09057040219602 %


'nb of valid codes:  1190822\nnb of invalid codes:  23153\npctage of valid codes:  98.09279433266748 %'

When the code is not valid, we will check whether the product can be identified by a unique combination of "product_name", "quantity" and "brands", which also lets us detect possible duplicates. (At least one of the three must be filled, i.e. not NaN.)

In [0]:
key_cols = ['product_name', 'quantity', 'brands']

In [26]:
# Checking for invalid 'product_name' (less than 2 characters)
# Lengths must be measured on the column values: iterating over a DataFrame
# yields its column labels, so mapping `len` over the frame only measured the
# label 'product_name' once and never looked at an actual product name.
df_prod_notna = pd.DataFrame(dfcopy['product_name'].dropna())
li_ind_all = df_prod_notna.index.to_list()
li_len_prod = df_prod_notna['product_name'].astype(str).str.len().to_list()
li_ind_len = list(zip(li_ind_all,li_len_prod))
ind_wrong_names = [ind for ind, length in li_ind_len if length < 2]
len(ind_wrong_names), dfcopy.loc[ind_wrong_names, 'product_name'].to_list()

(0, [])

In [0]:
# Replacing invalid names with np.NaN
dfcopy.loc[ind_wrong_names,['product_name']] = np.nan

In [30]:
# Checking for 'brands' (less than 1 characters)
df_brands_notna = dfcopy['brands'].dropna()
li_ind_all = df_brands_notna.index.to_list()
li_len_brands = list(map(len, df_brands_notna))
li_ind_len = list(zip(li_ind_all,li_len_brands))
ind_wrong_names = [ind for ind, length in li_ind_len if length < 1]
len(ind_wrong_names), dfcopy.loc[ind_wrong_names]['brands'].unique()

(0, [], Categories (0, object): [])

In [0]:
### Function decomposing 'quantity' in groups of (numerical value, unit)

# To ensure safe execution of function 'float'
def safe_exe(def_val, function, *args):
    """Call ``function(*args)`` and return its result, or ``def_val`` on failure.

    Parameters
    ----------
    def_val : value returned when the call raises.
    function : callable to invoke.
    *args : positional arguments forwarded to ``function``.

    Only ``Exception`` subclasses are swallowed, so KeyboardInterrupt and
    SystemExit still propagate (a bare ``except:`` would trap them as well).
    """
    try:
        return function(*args)
    except Exception:
        return def_val

# Conversion of string in float if possible, else 0
# (accepts 'n1 x n2' strings, and longer products such as 'n1 x n2 x n3')
def conv_float(my_str):
  """Convert a numeric string to a number, multiplying 'x'-separated factors.

  Parameters
  ----------
  my_str : str, e.g. '250', '6x33' or '2x3x125'.

  Returns
  -------
  float(my_str) when the string holds no 'x' (0 if it cannot be parsed);
  otherwise the product of all factors, where any factor that cannot be
  parsed counts as 0 (so the whole product is then 0).
  """
  if 'x' not in my_str:
    return safe_exe(0, float, my_str)
  # Splitting on every 'x' (not only the first one) lets 'n1xn2xn3' be
  # evaluated instead of collapsing to 0 because 'n2xn3' is not a float.
  result = 1
  for factor in my_str.split('x'):
    result *= safe_exe(0, float, factor)
  return result

def num_units(my_str):
  """Split a free-text quantity into parallel lists of numbers and units.

  The string is lower-cased and stripped, then scanned for successive
  (number part, unit part) groups: the number part is a run of digits,
  '.', ',', 'x' and spaces; the unit part is a run of characters outside
  the excluded set of the regex below.

  Parameters
  ----------
  my_str : str, raw content of the 'quantity' field.

  Returns
  -------
  (num, unit) : two lists of equal length; ``num[i]`` is the number part
  converted by ``conv_float`` (spaces removed, ',' read as a decimal point)
  and ``unit[i]`` the stripped unit part of the i-th group.
  """
  my_str = my_str.lower().strip()
  regex = r'([0-9.,x ]*)\s*([^()0-9 !,\-±=\*\+/.-\?\[\]]*\s*)'
  # Both groups may be empty, so findall yields ('', '') at every position
  # where neither matches (end of string, excluded characters, ...).
  # Drop all of them: list.remove() only discarded the first one and raised
  # ValueError when there was none.
  res = [gr for gr in re.findall(regex, my_str) if gr != ('', '')]
  num = [conv_float(gr[0].replace(' ','').replace(',','.')) for gr in res]
  unit = [gr[1].strip() for gr in res]
  return num, unit

In [34]:
# Creating a new database with the main features of the 'quantity' data
# (avoiding rows with empty 'quantity')

df_quant_notna = dfcopy.dropna(subset=['quantity'])
df_quantity = pd.DataFrame([])
df_quantity['quantity'] = df_quant_notna['quantity']
print("nb of row with non empty 'quantity':", df_quant_notna.shape[0]," on ", dfcopy.shape[0])

df_quantity['analyse'] = df_quant_notna['quantity'].apply(num_units)
# df_quantity['num_gr'] = [t[0] for t in df_quantity['analyse']]
# df_quantity['unit_gr'] = [t[1] for t in df_quantity['analyse']]

nb of row with non empty 'quantity': 379515  on  1215651


TypeError: ignored

In [0]:
# Displaying the main units in 'quantity'
# Each 'analyse' value is the (nums, units) tuple returned by num_units;
# the units are read from it because df_quantity has no 'unit_gr' column
# (the lines that would create it are commented out).
units = df_quantity['analyse'].apply(lambda t: t[1])
all_units = [u for row_units in units.values for u in row_units]
print("Total nb of not NaN rows: ", units.shape[0])
print("Total nb of identified units: ", len(all_units))
print("Nb of unique units: " , len(list(set(all_units))))
print("List of unique units: \n", list(set(all_units)))

In [0]:
# Classifying main relevant units in 'mass' and 'volume' units
li_u_mass = ['g', 'kg', 'gr', 'grammes', 'grs','st', 'mg', 'gramm', 'lb','gram',
             'grams', 'gramos', 'lbs', 'gm', 'lt', 'lts','gramme', 'kilo','公克',
             'grammi', 'kgs', 'kgr', 'gms', 'g-', 'grms','pound', 'pounds', 
             'grm', 'grames','غرام', 'جرام','غ', 'غم','جم','g℮', 'г', 'кг', '克', 
             'грамм', 'גרם','kilogramm','gramas','kilogrammae','livres',
             'grame', 'kilos'] 
li_u_vol = ['ml','dl','l','cl', 'oz', 'litre', 'fl', 'litres', 'liter','litro',
            'litri','litr', 'ounces','ounce', 'ltr', 'gallon','half-gallon',
            'litros','litroe', 'liters', 'cc', 'kl', 'pint','pints', 'gal',
            'mls', 'centilitres', 'لتر','مل','ل','ليتر', 'มล', 'ลิตร', 'мл', 'л',
            'litrè', 'milliliter','millilitre', 'γρ', 'литр', 'литра', 'mml',
            'מ״ל','millilitres','λίτρο', 'mĺ', 'cm', 'cm³' ]

In [0]:
# Function selecting the most relevant couple (num,unit)
# by order of priority : unit in li_prio1, then in li_prio2
def sel_gr(li, li_prio1, li_prio2):
  """Return the best (num, unit) couple among the groups found in a quantity.

  Parameters
  ----------
  li : pair (nums, units) of parallel sequences.
  li_prio1 : units with the highest priority.
  li_prio2 : units with the second priority.

  Returns
  -------
  (num, unit) of the group with the largest num inside the highest-priority
  non-empty class (li_prio1, then li_prio2, then any other non-empty unit);
  (np.nan, np.nan) when every unit is the empty string or no group exists.
  """
  nums, units = li[0], li[1]
  # Positions of the groups, bucketed by priority rank (2 > 1 > 0);
  # groups whose unit is '' are not candidates at all.
  buckets = {2: [], 1: [], 0: []}
  for pos, unit in enumerate(units):
    if unit in li_prio1:
      buckets[2].append(pos)
    elif unit in li_prio2:
      buckets[1].append(pos)
    elif unit != '':
      buckets[0].append(pos)
  for rank in (2, 1, 0):
    candidates = buckets[rank]
    if len(candidates) > 0:
      # np.argmax keeps the first position in case of equal nums
      best = candidates[np.argmax([nums[p] for p in candidates])]
      return (nums[best], units[best])
  return (np.nan, np.nan)

In [0]:
# Best (num, unit) couple of each row: volume units take priority over mass
# units (first and second priority lists passed to sel_gr).
my_fun = lambda x:sel_gr(x, li_u_vol, li_u_mass)
li_best_num_unit = df_quantity['analyse'].apply(my_fun)

df_quantity['num'] = [gr1 for gr1, gr2 in li_best_num_unit]
df_quantity['unit'] = [gr2 for gr1, gr2 in li_best_num_unit]

# Descriptive features of the raw 'quantity' string.
# pd.notna is used instead of `is not np.nan`, which only recognises the
# np.nan singleton and misses any other NaN object.
df_quantity['nb_groups'] = [len(gr[0]) for gr in df_quantity['analyse']]
df_quantity['nb_char'] = [len(s) if pd.notna(s) else 0 for s in df_quantity['quantity']]
df_quantity['nb_char_unit'] = [len(s) if pd.notna(s) else 0 for s in df_quantity['unit']]

# Presence flags. A missing 'num' must count as absent: NaN != 0 is True, so
# testing `n != 0` alone flagged rows without any number as having one.
df_quantity['unit_bool'] = [(1 if n!=0 else 0) for n in df_quantity['nb_char_unit']]
df_quantity['num_bool'] = [(1 if (pd.notna(n) and n!=0) else 0) for n in df_quantity['num']]

In [0]:
# Function that links 'mass','volume'and 'other' to numerical values (1,2,0)
def quantity_type(my_string):
  """Map a unit to its type code.

  Returns np.nan for None, 1 when the unit is listed in li_u_mass,
  2 when it is listed in li_u_vol, and 0 for anything else.
  """
  if my_string is None:
    return np.nan
  if my_string in li_u_mass:
    return 1
  if my_string in li_u_vol:
    return 2
  return 0

In [0]:
# New column identifying 'mass' (1), 'volume' (2) and 'other' (0)
i = df_quantity.columns.to_list().index('unit') + 1 # after 'unit'
df_quantity.insert(i, 'q_unit_type',
                   [quantity_type(s) if s is not np.nan else np.nan \
                    for s in df_quantity['unit']])
df_quantity.head(5)

In [0]:
# Pick one of each randomly to display

idx=[]
sel_idx=[]
# One bucket of row labels per unit type, in this order:
# other (0), mass (1), volume (2), unknown (NaN)
for q_type in [0, 1, 2, np.nan]:
  if pd.isna(q_type):
    mask = df_quantity['q_unit_type'].isna()
  else:
    mask = df_quantity['q_unit_type']==q_type
  rows = df_quantity[mask].index.to_list()
  idx.append(rows)
  # np.random.randint excludes its upper bound, so the draw is always a valid
  # position (the bare `randint` used before is not imported by the import
  # cell). An empty bucket is skipped instead of raising.
  if rows:
    sel_idx.append(rows[np.random.randint(0, len(rows))])

print("mass : ",len(idx[1]), "volume : ",len(idx[2]),\
      "other : ",len(idx[0]),"nan : ",len(idx[3]),\
      "Total : ", df_quantity.shape[0])
df_quantity.loc[sel_idx,['quantity' ,'num' ,'unit' , 'q_unit_type']]

In [0]:
# Defining limits to facilitate clustering algorithm
# despite outliers in 'num' values
(num_min, num_max) = (0,10000) #(mean-2*std, mean+2*std) #(0,10000)
# Creating a filtered version of df_quantity without outliers.
# .copy() makes it an independent frame: a 'cluster' column is assigned to it
# further down, which on a plain slice triggers SettingWithCopyWarning.
df_quantity_f = df_quantity[(df_quantity['num']<num_max)&\
                            (df_quantity['num']>num_min)].copy()

In [0]:
# Statistics to assess dispersion of 'num_max' data
(mean, std) = df_quantity['num'].mean(), df_quantity['num'].std()
(median, q_01, q_99) = df_quantity['num'].median(),\
                       df_quantity['num'].quantile(0.01),\
                       df_quantity['num'].quantile(0.99)
print("Nb rows in original df vs filtered df:\n",
      (df_quantity.shape[0], df_quantity_f.shape[0]) )
print("Mean and standard error of 'unit' column:\n", (mean, std))
print("1st centile, median and 99h centile:\n", (q_01, median, q_99))

In [0]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
n_clust = 7
X = df_quantity_f[['nb_groups',	'nb_char', 'num',	'nb_char_unit',\
                   'q_unit_type','num_bool',	'unit_bool']]
scaler = StandardScaler()
X = scaler.fit_transform(X)
kmeans = KMeans(n_clusters=n_clust, random_state=0).fit(X)
df_quantity_f['cluster'] = kmeans.labels_

In [0]:
# Scatter plot of the filtered quantities: x = 'num' (log scale),
# y = 'nb_groups', colour = cluster, marker shape = unit type,
# marker size = number of characters of the raw 'quantity' string.
fig = plt.figure(figsize = (21, 6))
grid = plt.GridSpec(1, 1, wspace=0.2, hspace=0.1)
# grid = plt.GridSpec(1, 3, wspace=0.2, hspace=0.1)
# Standard-library warnings module: the `np.warnings` alias was removed
# in NumPy 1.24.
warnings.filterwarnings('ignore')

mkr_dict = {'mass' : (1,'s'), 'volume' : (2,'o'), 'other' : (0,'*')}

ax1 = plt.subplot(grid[0, 0:1])
for mkr in mkr_dict:
  mask = df_quantity_f['q_unit_type'] == mkr_dict[mkr][0]
  df_quantity_f[mask].plot(ax = ax1, kind = 'scatter',
                           s = df_quantity_f[mask]['nb_char'],
                           y='nb_groups',x='num', c='cluster',
                           marker = mkr_dict[mkr][1], colormap='viridis',
                           xlim = (0,3000), ylim = (0,20), logx = True,
                           alpha=0.3, legend = False,) # figsize = (18,6))

plt.show()

In [0]:
### Function giving description of data in a cluster
def analyses_cluster (df,n):
  """Print a short description of the rows of ``df`` whose 'cluster' is ``n``.

  Printed: number of rows, min and max of 'num', number of rows whose
  'unit' is the empty string, and the value counts of 'nb_groups'.
  Returns None.
  """
  in_cluster = df['cluster']==n
  df_cl = df[in_cluster]
  num_col = df_cl['num']
  empty_unit_rows = df_cl[df_cl['unit']=='']
  print("o---- CLUSTER ", str(n), " ----o")
  print("nb rows", len(df_cl))
  print("min and max num: ", num_col.min(), num_col.max())
  print("nb of empty unit: ", len(empty_unit_rows))
  print(df_cl['nb_groups'].value_counts())

In [0]:
# Checking the repartition of rows between clusters
for i in range(n_clust):
  print("cluster "+str(i)+" :",df_quantity_f[df_quantity_f['cluster']==i].shape)
# Then giving a summary of main characteristics
for i in range(n_clust):
  analyses_cluster(df_quantity_f,i)

In [0]:
# indexes of rows with empty 'unit'
idx_unit_na = df_quantity[df_quantity['unit'].isna()].index.to_list()
print("Found {} empty 'unit'.\n".format(len(idx_unit_na)))

# indexes of rows whose 'num' is missing (cond1), exceeds 10000 (cond2),
# or is zero or negative (cond3)
cond1 = df_quantity['num'].isna()
cond2 = df_quantity['num']>10000
cond3 = df_quantity['num']<=0
my_cond = (cond1|cond2|cond3)
idx_num_outl = df_quantity[my_cond].index.to_list()
print("Total {} outliers found in 'num'.".format(len(idx_num_outl)))
# Breakdown of the total by condition
print("--> NaN: {0} - Outliers: {1} - Zero or negative: {2}"\
      .format(df_quantity[cond1].shape[0],
              df_quantity[cond2].shape[0],
              df_quantity[cond3].shape[0]))

In [0]:
# Dropping rows where unit or num is invalid in df_quantity
df_quantity.drop(index=idx_unit_na+idx_num_outl, inplace=True)
# Setting value of 'quantity' in dfcopy also to NaN if unit or num is invalid 
dfcopy.loc[idx_unit_na+idx_num_outl, 'quantity'] = np.nan

In [0]:
# Merging df_quantity with dfcopy
print("dfcopy before: \n", dfcopy.shape,
      "\ndf_quantity: \n", df_quantity.shape)
dfcopy = dfcopy.merge(df_quantity[['num', 'unit','q_unit_type']],
                      how = 'left',  left_index=True, right_index=True)
dfcopy.rename(columns = {'num':'quantity_num',
                         'unit':'quantity_unit',
                          'q_unit_type':'quantity_type'},
                inplace=True)
# ---- NB : join on index works, whereas join on column 'quantity' doesn't.
cols = dfcopy.columns.to_list()
cols = cols[:9]+cols[-3:]+cols[9:-3]
dfcopy = dfcopy[cols]
print("---> dfcopy after: \n", dfcopy.shape)
dfcopy.head(3)

In [0]:
# Checking rows with insufficient identification data
# Indexes of rows with invalid codes AND all key_cols NaN
key_cols = ['product_name', 'quantity_num','quantity_unit', 'brands']
m_wcode = dfcopy['code_val']==False
m_wkeys = dfcopy.loc[:,key_cols].isna().all(axis=1)

ind_drop = dfcopy[m_wcode&m_wkeys].index.to_list()

print("nb of rows with invalid code", m_wcode.sum())
print("nb of rows with all key-columns NaN:", m_wkeys.sum())
print("nb of rows with invalid code AND all key-columns NaN (to drop):", (m_wcode&m_wkeys).sum())

Data that have no key column filled have very few data anyway.
We drop all the 22183 rows.

In [0]:
# Dropping rows with invalid codes AND all key_cols NaN
print("nb of rows before:", dfcopy.shape[0])
dfcopy.drop(index=ind_drop, inplace=True)
print("nb of rows after:", dfcopy.shape[0])

#### *Condition 2*:

Let's now check the second condition, i.e. "at least one numerical column filled", among a selection of numerical data OR nutritional scores.

In [0]:
def nb_rows_allna (df, selcol):
  """Find the rows of ``df`` in which every column of ``selcol`` is NaN.

  Parameters
  ----------
  df : DataFrame to inspect.
  selcol : list of column labels to test.

  Returns
  -------
  (count, labels) : number of such rows and the list of their index labels.
  """
  # Work on the `df` argument (the global `dfcopy` was used here, so the
  # parameter was silently ignored).
  mask = df[selcol].isna().all(axis=1)
  ind_drop = df[mask].index.to_list()
  return len(ind_drop), ind_drop
  
def print_nb(df, selcol):
  """Print ``selcol`` and the number of rows reported by nb_rows_allna for it."""
  nb_allna, _ = nb_rows_allna(df, selcol)
  print("------ columns: ", selcol, "\nnb of rows all NaN : ", nb_allna)

In [0]:
list(dfcopy.columns)

In [0]:
# Display amount of rows to drop depending on selection of columns considered
cruc_num_col = ['energy_100g']
print_nb(dfcopy, cruc_num_col)
cruc_num_col += li_cat_comp
print_nb(dfcopy, cruc_num_col)
cruc_num_col += ['salt_100g']
print_nb(dfcopy, cruc_num_col)
cruc_num_col += li_cat_vit_oligo
print_nb(dfcopy, cruc_num_col)
cruc_num_col += li_cat_nutri
print_nb(dfcopy, cruc_num_col)

We drop the 184 115 rows that do not have either one of the following 36 numerical '**XXX_100g**' columns:

['energy_100g'] + li_cat_comp + ['salt_100g'] + li_cat_vit_oligo

However, we keep the rows with no nutritional data in the **li_cat_nutri** columns, as we are going to try to reconstruct those data later.

In [0]:
# Dropping rows with no numerical data filled among 'cruc_num_col' columns
cruc_num_col = ['energy_100g'] + li_cat_comp + ['salt_100g'] + li_cat_vit_oligo
print("nb of rows before:", dfcopy.shape[0])
print(nb_rows_allna(dfcopy, cruc_num_col)[0], "rows to drop")
dfcopy.drop(index=nb_rows_allna(dfcopy, cruc_num_col)[1], inplace=True)
print("nb of rows after:", dfcopy.shape[0])

In [0]:
speak('Dropped inoperable rows')

### 1.5 Dealing with duplicates
At first we are going to deal with 'absolute' duplicates (i.e. rows that are identical on every column). We'll simply keep the first row.


In [0]:
# 1 - Checking for duplicates on all columns
df_dup_all = dfcopy[dfcopy.duplicated(keep=False)]
print("Nb of duplicated rows on all columns:", df_dup_all.shape[0])

In [0]:
# -> Keeping the first duplicated row
print("before", dfcopy.shape[0])
dfcopy = dfcopy[~dfcopy.duplicated(keep='first')]
print("after", dfcopy.shape[0])

Then, to deal with the duplicates on different subsets of the columns (['code'] and key_cols), we are going to define a function that combines all the duplicated rows into one. The main row will be the one that has the greatest number of unique non-null values, and it will be enriched with data from the other rows where available (using pandas' `combine_first`).

In [0]:
# Function combining all dupl. rows,
# '1st' is the one with highest unique notna val.
def comb_dup_rows(df, cols):
  """Merge each group of rows sharing the same values on ``cols`` into one row.

  Among each group of duplicates:
   - counts the unique non-NaN values of each row,
   - selects the row with the most of them (first one in case of a tie),
   - fills its NaN cells with the values of the other rows of the group
     (combine_first, applied in index order).

  Parameters
  ----------
  df : DataFrame holding the duplicated rows; its index labels are assumed
       to be unique.
  cols : list of column labels defining a group of duplicates.

  Returns
  -------
  (cpt, l_ind, df_f) :
   - cpt : number of groups processed,
   - l_ind : index labels of the rows that are not kept (rows to drop),
   - df_f : one combined row per group, labelled with the index of the
     selected row, with columns in the same order as ``df``.

  NOTE(review): groupby leaves out rows holding NaN in one of ``cols``
  (pandas default ``dropna=True``), so such rows are never combined —
  confirm this is intended.
  """
  uniq_notna = lambda x:pd.notna(x.unique()).sum()
  l_ind = []
  rows = []
  cpt = 0
  # observed=True: with categorical keys, only combinations that actually
  # occur are visited.
  for _, df_g in df.groupby(by=cols, observed=True): # loop on all grps of dup.
    if df_g.empty:
      continue
    ser_gr = df_g.apply(uniq_notna, axis=1)
    # idxmax returns the index LABEL of the best row; np.argmax on a Series
    # returns a position, which is wrong for both list.remove and .loc below.
    idx_max = ser_gr.idxmax()
    ind = list(ser_gr.index)
    ind.remove(idx_max)
    df_comb = df_g.loc[idx_max]
    for i in ind: # loop on all dupl. except that with max unique not na
      df_comb = df_comb.combine_first(df_g.loc[i])
    rows.append(df_comb.rename(idx_max))
    l_ind += ind
    cpt += 1
  # Build the frame once (no row-by-row append) and keep the reordered
  # result: reindex returns a new object, it does not modify in place.
  df_f = pd.DataFrame(rows).reindex(columns=df.columns)
  return cpt, l_ind, df_f

Let's apply this function to dfcopy, with the subset ['code'] for the duplicates:

In [0]:
# 2 - Checking for duplicates on 'code'
df_dup = dfcopy[dfcopy.duplicated(subset=['code'],keep=False)]
print("Nb of duplicated rows on 'code':", df_dup.shape[0])

In [0]:
# duplicates on 'code' column
# -> Keeping the combined row, deleting the others
print("Shape of dfcopy, before", df_dup.shape[0])
nb_gr, ind_drop, df_dup_f = comb_dup_rows(df_dup, ['code'])
df_dup.drop(ind_drop, inplace=True)
df_dup.update(df_dup_f, overwrite=True)
print("Nb of groups of duplicates :", nb_gr)
print("Nb of duplicates :", df_dup.shape[0])
print("Nb of rows to delete:", len(ind_drop))
print("Shape of dfcopy, after", df_dup.shape[0])

In [0]:
# Changing dfcopy
dfcopy.drop(ind_drop, inplace=True)
dfcopy.update(df_dup_f, overwrite=True)

In [0]:
speak('dropping duplicates (on column code)')

29 duplicates have been deleted, and 29 rows updated.

Let's apply this function to dfcopy, with the subset key_cols for the duplicates:

In [0]:
dfcopy.groupby(['code_val']).size()

In [0]:
# 3 - Checking for duplicates on key_cols
key_cols = ['product_name', 'quantity_num', 'quantity_unit', 'brands']
m_dup_keys = dfcopy.duplicated(subset=key_cols,keep=False)
m_c_inval = dfcopy['code_val']==False
df_dup_k_c_val = dfcopy[m_dup_keys & ~m_c_inval]
df_dup_k_c_inval= dfcopy[m_dup_keys & m_c_inval]
print(f"Nb of duplicated rows on key_cols with VALID code: {df_dup_k_c_val.shape[0]}" )
print(f"Nb of duplicated rows on key_cols with INVALID code: {df_dup_k_c_inval.shape[0]}")

If there are duplicates with different codes, and at least one of them is valid and one invalid, we want to keep only the rows with valid codes:

In [0]:
# Finding duplicates with at least 1 row with valid code AND 1 invalid code
# (.any() keeps a group as soon as one row of each kind is present; the
# former `sum(...) > 1` test required at least two rows of each kind)
df_mixed_dup = dfcopy[m_dup_keys].groupby(by=key_cols)\
                .filter(lambda x: (x['code_val']==True).any()\
                        and (x['code_val']==False).any())

In [0]:
# df_mixed_dup.sort_values(by=key_cols)
gb = df_mixed_dup.groupby(by=key_cols)
print(f"Nb of rows with mixed val/inval codes: {sum(gb.size())}")
print(f"Nb of groups of duplicates: {len(gb)}")

In [0]:
# shows example of duplicates on key_cols with mixed 'code_val'
gb.get_group(list(gb.groups.keys())[0])

In [0]:
# Dropping the rows with invalid codes in each group
# Every row of df_mixed_dup belongs to a mixed valid/invalid group, so its
# invalid-code rows can be dropped in one call instead of looping over the
# groups (the loop rebuilt the full list of group keys at every iteration).
print("Shape of dfcopy, before", dfcopy.shape[0])
ind = df_mixed_dup[df_mixed_dup['code_val']==False].index
dfcopy.drop(index=ind, inplace=True)
print("Shape of dfcopy, after", dfcopy.shape[0])

In [0]:
# Refreshing the list
m_dup_keys = dfcopy.duplicated(subset=key_cols,keep=False)
m_c_inval = dfcopy['code_val']==False
df_dup_k_c_val = dfcopy[m_dup_keys & ~m_c_inval]
df_dup_k_c_inval= dfcopy[m_dup_keys & m_c_inval]
print(f"Nb of duplicated rows on key_cols with VALID code: {df_dup_k_c_val.shape[0]}" )
print(f"Nb of duplicated rows on key_cols with INVALID code: {df_dup_k_c_inval.shape[0]}")

In [0]:
# Duplicates on key_cols column with invalid codes
# -> Keeping the combined row, deleting the others
print("Shape of df_dup_k_c_inval, before", df_dup_k_c_inval.shape[0])
nb_gr, ind_drop, df_dup_f = comb_dup_rows(df_dup_k_c_inval, key_cols)
df_dup_k_c_inval.drop(ind_drop, inplace=True)
df_dup_k_c_inval.update(df_dup_f, overwrite=True)
print("Nb of groups of duplicates :", nb_gr)
print("Nb of duplicates :", df_dup_k_c_inval.shape[0])
print("Nb of rows to delete:", len(ind_drop))
print("Shape of df_dup_k_c_inval, after", df_dup_k_c_inval.shape[0])

In [0]:
# Changing dfcopy
dfcopy.drop(ind_drop, inplace=True)
dfcopy.update(df_dup_k_c_inval, overwrite=True)

In [0]:
# list(df.columns[df.columns.str.contains('nutri')]) 'nutriscore_score', 'nutrition-score-fr_100g'

## 2 Cleaning numerical data

In [0]:
dfcopy.select_dtypes(include=[float]).head()

### 2.1 Dealing with outliers

In [0]:
# Numerical '_100g' columns, cast to float, in their own frame
cols_100g = list(dfcopy.columns[dfcopy.columns.str.contains('_100g')])
df_100g = dfcopy.loc[:,cols_100g].astype(float).copy()
# Nutritional score (as float) next to its categorical companions.
# axis=1 places the columns side by side, one row per product; the default
# axis=0 stacked the Series under the frame, which doubled the number of
# rows and filled the result with NaN.
df_cat_nutri = pd.concat([dfcopy.loc[:,'nutriscore'].astype(float),
                          dfcopy.loc[:,['nutrigrade','pnns_gp1', 'main_categ_en']]],
                         axis=1)
df_cat_nutri.columns=['nutriscore','nutrigrade','pnns_gp1', 'main_categ_en']
df_cat_nutri.head()

In [0]:
dfcopy.loc[:,'nutriscore'].isna().sum(), df_cat_nutri.loc[:,'nutriscore'].isna().sum()

#### 2.1.1 columns about nutriscores

In [0]:
speak("attention savastoppé")

In [0]:
# Checking if there are rows where 'nutriscore' is different from 'nutriscore-fr'
# (only rows where both are filled are compared). Both masks cover the whole
# of dfcopy, so they stay aligned with the frame they index; the former
# version applied the full-length mask to an already filtered frame.
cond = dfcopy['nutriscore']==dfcopy['nutriscore-fr']
both_filled = dfcopy[['nutriscore', 'nutriscore-fr']].notna().all(axis=1)
dfcopy[both_filled & ~cond].shape[0]

Column 'nutriscore' is not different from 'nutriscore-fr'.
 We merge the columns and delete 'nutriscore-fr'




In [0]:
dfcopy['nutriscore'] = dfcopy["nutriscore-fr"].combine_first(dfcopy["nutriscore"])
dfcopy.drop(columns=["nutriscore-fr"], inplace=True)

The nutrigrade is assigned to a product after consideration of :
- its nutriscore
- whether it's solid food or beverage 

In [0]:
dfcopy.groupby('quantity_type').size()

In [0]:
# dfcopy.boxplot(by=['nutrigrade', 'quantity_type'])

In [0]:
gb = dfcopy.groupby(by=['nutrigrade', 'quantity_type'])

"Outil_Calcul_Nutri-Score_SpF-030120" :
https://www.santepubliquefrance.fr/determinants-de-sante/nutrition-et-activite-physique/articles/nutri-score (tableur excel)

1823 products are not identified as liquid/beverage or solid/food.
Let's see if we can guess from the categories columns. 

#### 2.2.2 Columns about energy

en

In [0]:
list(df_100g.columns)

In [0]:
df_100g.boxplot(['energy_100g', 'energy-from-fat_100g',
                 'energy-kj_100g', 'energy-kcal_100g'],
                vert=False, figsize=(21,3))

In [0]:
# Distinguish situation 3 columns filled ?
gb = df_100g.notna().groupby(by=['energy_100g', 'energy-kcal_100g', 'energy-kj_100g'])
gb.size()

In [0]:
# group1: all nan, can't do anything for the moment
df_gp1 = df_100g.loc[gb.groups.get((False, False, False))]
df_gp1.shape

In [0]:
# group2: group to send to the model -> is it kj or kcal ?
df_gp2 = df_100g.loc[gb.groups.get((True, False, False))]
df_gp2.shape

In [0]:
# group3: rows where 'energy_100g' (e) and 'energy-kj_100g' (j) are filled
# but 'energy-kcal_100g' is not -> e is expected to be expressed in kJ
df_gp3 = df_100g.loc[gb.groups.get((True, False, True))]
print(df_gp3.shape)
# m_id3: True where the kJ column equals 'energy_100g'
m_id3 = df_gp3.apply(lambda x: x['energy-kj_100g']==x['energy_100g'], axis=1)
df_gp3[~m_id3].shape # rows where they differ (2): a replacement function is still to be written

In [0]:
# group4: rows where 'energy_100g' (e) and 'energy-kcal_100g' (c) are filled
# but 'energy-kj_100g' is not -> e is expected to be expressed in kcal
df_gp4 = df_100g.loc[gb.groups.get((True, True, False))]
print(df_gp4.shape)
# m_id4: True where the kcal column equals 'energy_100g'
m_id4 = df_gp4.apply(lambda x: x['energy-kcal_100g']==x['energy_100g'], axis=1)
df_gp4[~m_id4].shape # rows where they differ (0): a replacement function is still to be written

In [0]:
# group5: rows where e ('energy_100g'), c ('energy-kcal_100g') and
# j ('energy-kj_100g') are all filled: are they the same? which is the
# correct one? to be submitted to the model
df_gp5 = df_100g.loc[gb.groups.get((True, True, True))]
print(df_gp5.shape)
f_e_eq_c = lambda x: x['energy-kcal_100g']==x['energy_100g']
f_e_eq_j = lambda x: x['energy-kj_100g']==x['energy_100g']
# Row-wise masks: e equals c / e equals j
m_id_c5 = df_gp5.apply(f_e_eq_c, axis=1)
m_id_j5 = df_gp5.apply(f_e_eq_j, axis=1)
# A: e equals both c and j; B: e equals neither; C: e equals exactly one
df_gp5_A = df_gp5[(m_id_c5&m_id_j5)]
df_gp5_B = df_gp5[(~m_id_c5)&(~m_id_j5)]
df_gp5_C = df_gp5[(~(m_id_c5&m_id_j5))&(~((~m_id_c5)&(~m_id_j5)))]
# NOTE(review): the two masks below are indexed on df_gp5, not on df_gp5_C
df_gp5_C_c = df_gp5_C[m_id_c5]
df_gp5_C_j = df_gp5_C[m_id_j5]
print(df_gp5_A.shape) # rows where j, c and e are all equal (approx. 2,000)
print(df_gp5_B.shape) # rows where e, c and j are all different (0)
print(df_gp5_C.shape) # rows where j, e and c are neither all equal nor all different (at least one equal pair) (approx. 20,000)
print(df_gp5_C_c.shape)
print(df_gp5_C_j.shape)

In [0]:
# Working notes (written as comments: the original triple-quoted string was
# never closed, which made this cell a SyntaxError).
# The triples below are the notna flags of
# ('energy_100g', 'energy-kcal_100g', 'energy-kj_100g').
#
# - Rows of type (False, True, True) do not exist.
# - Prepare a model to detect the energy unit for rows of type
#   (True, False, False).
# - Discard rows of type (False, False, False).
# - First turn rows of type (True, True, True) into (True, True, False)
#   or (True, False, True),
#   then keep the rows of type
#   (True, False, True) or
#   (True, True, False).
# ------------------
# - The dataset will have 4 columns and 1 target.
# - Target to create: kcal, 0 or 1.
# - Set up a random forest model.

In [0]:
# Gathering rows and columns to form a dataset
sel_en_cols = ['energy_100g','fat_100g', 'carbohydrates_100g', 'proteins_100g',
            'fiber_100g', 'alcohol_100g', 'pnns_gp1', 'main_categ_en', 'nutriscore', 'nutrigrade']
# Rows whose energy unit is known: group 3 (energy equals the kJ column) and
# group 4 (energy equals the kcal column). Group 3 was concatenated with
# itself, which left group 4 out and duplicated every row.
train_rows = pd.concat([df_gp3[m_id3], df_gp4[m_id4]])
# df_gp3/df_gp4 come from df_100g and only hold '_100g' columns; the other
# selected columns are joined from dfcopy on the row index.
# NOTE(review): assumes dfcopy still holds these columns and that its index
# is unique — confirm.
extra_cols = [c for c in sel_en_cols if c not in train_rows.columns]
train_set = train_rows.join(dfcopy[extra_cols])[sel_en_cols]

In [0]:
# Remove the calculated energy columns if a previous run added them.
# errors='ignore' keeps the cell runnable on a fresh kernel, where these
# columns do not exist yet (the lines inserting them are commented out).
train_set.drop(columns=['energy_calc_kcal_100g', 'energy_calc_kj_100g'],
               inplace=True, errors='ignore')

In [0]:
train_set[['fat_100g', 'proteins_100g','carbohydrates_100g']].astype(float)

In [0]:
# Creating two calculated versions of energy (kcal) and (kj)
def calc_en_from_c_f_p(df):
  """Estimate the energy per 100 g from the macronutrient columns.

  Uses 9 kcal/g for fat, 4 kcal/g for carbohydrates and proteins,
  1.9 kcal/g for fiber and 7 kcal/g for alcohol; missing values count as 0.

  Parameters
  ----------
  df : DataFrame with columns 'fat_100g', 'carbohydrates_100g',
       'proteins_100g', 'fiber_100g' and 'alcohol_100g'.

  Returns
  -------
  (res_c, res_j) : Series of the energy in kcal and in kJ.
  """
  # fillna is applied to the column, not to the column label: the former
  # `df['proteins_100g'.fillna(0)]` raised AttributeError on the str.
  res_c = df['fat_100g'].fillna(0)*9\
          +(df['carbohydrates_100g'].fillna(0)+df['proteins_100g'].fillna(0))*4\
          +df['fiber_100g'].fillna(0)*1.9+df['alcohol_100g'].fillna(0)*7
  # 1 kcal = 4.184 kJ (the factor 418.4 was off by two orders of magnitude)
  res_j = res_c*4.184
  return res_c, res_j

ser_c,ser_j = calc_en_from_c_f_p(train_set)
# train_set.insert(1, 'energy_calc_kcal_100g', ser_c)
# train_set.insert(2, 'energy_calc_kj_100g', ser_j)
# train_set.sample(5)

calc_en_from_c_f_p(train_set)[0]


In [0]:
df_100g.boxplot(column=['energy_100g'], by='cal_j',vert=False, figsize=(21,3))

In [0]:
# @interact
# def adjust_a(a=(0,1,0.1)):
#   return [a*x for x in range(0,35000)]
  
# ax0 = df_100g.plot(kind='scatter', x='energy-kj_100g', y='energy-kcal_100g', c='red', s=5)
# df_100g.plot(kind='line', x='energy-kj_100g', y=adjust_a(), c='grey', ax=ax0)

In [0]:
ax1 = df_100g.plot(kind='scatter', x='energy_100g', y='energy-kcal_100g', c='red', s=50)
for col,c,s in [('energy-kj_100g','b',25), ('energy_100g','grey',1)]:
  df_100g.plot.scatter(x='energy_100g', y=col, s=s, c=c,ax=ax1,)

In [0]:
speak("Abracadabraaaaa é voilà ")