### Import

In [104]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import KNNImputer

import sys
sys.path.append('../scripts')

from preprocessing import clear_missing_data, delete_columnns_treshold, non_useful_columns, clear_missing_line, get_numerical, get_categorical, removal_of_duplicates
from pretraitement import imputation_of_categorical_val, imputation_of_numerical_val

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Collecte de données

### Chargement du dataset

In [105]:
# Je charge le dataset en ne prenant que les 100 premières lignes afin de tester les fonctions
path = "../data/dataset.csv" # Chemin du dataset 
df = pd.read_csv(path, nrows=1000, sep=',',encoding="utf-8")

In [106]:
df.shape

(1000, 207)

### Préparation des données

##### Nettoyage des colonnes

In [107]:
# Suppressions des colonnes avec 100% données manquantes
df = clear_missing_data(df)

Nombre de colonnes vides (100% de valeurs manquantes) : 73


In [108]:
# Suppressions des colonnes avec 70% données manquantes
df = delete_columnns_treshold(df)

Les colonnes supprimées sont : Index(['starch_100g', 'omega-3-fat_100g',
       'fruits-vegetables-nuts-estimate_100g', 'cocoa_100g', 'molybdenum_100g',
       'chromium_100g', 'polyunsaturated-fat_100g', 'iodine_100g',
       'added-sugars_100g', 'polyols_100g', 'vitamin-k_100g', 'selenium_100g',
       'packaging_text', 'monounsaturated-fat_100g', 'caffeine_100g',
       'manganese_100g', 'phosphorus_100g', 'biotin_100g',
       'pantothenic-acid_100g', 'vitamin-b9_100g', 'copper_100g',
       'vitamin-e_100g', 'vitamin-b12_100g', 'vitamin-pp_100g',
       'vitamin-b2_100g', 'brand_owner', 'zinc_100g', 'vitamin-b1_100g',
       'first_packaging_code_geo', 'cities_tags', 'vitamin-c_100g',
       'vitamin-b6_100g', 'vitamin-a_100g', 'vitamin-d_100g', 'magnesium_100g',
       'alcohol_100g', 'emb_codes_tags', 'emb_codes', 'generic_name',
       'iron_100g', 'trans-fat_100g', 'calcium_100g', 'cholesterol_100g',
       'potassium_100g', 'traces', 'traces_tags', 'traces_en', 'allergens',
 

In [109]:
# Suppressions des colonnes non pertinentes
df = non_useful_columns(df)



Les colonnes supprimées sont : ['url', 'created_t', 'created_datetime', 'last_modified_t', 'last_modified_datetime', 'last_modified_by', 'last_updated_t', 'brands_tags', 'last_updated_datetime', 'countries_tags', 'countries_en', 'states_tags', 'states_en', 'image_url', 'image_small_url', 'image_nutrition_url', 'image_nutrition_small_url']


##### Nettoyage des lignes

In [110]:
# Suppressions des lignes vides

df = clear_missing_line(df)

Nombre de lignes vides (100% de valeurs manquantes) : 0


#### Supression des doublons

In [111]:
# Netoyage des doublons
df = removal_of_duplicates(df)

### Imputation des données

In [112]:
# Récupération des colonnes numériques
df_num = df[get_numerical(df)]

# Récupération des colonnes catégorielles
df_cat = df[get_categorical(df)]



In [113]:
# Imputation des valeurs manquantes pour les colonnes numériques
df_num = imputation_of_numerical_val(df_num)

In [114]:
# Imputation des valeurs manquantes pour les colonnes catégorielles
df_cat = imputation_of_categorical_val(df_cat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_params[categorical_cols] = imputer.fit_transform(df_params[categorical_cols])


In [115]:
# Assurer que df_num et df_cat ont les mêmes index que df
df[df_num.columns] = df_num
df[df_cat.columns] = df_cat


In [116]:
df

Unnamed: 0.1,Unnamed: 0,code,creator,product_name,quantity,brands,categories,categories_tags,categories_en,labels,...,energy_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g
0,0,54,kiliweb,Limonade artisanale a la rose,700ml,Phillips Gourmet,Beverages,"en:beverages-and-beverages-preparations,en:bev...","Beverages and beverages preparations,Beverages","No GMOs, Non GMO project",...,819.4,7.517876,2.578927,12.114000,5.311864,3.013989,26.270000,0.989011,0.395605,19.166958
1,1,63,kiliweb,Tablette Tanzanie,700ml,Phillips Gourmet,Beverages,"en:beverages-and-beverages-preparations,en:bev...","Beverages and beverages preparations,Beverages","No GMOs, Non GMO project",...,819.4,7.517876,2.578927,12.114000,5.311864,3.013989,26.270000,0.780011,0.312005,0.000000
2,2,114,kiliweb,Chocolate n 3,80 g,Jeff de Bruges,Beverages,"en:beverages-and-beverages-preparations,en:bev...","Beverages and beverages preparations,Beverages","Point Vert, Fabriqué en France",...,2415.0,44.000000,28.000000,30.000000,27.000000,8.572500,7.100000,0.025000,0.010000,19.166667
3,3,1,inf,KOJI MISO PASTE,300g,UMAMI,"Supplements, Vegtable","en:supplements,en:vegtable","Supplements,Vegtable","No gluten, Vegetarian, No artificial flavors, ...",...,874.0,6.000000,1.000000,21.400000,11.100000,2.000000,11.200000,11.800000,4.720000,100.000000
4,4,105,kiliweb,Paleta gran reserva - Sierra nevada-,750ml,AdvoCare,Beverages,"en:beverages-and-beverages-preparations,en:bev...","Beverages and beverages preparations,Beverages","No GMOs, Non GMO project",...,913.4,7.897876,2.468927,10.215853,2.831864,3.313989,34.686904,1.063011,0.425205,0.011335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,105000066,kiliweb,Herbal Tea Cinnamon,700ml,Phillips Gourmet,"Plant-based foods and beverages, Beverages, Ho...","en:plant-based-foods-and-beverages,en:beverage...","Plant-based foods and beverages,Beverages,Hot ...","No GMOs, Non GMO project",...,0.0,0.000000,0.000000,60.000000,0.000000,0.000000,0.000000,1.000000,0.400000,0.000000
996,996,105000073,usda-ndb-import,"Herbal Tea, Hibiscus",700ml,Lagg's,Beverages,"en:beverages-and-beverages-preparations,en:bev...","Beverages and beverages preparations,Beverages","No GMOs, Non GMO project",...,1117.0,0.000000,2.918000,60.000000,4.066667,5.573333,66.670000,0.337820,0.135128,0.000000
997,997,105000196,usda-ndb-import,Apple & Cinnamon Tea,700ml,Lagg's,"Plant-based foods and beverages, Beverages, Ho...","en:plant-based-foods-and-beverages,en:beverage...","Plant-based foods and beverages,Beverages,Hot ...","No GMOs, Non GMO project",...,0.0,0.000000,2.918000,60.000000,4.066667,5.573333,66.670000,0.337820,0.135128,0.000000
998,998,105000219,usda-ndb-import,"Lagg's, green tea",700ml,Lagg's,"Plant-based foods and beverages, Beverages, Ho...","en:plant-based-foods-and-beverages,en:beverage...","Plant-based foods and beverages,Beverages,Hot ...","No GMOs, Non GMO project",...,0.0,0.000000,2.918000,2.670000,4.066667,5.573333,0.000000,0.000000,0.000000,0.000000


### Entrainement 

In [None]:
#test