# Cleaning notebook

## Imports

In [None]:
import os
import re
from typing import List

import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

## Dataset loading and printing

In [None]:
# Download the zipped products export only when the archive is not already in the working directory.
if not os.path.exists('fr.openfoodfacts.org.products.csv.zip'):
    !wget https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/parcours-data-scientist/P2/fr.openfoodfacts.org.products.csv.zip

In [None]:
# Extract the archive and rename the CSV to dataset.csv, unless dataset.csv already exists.
if not os.path.exists('dataset.csv'):
    !unzip fr.openfoodfacts.org.products.csv.zip
    !mv fr.openfoodfacts.org.products.csv dataset.csv
# Peek at the first three raw lines of the file.
!head -n 3 dataset.csv

In [None]:
# Read the tab-separated export only when `df` is not already defined, so that
# re-running this cell does not reload the file needlessly.
if 'df' not in locals():
    df = pd.read_csv('./dataset.csv', sep='\t')
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
df.info(verbose=True)
df.head()

## Features filtering

* Keep only the products sold in France
* Keep certain columns

In [None]:
def fr_filter(s: str) -> bool:
    """Tell whether a raw ``countries`` value lists France.

    The value is lower-cased and split on ``;``, ``,`` and ``:``, with or
    without surrounding whitespace, so ``"France"``, ``"en:FR"``,
    ``"France,Suisse"`` and ``"Belgique, France"`` are all recognised.
    A token has to match exactly: ``"Africa"`` is rejected even though it
    contains ``"fr"``.

    Parameters
    ----------
    s : str
        Raw cell value. Anything that is not a string (for instance the float
        NaN pandas uses for missing cells) is treated as "not France".

    Returns
    -------
    bool
        True when one of the tokens is ``"fr"`` or ``"france"``.
    """
    # Explicit type check instead of a bare `except:` that hid every error.
    if not isinstance(s, str):
        return False
    # Whitespace around separators is optional; the previous pattern required
    # exactly "; ", ", " or " :" and therefore missed "en:FR" or "a,b" values.
    # NOTE(review): a value such as "fr:xyz" also yields the token "fr" —
    # confirm this is acceptable for the data at hand.
    tokens = re.split(r'\s*[;,:]\s*', s.strip().lower())
    return 'fr' in tokens or 'france' in tokens
        
def column_filter(columns: List[str], to_keep: List[str]=None):
    """Return the column names that contain at least one wanted fragment.

    Parameters
    ----------
    columns : iterable of str
        Candidate column names; their order is preserved in the result.
    to_keep : list of str, optional
        Substrings to look for. When omitted, the fragments 'image', 'name',
        'category', '_100g', 'grade' and 'score' are used.

    Returns
    -------
    list of str
        Every column containing any of the fragments, each listed once.
    """
    fragments = ['image', 'name', 'category', '_100g', 'grade', 'score'] if to_keep is None else to_keep
    return [name for name in columns if any(fragment in name for fragment in fragments)]

In [None]:
# Keep only the rows whose `countries` value passes fr_filter.
df_fr = df[df["countries"].apply(fr_filter)]
# Release the full frame; the loading cell re-reads it when `df` is no longer defined.
del df

# NOTE(review): presumably keeps only the columns that are at least 1% filled — confirm against the missingno docs.
df_fr = msno.nullity_filter(df_fr, 'top', 0.01)
# Restrict to the columns matching column_filter's default fragments.
df_fr_filtered = df_fr[column_filter(df_fr.columns.values)]
del df_fr

df_fr_filtered.head()

In [None]:
# missingno matrix plot of the missing values in the filtered frame.
msno.matrix(df_fr_filtered)

In [None]:
# missingno bar plot of the missing values in the filtered frame.
msno.bar(df_fr_filtered)

In [None]:
# missingno heatmap of the missing values in the filtered frame.
msno.heatmap(df_fr_filtered)

In [None]:
# missingno dendrogram of the missing values in the filtered frame.
msno.dendrogram(df_fr_filtered)

In [None]:
# Number of missing values per column.
df_fr_filtered.isna().sum()

In [None]:
score_fr = df_fr_filtered['nutrition-score-fr_100g']
score_uk = df_fr_filtered['nutrition-score-uk_100g']
# NaN != NaN evaluates to True, so rows where both scores are missing must be
# excluded; otherwise they are all counted as "different".
both_missing = score_fr.isna() & score_uk.isna()
print(((score_fr != score_uk) & ~both_missing).sum())
# Rows where exactly one of the two scores is missing.
print((score_fr.isna() != score_uk.isna()).sum())

In [None]:
# Plot the French nutrition score against the UK one.
sns.relplot(data=df_fr_filtered, x="nutrition-score-fr_100g", y="nutrition-score-uk_100g")

In [None]:
# For the grade and the energy columns, count the rows whose missing/present
# status differs from that of the French nutrition score.
score_missing = df_fr_filtered['nutrition-score-fr_100g'].isna()
for other_column in ('nutrition_grade_fr', 'energy_100g'):
    print((score_missing != df_fr_filtered[other_column].isna()).sum())

In [None]:
# Box plot of the French nutrition score for each nutrition grade.
sns.catplot(data=df_fr_filtered, x="nutrition-score-fr_100g", y="nutrition_grade_fr", kind="box")

## Row filtering

* Based on numerical features

In [None]:
# All columns whose name contains '_100g'.
numerical_columns = column_filter(df_fr_filtered.columns.values, to_keep=['_100g'])
# Count the rows that are entirely empty over three nested column subsets:
# all of them, all but the last two, and all but the first and the last two.
for column_subset in (numerical_columns, numerical_columns[:-2], numerical_columns[1:-2]):
    print(df_fr_filtered[column_subset].isna().all(axis='columns').sum())
# Rows with no value at all in the narrowest subset.
mask = df_fr_filtered[numerical_columns[1:-2]].isna().all(axis='columns')

In [None]:
# Drop the rows flagged by `mask` in the previous cell.
df_fr_filtered = df_fr_filtered[~mask]

In [None]:
# Rows holding a negative value in any numerical column except the last two.
mask = (df_fr_filtered[numerical_columns[:-2]] < 0).any(axis='columns')
print(mask.sum())

In [None]:
# Drop the rows flagged by `mask` in the previous cell.
df_fr_filtered = df_fr_filtered[~mask]

In [None]:
# Rows holding a value above 100 in any numerical column except the first and the last two.
mask = (df_fr_filtered[numerical_columns[1:-2]] > 100).any(axis='columns')
print(mask.sum())
# Show the offending rows next to their product name.
df_fr_filtered.loc[mask, ["product_name"]+numerical_columns]

In [None]:
# Drop the rows flagged by `mask` in the previous cell.
df_fr_filtered = df_fr_filtered[~mask]

## Missing values

### Zero Imputation

* Missing composition values (every `_100g` column except the energy and nutrition-score ones) are replaced by 0

In [None]:
# Every numerical column except the nutrition-score and energy ones.
imputed_columns = [c for c in numerical_columns if 'nutrition-score' not in c and 'energy' not in c]
# Fill those columns with 0 through a per-column fillna that returns a new
# frame. Writing with `.loc[:, cols] = ...` into a frame obtained by
# boolean-mask slicing can trigger SettingWithCopyWarning.
df_fr_filtered = df_fr_filtered.fillna({column: 0 for column in imputed_columns})
df_fr_filtered.isna().sum()

In [None]:
# Summary statistics from DataFrame.describe(), kept in df_desc and displayed.
df_desc = df_fr_filtered.describe()
df_desc

### Label Fusion

* Merge the different labels
* Merge the image URLs

In [None]:
# Number of rows without a product_name but with a generic_name.
(df_fr_filtered['product_name'].isna() & ~df_fr_filtered['generic_name'].isna()).sum()

In [None]:
# Number of rows without a main_category_fr but with a main_category.
(df_fr_filtered['main_category_fr'].isna() & ~df_fr_filtered['main_category'].isna()).sum()

## ML imputation

* Recover the nutrition scores (numerical) and the nutrition grade (categorical)

The two features are linked, but not directly: the numerical feature needs a regression, whereas the categorical feature needs a classification.

In [None]:
# Rows where both the French nutrition score and the energy are known.
bool_index = (df_fr_filtered['nutrition-score-fr_100g'].notna() & df_fr_filtered['energy_100g'].notna())

# Feature matrix: all numerical columns. Later cells use columns 0, -2 and -1 as regression targets.
# NOTE(review): those positions presumably are energy_100g and the two nutrition scores — confirm against numerical_columns.
X_num = np.array(df_fr_filtered.loc[bool_index, numerical_columns])
# Classification target: the nutrition grade of the same rows.
y_clf = np.array(df_fr_filtered.loc[bool_index, 'nutrition_grade_fr'])

# 80/20 split with a fixed seed.
X_num_train, X_num_test, y_clf_train, y_clf_test = train_test_split(X_num, y_clf, test_size=0.2, random_state=42)

In [None]:
def reg_metrics(y_true, y_pred):
    """Print regression metrics for a prediction, one ``name: value`` line each.

    Parameters
    ----------
    y_true : array-like
        Reference values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    None
        The mean squared error, mean absolute error and R2 score are printed
        with two decimals.
    """
    # Direct function references instead of eval() on a built string: same
    # output, but no dynamic code execution and typos fail loudly.
    metrics = {
        'mean_squared_error': mean_squared_error,
        'mean_absolute_error': mean_absolute_error,
        'r2_score': r2_score,
    }
    for metric_name, metric in metrics.items():
        score = metric(y_true, y_pred)
        print(f"{metric_name}: {score:.2f}")

In [None]:
def clf_metrics(y_true, y_pred):
    """Print classification metrics for a prediction, one ``name: value`` line each.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    None
        Accuracy, balanced accuracy, and weighted-average F1, precision and
        recall are printed with two decimals.
    """
    # Each entry pairs the scoring function with its keyword arguments, which
    # replaces the former eval() on a built string.
    metrics = {
        'accuracy': (accuracy_score, {}),
        'balanced_accuracy': (balanced_accuracy_score, {}),
        'f1': (f1_score, {'average': 'weighted'}),
        'precision': (precision_score, {'average': 'weighted'}),
        'recall': (recall_score, {'average': 'weighted'}),
    }
    for metric_name, (metric, kwargs) in metrics.items():
        score = metric(y_true, y_pred, **kwargs)
        print(f"{metric_name}: {score:.2f}")

### KNN imputation

Imputer based on a KNN regressor.

In [None]:
%%time
# Fit a 3-neighbour, distance-weighted KNN imputer on the training features.
imputer = KNNImputer(n_neighbors=3, weights='distance')
imputer.fit(X_num_train)

In [None]:
%%time
# Work on a copy so the reference values in X_num_test stay intact.
X_num_test_knn = np.copy(X_num_test)
# Blank out columns 0, -2 and -1 so that the imputer has to reconstruct them.
X_num_test_knn[:,(0,-2,-1)] = np.nan
# Keep only the reconstructed columns as the prediction.
y_pred_knn = imputer.transform(X_num_test_knn)[:,(0,-2,-1)]

In [None]:
# Compare the KNN reconstruction with the true values of columns 0, -2 and -1.
reg_metrics(X_num_test[:,(0,-2,-1)], y_pred_knn)

### RandomForest Regressor

In [None]:
%%time
# Random forest predicting columns 0, -2 and -1 from the remaining columns (1:-2).
regr = RandomForestRegressor(max_depth=50, random_state=0)
regr.fit(X_num_train[:,1:-2], X_num_train[:,(0,-2,-1)])

In [None]:
%%time
# Predict the three target columns for the test rows.
y_pred_rfg = regr.predict(X_num_test[:,1:-2])

In [None]:
# Compare the random forest prediction with the true values of columns 0, -2 and -1.
reg_metrics(X_num_test[:,(0,-2,-1)], y_pred_rfg)

### Classifier imputation

In [None]:
%%time
# Class-weighted random forest predicting the grade from columns 1:-2 only.
clf = RandomForestClassifier(max_depth=50, random_state=0, class_weight='balanced')
clf.fit(X_num_train[:,1:-2], y_clf_train)

In [None]:
%%time
# Predict the grade of the test rows from columns 1:-2.
y_pred_rfc = clf.predict(X_num_test[:,1:-2])

In [None]:
# Classification metrics of the model trained on columns 1:-2.
clf_metrics(y_clf_test, y_pred_rfc)

In [None]:
%%time
# Same classifier settings, trained on every numerical column.
clf_full = RandomForestClassifier(max_depth=50, random_state=0, class_weight='balanced')
clf_full.fit(X_num_train, y_clf_train)

In [None]:
%%time
# Predict the grade of the test rows from every numerical column.
y_pred_rfc_full = clf_full.predict(X_num_test)

In [None]:
# Classification metrics of the model trained on every numerical column.
clf_metrics(y_clf_test, y_pred_rfc_full)

### Actual imputation

In [None]:
# Rows missing the French nutrition score or the energy.
bool_index = (df_fr_filtered['nutrition-score-fr_100g'].isna() | df_fr_filtered['energy_100g'].isna())
# Skip the call when nothing is missing: the imputer rejects an empty input.
if bool_index.any():
    # The imputer was fitted on a bare array, so a bare array is passed here
    # as well; this avoids scikit-learn's feature-name mismatch warning.
    df_fr_filtered.loc[bool_index, numerical_columns] = imputer.transform(
        df_fr_filtered.loc[bool_index, numerical_columns].to_numpy()
    )

In [None]:
# Rows missing the nutrition grade.
bool_index = df_fr_filtered['nutrition_grade_fr'].isna()
# Skip the call when nothing is missing: the classifier rejects an empty input.
if bool_index.any():
    # The classifier was fitted on a bare array, so a bare array is passed
    # here as well; this avoids scikit-learn's feature-name mismatch warning.
    df_fr_filtered.loc[bool_index, 'nutrition_grade_fr'] = clf_full.predict(
        df_fr_filtered.loc[bool_index, numerical_columns].to_numpy()
    )

## Outliers

In [None]:
# Summary statistics after imputation; the outlier bounds below are derived from them.
df_imputed_desc = df_fr_filtered.describe()
df_imputed_desc

In [None]:
# Width of the accepted band, in standard deviations on each side of the mean.
tau = 5
column_mean = df_imputed_desc.loc['mean',:]
column_spread = tau * df_imputed_desc.loc['std',:]
# Per-column bounds: mean plus or minus tau standard deviations.
upper_bound = column_mean + column_spread
lower_bound = column_mean - column_spread

In [None]:
# Compare only the columns the bounds were computed for. The full frame also
# holds text columns, and comparing it with `<` / `>` against a Series that
# does not cover every column raises "Operands are not aligned" in pandas.
print(df_fr_filtered[lower_bound.index].lt(lower_bound, axis='columns').any(axis='columns').sum())
print(df_fr_filtered[upper_bound.index].gt(upper_bound, axis='columns').any(axis='columns').sum())

In [None]:
# Same mean plus or minus tau standard deviations band, for a single column.
test_feature = 'fiber_100g'

feature_mean = df_imputed_desc.loc['mean', test_feature]
feature_spread = tau * df_imputed_desc.loc['std', test_feature]
upper_bound_en = feature_mean + feature_spread
lower_bound_en = feature_mean - feature_spread

In [None]:
# Number of rows below, then above, the band for `test_feature`.
print((df_fr_filtered[test_feature] < lower_bound_en).sum())
print((df_fr_filtered[test_feature] > upper_bound_en).sum())

## Saving

In [None]:
# Write the cleaned frame to cleaned_dataset.csv.
# NOTE(review): to_csv is called with its defaults, so the row index is presumably written as well — confirm downstream readers expect it.
df_fr_filtered.to_csv('cleaned_dataset.csv')