# Exploration notebook

* [Imports](#imports)
* [Clean dataset loading](#clean-dataset-loading)
* [Univariate analysis](#univariate-analysis)
    * [Numerical features](#numerical-features)
    * [Categorical features](#categorical-features)
* [Bivariate analysis](#bivariate-analysis)
    * [Nutrigrade analysis](#nutrigrade-analysis)
    * [Correlations](#correlations)
    * [$\chi^2$ test](#chi2-test)
    * [ANOVA](#anova)
* [PCA](#pca)
    * [Plot helpers](#plot-helpers)
    * [Composition features only](#composition-features-only)
    * [All features](#all-features)

<a name="imports"></a>
## Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from typing import List

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy import stats

<a name="clean-dataset-loading"></a>
## Clean dataset loading

In [None]:
# Local path the dataset CSV is downloaded to and then read from.
clean_datset_path = './clean_dataset.csv'
# IPython shell escape: fetch the CSV from Google Drive into the path above
# (-q: quiet, --show-progress: keep the progress bar, -O: output file).
!wget "https://drive.google.com/uc?export=download&id=1q6sYl_Xlny4NfeqazT4top6ujtxh2qE3" -q --show-progress -O "$clean_datset_path"

In [None]:
# Load the downloaded CSV into the DataFrame used by the cells below.
df = pd.read_csv(clean_datset_path)

In [None]:
def column_filter(columns: List[str], paterns_to_keep: List[str]=None):
    """Return the column names that contain at least one of the given patterns.

    Args:
        columns: Column names to filter; their order is preserved.
        paterns_to_keep: Substrings to look for in each column name. When
            ``None``, a built-in default list of patterns is used.

    Returns:
        A new list holding every entry of ``columns`` in which at least one
        pattern occurs as a substring.
    """
    if paterns_to_keep is None:
        paterns_to_keep = ['image', 'name', 'category', '_100g', 'grade', 'score', 'group']
    return [
        name
        for name in columns
        if any(fragment in name for fragment in paterns_to_keep)
    ]

In [None]:
# Show every column name available in the loaded dataset.
print(df.columns.values)
# Keep only the columns whose name contains '_100g'; the rest of the notebook
# treats them as the numerical features.
numerical_columns = column_filter(df.columns.values, ['_100g'])

<a name="univariate-analysis"></a>
## Univariate analysis

<a name="numerical-features"></a>
### Numerical features

In [None]:
# describe() summary of the numerical features, extended with one extra row
# each for variance, skewness and kurtosis.
numerical_part = df[numerical_columns]
df_desc = numerical_part.describe()
for row_name, method_name in (('var', 'var'), ('skew', 'skew'), ('kurt', 'kurtosis')):
    df_desc.loc[row_name] = getattr(numerical_part, method_name)().tolist()
df_desc

In [None]:
# Histogram of the energy feature, using 30 bins.
sns.displot(df, x="energy_100g", bins=30)

In [None]:
# Histograms of the French and UK nutrition scores on one figure, using 30 bins.
sns.displot(df[["nutrition-score-fr_100g", "nutrition-score-uk_100g"]], bins=30)

In [None]:
# Horizontal box plots of numerical_columns[1:-2] (the first and the last two
# numerical columns are left out); outliers are hidden via showfliers=False.
# NOTE(review): the slice presumably drops energy and the two nutrition scores
# because their scale differs - confirm against the actual column order.
fig = plt.figure(figsize=(12,14))
ax = plt.gca()
sns.boxplot(data=df[numerical_columns[1:-2]], orient="h", ax=ax, showfliers=False)

In [None]:
# Horizontal violin plots of numerical_columns[1:-2] (the first and the last
# two numerical columns are left out).
# scale="count" scales each violin's width by its number of observations and
# cut=0 keeps the density estimate within the observed data range.
# `showfliers` is deliberately not passed: it is a box-plot option that
# violinplot does not define.
fig = plt.figure(figsize=(12,14))
ax = plt.gca()
sns.violinplot(data=df[numerical_columns[1:-2]], orient="h", ax=ax, scale="count", cut=0)

<a name="categorical-features"></a>
### Categorical features

In [None]:
def circular_plot(values, labels=None):
    """Draw a circular (polar) bar plot of ``values``, sorted in ascending order.

    Bar heights are rescaled linearly so that 0 maps to ``lower_limit`` and the
    largest value maps to ``upper_limit``. Every bar is annotated with its
    label and its share of the total, as a rounded percentage. The plot is
    drawn on a new matplotlib figure; nothing is returned.

    Args:
        values: Sequence of numbers, one per bar (presumably non-negative
            counts - the scaling is only meaningful in that case).
        labels: Optional sequence of labels, one per value. Defaults to the
            position of each value as a string ("0", "1", ...).

    Raises:
        ValueError: If ``values`` is empty.
    """
    if len(values) == 0:
        raise ValueError("circular_plot needs at least one value")
    if labels is None:
        labels = [str(i) for i in range(len(values))]

    # Order the bars by value (ties are broken by label)
    sorted_values, sorted_labels = zip(*sorted(zip(values, labels)))
    sorted_values = np.array(sorted_values)

    # Initialize the figure
    plt.figure(figsize=(20, 10))
    ax = plt.subplot(111, polar=True)
    plt.axis('off')

    # Constants = parameters controlling the plot layout
    upper_limit = 100   # height given to the largest value
    lower_limit = 30    # height given to 0, also the radius of the inner hole
    label_padding = 4   # gap between the top of a bar and its label

    max_value = sorted_values.max()
    total = sorted_values.sum()

    # Convert each value into plot coordinates:
    # 0 -> lower_limit, max_value -> upper_limit.
    # A zero maximum would divide by zero, so all bars then get lower_limit.
    slope = (upper_limit - lower_limit) / max_value if max_value else 0.0
    heights = slope * sorted_values + lower_limit

    # Width of each bar: the full circle (2*pi) is shared equally
    width = 2 * np.pi / len(sorted_values)

    # Angle each bar is centered on
    angles = [position * width for position in range(1, len(sorted_values) + 1)]

    # Draw bars
    bars = ax.bar(
        x=angles,
        height=heights,
        width=width,
        bottom=lower_limit,
        linewidth=2,
        edgecolor="white",
        color="#61a4b2",
    )

    # Add labels
    for bar, angle, label, value in zip(bars, angles, sorted_labels, sorted_values):
        # Text rotation must be given in degrees
        rotation = np.rad2deg(angle)

        # Flip the labels on the left half of the circle so they stay readable
        if np.pi / 2 <= angle < 3 * np.pi / 2:
            alignment = "right"
            rotation = rotation + 180
        else:
            alignment = "left"

        # Share of the total; 0 when the total itself is 0
        share = value / total * 100 if total else 0.0

        ax.text(
            x=angle,
            y=lower_limit + bar.get_height() + label_padding,
            s=f"{label} ({share:.0f}%)",
            ha=alignment,
            va='center',
            rotation=rotation,
            rotation_mode="anchor")

In [None]:
# Number of rows per 'pnns_groups_2' value, leaving out the rows whose value
# is the literal string "unknown", shown as a circular bar plot.
col_name = 'pnns_groups_2'
mask = df[col_name] != "unknown"
df_count = df[mask].groupby([col_name])[col_name].count()
circular_plot(df_count.values, df_count.index.values)

In [None]:
# Number of rows per 'nutrition_grade_fr' value, shown as a circular bar plot.
col_name = 'nutrition_grade_fr'
df_count = df.groupby([col_name])[col_name].count()
circular_plot(df_count.values, df_count.index.values)

<a name="bivariate-analysis"></a>
## Bivariate analysis

<a name="nutrigrade-analysis"></a>
### Nutrigrade analysis

In [None]:
# Box plot of the French nutrition score for each nutrition grade, ordered a to e.
sns.catplot(data=df, x="nutrition-score-fr_100g", y="nutrition_grade_fr", kind="box", order=['a','b','c','d','e'])

In [None]:
# Box plot of energy for each nutrition grade (outliers shown).
sns.catplot(data=df, x="energy_100g", y="nutrition_grade_fr", kind="box", order=['a','b','c','d','e'])

In [None]:
# Violin plot of energy for each nutrition grade; cut=0 keeps the density
# within the observed range.
sns.catplot(data=df, x="energy_100g", y="nutrition_grade_fr", kind="violin", cut=0, scale="area", order=['a','b','c','d','e'])

In [None]:
# Box plot of fat for each nutrition grade, outliers hidden.
sns.catplot(data=df, x="fat_100g", y="nutrition_grade_fr", kind="box", order=['a','b','c','d','e'], showfliers=False)

In [None]:
# Box plot of sugars for each nutrition grade, outliers hidden.
sns.catplot(data=df, x="sugars_100g", y="nutrition_grade_fr", kind="box", order=['a','b','c','d','e'], showfliers=False)

In [None]:
# Box plot of fiber for each nutrition grade, outliers hidden.
sns.catplot(data=df, x="fiber_100g", y="nutrition_grade_fr", kind="box", order=['a','b','c','d','e'], showfliers=False)

In [None]:
# Box plot of proteins for each nutrition grade, outliers hidden.
sns.catplot(data=df, x="proteins_100g", y="nutrition_grade_fr", kind="box", order=['a','b','c','d','e'], showfliers=False)

<a name="correlations"></a>
### Correlations

In [None]:
# Pairwise correlation matrix of the numerical features.
corr = df[numerical_columns].corr()
# Boolean mask of the upper triangle (diagonal included) so that each pair is
# drawn only once in the heatmap.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
# Diverging colour map, centred on 0 by the heatmap call below.
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});

In [None]:
# Scatter plot of the French nutrition score against energy.
sns.relplot(data=df, x="energy_100g", y="nutrition-score-fr_100g")

In [None]:
# Linear regression with energy as x and the SUM of the French and UK
# nutrition scores as y.
# NOTE(review): summing both scores looks unusual - confirm this is intended
# rather than a regression on a single score.
res = stats.linregress(df['energy_100g'],df['nutrition-score-fr_100g']+df['nutrition-score-uk_100g'])
res

In [None]:
# Linear regression of the UK nutrition score on the French one.
res = stats.linregress(df['nutrition-score-fr_100g'], df['nutrition-score-uk_100g'])
res

<a name="chi2-test"></a>
### $\chi^2$ test 

In [None]:
# Contingency table: rows are nutrition grades, columns are 'pnns_groups_2' values.
ct_table_ind = pd.crosstab(df['nutrition_grade_fr'],df['pnns_groups_2'])

In [None]:
# Chi-squared test of independence on the contingency table.
stat, p, dof, expected = stats.chi2_contingency(ct_table_ind)

print(f'dof={dof:d}')

# Decision 1: compare the statistic with the critical value at the 95% level.
prob = 0.95
critical = stats.chi2.ppf(prob, dof)
print(f'probability={prob:.3f}, critical={critical:.3f}, stat={stat:.3f}')
print('Dependent (reject H0)' if abs(stat) >= critical else 'Independent (fail to reject H0)')

# Decision 2: compare the p-value with the significance level.
alpha = 1.0 - prob
print(f'significance={alpha:.3f}, p={p:.3f}')
print('Dependent (reject H0)' if p <= alpha else 'Independent (fail to reject H0)')

<a name="anova"></a>
### ANOVA

In [None]:
def eta_squared(x,y):
    """Return the correlation ratio (eta squared) of ``y`` explained by ``x``.

    Eta squared is the between-class sum of squares divided by the total sum
    of squares of ``y``, where the classes are the distinct values of ``x``.

    Args:
        x: pandas Series holding the class (category) of each observation.
        y: pandas Series holding the numerical value of each observation,
            sharing its index with ``x``.

    Returns:
        A float in [0, 1], or NaN when ``y`` has no variance (including the
        case where no complete observation is left).
    """
    # Use only observations where both variables are known; a single missing
    # value would otherwise turn the whole result into NaN.
    known = x.notna() & y.notna()
    x, y = x[known], y[known]

    overall_mean = y.mean()
    # Total sum of squares, computed vectorised instead of one Python-level
    # iteration per observation.
    total_ss = ((y - overall_mean) ** 2).sum()
    if total_ss == 0:
        # The ratio is undefined when y is constant (or empty).
        return float("nan")

    # Between-class sum of squares: class size * squared distance between the
    # class mean and the overall mean. observed=True skips unused categories.
    by_class = y.groupby(x, observed=True)
    between_ss = (by_class.size() * (by_class.mean() - overall_mean) ** 2).sum()
    return between_ss / total_ss

In [None]:
# scipy normality test on the French nutrition score; only the p-value is displayed.
k2, p = stats.normaltest(df['nutrition-score-fr_100g'])
p

In [None]:
# Same normality test on the energy feature.
k2, p = stats.normaltest(df['energy_100g'])
p

In [None]:
# Correlation ratio between the nutrition grade (classes) and the French nutrition score.
eta_squared(df['nutrition_grade_fr'],df['nutrition-score-fr_100g'])

In [None]:
# Correlation ratio between 'pnns_groups_2' (classes) and the French nutrition score.
eta_squared(df['pnns_groups_2'],df['nutrition-score-fr_100g'])

<a name="pca"></a>
## PCA

<a name="plot-helpers"></a>
### Plot helpers

In [None]:
def correlation_graph(pca, x_y, features):
    """Draw the correlation circle of a fitted PCA on one factorial plane.

    Args:
        pca: Fitted ``sklearn.decomposition.PCA`` object.
        x_y: Pair of component indices to plot, e.g. ``[0, 1]`` for F1 and F2.
        features: Names of the original features, in the column order of
            ``pca.components_``.

    NOTE(review): the arrows use the raw coefficients of ``pca.components_``;
    they are not rescaled by the square root of the explained variance.
    """
    axis_x, axis_y = x_y

    # Figure size is given in inches
    fig, ax = plt.subplots(figsize=(10, 9))

    # One arrow and one label per original feature
    for feature_idx in range(pca.components_.shape[1]):
        dx = pca.components_[axis_x, feature_idx]
        dy = pca.components_[axis_y, feature_idx]
        ax.arrow(0, 0, dx, dy, head_width=0.07, head_length=0.07, width=0.02)
        ax.text(dx + 0.05, dy + 0.05, features[feature_idx])

    # Horizontal and vertical dashed guide lines through the origin
    ax.plot([-1, 1], [0, 0], color='grey', ls='--')
    ax.plot([0, 0], [-1, 1], color='grey', ls='--')

    # Axis names with the percentage of explained variance
    ax.set_xlabel('F{} ({}%)'.format(axis_x+1, round(100*pca.explained_variance_ratio_[axis_x],1)))
    ax.set_ylabel('F{} ({}%)'.format(axis_y+1, round(100*pca.explained_variance_ratio_[axis_y],1)))

    # Title naming the two plotted components
    ax.set_title("Cercle des corrélations (F{} et F{})".format(axis_x+1, axis_y+1))

    # Unit circle for scale
    theta = np.linspace(0, 2 * np.pi, 100)
    ax.plot(np.cos(theta), np.sin(theta))

    # Same scale on both axes, then display without blocking
    ax.axis('equal')
    plt.show(block=False)

In [None]:
def display_factorial_planes(   X_projected,
                                x_y,
                                pca=None,
                                labels = None,
                                clusters=None,
                                alpha=1,
                                figsize=(10, 8),
                                marker="." ):
    """Scatter-plot the projected samples on one factorial plane.

    Args:
        X_projected: Matrix of projected points (np.array, pd.DataFrame or
            list of lists), one row per sample and one column per component.
        x_y: Pair of component indices to plot, e.g. ``[0, 1]`` for F1 and F2.
        pca: Optional fitted ``sklearn.decomposition.PCA``; when given, the
            explained variance of each axis is added to the axis labels.
        labels: Optional sequence of text labels, one per sample, drawn next
            to the points.
        clusters: Optional sequence giving the group of each sample; it is
            used to colour the points.
        alpha: Point opacity in [0, 1] (0 = transparent, 1 = opaque).
        figsize: (width, height) of the figure in inches; a falsy value falls
            back to (7, 6).
        marker: Matplotlib marker used for the points.

    Raises:
        AttributeError: If ``x_y`` does not hold exactly two indices, or if an
            index is out of range for ``X_projected``.
        TypeError: If ``labels`` has no length.
    """
    X_ = np.asarray(X_projected)

    # Fall back to a default figure size when none was given
    if not figsize:
        figsize = (7, 6)

    # Validate the labels up front so a bad argument fails before any drawing
    if labels is None:
        labels = []
    has_labels = len(labels) > 0

    # Validate the requested axes
    if len(x_y) != 2:
        raise AttributeError("2 axes sont demandées")
    if max(x_y) >= X_.shape[1]:
        raise AttributeError("la variable axis n'est pas bonne")

    x, y = x_y

    fig, ax = plt.subplots(1, 1, figsize=figsize)

    # The points, coloured by cluster when clusters are given. alpha and
    # marker are forwarded so the documented parameters take effect, and the
    # plot is drawn on the axes created above.
    sns.scatterplot(x=X_[:, x], y=X_[:, y], hue=clusters,
                    alpha=alpha, marker=marker, ax=ax)

    # With a fitted PCA, add the percentage of variance explained by each axis
    if pca is not None:
        v1 = str(round(100*pca.explained_variance_ratio_[x]))  + " %"
        v2 = str(round(100*pca.explained_variance_ratio_[y]))  + " %"
    else:
        v1 = v2 = ''

    ax.set_xlabel(f'F{x+1} {v1}')
    ax.set_ylabel(f'F{y+1} {v2}')

    # Symmetric limits around 0, with a 10% margin
    x_max = np.abs(X_[:, x]).max() * 1.1
    y_max = np.abs(X_[:, y]).max() * 1.1
    ax.set_xlim(left=-x_max, right=x_max)
    ax.set_ylim(bottom=-y_max, top=y_max)

    # Horizontal and vertical guide lines through the origin
    ax.plot([-x_max, x_max], [0, 0], color='grey', alpha=0.8)
    ax.plot([0, 0], [-y_max, y_max], color='grey', alpha=0.8)

    # Text label of each point, drawn slightly above it
    if has_labels:
        for i, (_x, _y) in enumerate(X_[:, [x, y]]):
            ax.text(_x, _y+0.05, labels[i], fontsize='14', ha='center', va='center')

    ax.set_title(f"Projection des individus (sur F{x+1} et F{y+1})")
    plt.show()

In [None]:
def display_factorial_planes_3D(X_proj,
                             pca=None,
                             labels = None,
                             clusters=None,
                             alpha=1,
                             figsize=(10, 8),
                             title=""
                             ):
    """Scatter-plot the projected samples on the first three components (F1-F3).

    Args:
        X_proj: Matrix of projected points with at least three columns.
        pca: Unused; kept so existing calls keep working.
        labels: Unused; kept so existing calls keep working.
        clusters: Optional sequence giving the group of each sample; every
            group is drawn as its own series with a legend entry. When
            ``None``, all points form one group named "samples".
        alpha: Unused; kept so existing calls keep working.
        figsize: (width, height) of the figure in inches.
        title: Figure title.
    """
    X_proj = np.asarray(X_proj)

    # Work on a plain array in both cases: the former default was an ndarray,
    # which has no .unique() method and made the clusters=None path crash.
    if clusters is None:
        clusters = np.full(len(X_proj), "samples")
    else:
        clusters = np.asarray(clusters)

    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111, projection='3d')

    x = X_proj[:, 0]
    y = X_proj[:, 1]
    z = X_proj[:, 2]

    ax.set_xlabel("F1")
    ax.set_ylabel("F2")
    ax.set_zlabel("F3")

    # np.unique returns the distinct groups already sorted
    for group in np.unique(clusters):
        in_group = clusters == group
        ax.scatter(x[in_group], y[in_group], z[in_group], label=group, s=40, marker='o')

    ax.legend(loc="best")
    ax.set_title(title)
    plt.show()

<a name="composition-features-only"></a>
### Composition features only

In [None]:
# Composition features: the numerical columns without the first and the last two.
features = numerical_columns[1:-2]
X = df[features]
# Standardise the features before running the PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Fit a 10-component PCA on the scaled data and project the samples onto it.
n_components = 10
pca = PCA(n_components=n_components)
X_proj = pca.fit_transform(X_scaled)
# Share of variance explained by each component (shown as cell output).
pca.explained_variance_ratio_

In [None]:
# Scree plot: bars show the percentage of variance of each component, the red
# line shows the cumulative percentage.
scree = (pca.explained_variance_ratio_*100)
scree_cum = scree.cumsum()
x_list = range(1, n_components+1)
plt.bar(x_list, scree)
plt.plot(x_list, scree_cum,c="red",marker='o')
plt.xlabel("rang de l'axe d'inertie")
plt.ylabel("pourcentage d'inertie")
plt.title("Eboulis des valeurs propres")
plt.show(block=False)

In [None]:
# Component coefficients as a table: one row per component (F1..F10), one
# column per feature.
pcs = pca.components_
pcs = pd.DataFrame(pcs)
pcs.columns = features
pcs.index = [f"F{i}" for i in x_list]

In [None]:
# Heatmap of the coefficients, transposed so that features are rows.
fig, ax = plt.subplots(figsize=(20, 6))
sns.heatmap(pcs.T, vmin=-1, vmax=1, annot=True, cmap="coolwarm", fmt="0.2f")

In [None]:
# Correlation circle on the plane of components F1 and F2.
x_y = (0,1)
correlation_graph(pca, x_y, features)

In [None]:
# Correlation circle on the plane of components F3 and F4.
x_y = (2,3)
correlation_graph(pca, x_y, features)

In [None]:
# Projection of the first N samples on F1/F2, coloured by nutrition grade.
# df.loc[:N-1] is label-based and inclusive, so it matches X_proj[:N] only
# while df keeps the default 0..n-1 index.
x_y = (0,1)
N = 1000
display_factorial_planes(X_proj[:N], x_y, clusters=df.loc[:N-1,'nutrition_grade_fr'])

In [None]:
# Same projection on F3/F4.
x_y = (2,3)
display_factorial_planes(X_proj[:N], x_y, clusters=df.loc[:N-1,'nutrition_grade_fr'])

In [None]:
# Percentage of variance explained by the first three components together.
total_var = pca.explained_variance_ratio_[:3].sum() * 100

# Interactive 3D scatter of the first N samples on the first three components,
# coloured by nutrition grade.
fig = px.scatter_3d(
    X_proj[:N], x=0, y=1, z=2, color=df.loc[:N-1,'nutrition_grade_fr'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'},
)
fig.show()

In [None]:
# Static matplotlib version of the same 3D scatter.
clusters = df.loc[:N-1,'nutrition_grade_fr']
display_factorial_planes_3D(
    X_proj[:N], 
    clusters=clusters, 
    title=f'Total Explained Variance: {total_var:.2f}%',
    )

<a name="all-features"></a>
### All features

In [None]:
# All numerical features this time.
features = numerical_columns
X = df[features]
# Standardise the features before running the PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Fit a 10-component PCA on the scaled data and project the samples onto it.
n_components = 10
pca = PCA(n_components=n_components)
X_proj = pca.fit_transform(X_scaled)
# Share of variance explained by each component (shown as cell output).
pca.explained_variance_ratio_

In [None]:
# Scree plot: bars show the percentage of variance of each component, the red
# line shows the cumulative percentage.
scree = (pca.explained_variance_ratio_*100)
scree_cum = scree.cumsum()
x_list = range(1, n_components+1)
plt.bar(x_list, scree)
plt.plot(x_list, scree_cum,c="red",marker='o')
plt.xlabel("rang de l'axe d'inertie")
plt.ylabel("pourcentage d'inertie")
plt.title("Eboulis des valeurs propres")
plt.show(block=False)

In [None]:
# Component coefficients as a table: one row per component (F1..F10), one
# column per feature.
pcs = pca.components_
pcs = pd.DataFrame(pcs)
pcs.columns = features
pcs.index = [f"F{i}" for i in x_list]

In [None]:
# Heatmap of the coefficients, transposed so that features are rows.
fig, ax = plt.subplots(figsize=(20, 6))
sns.heatmap(pcs.T, vmin=-1, vmax=1, annot=True, cmap="coolwarm", fmt="0.2f")

In [None]:
# Correlation circle on the plane of components F1 and F2.
# N is the number of samples shown in the projection plots below.
x_y = (0,1)
N = 1000
correlation_graph(pca, x_y, features)

In [None]:
# Correlation circle on the plane of components F3 and F4.
x_y = (2,3)
correlation_graph(pca, x_y, features)

In [None]:
# Projection of the first N samples on F1/F2, coloured by nutrition grade.
# df.loc[:N-1] is label-based and inclusive, so it matches X_proj[:N] only
# while df keeps the default 0..n-1 index.
x_y = (0,1)
display_factorial_planes(X_proj[:N], x_y, clusters=df.loc[:N-1,'nutrition_grade_fr'])

In [None]:
# Same projection on F3/F4.
x_y = (2,3)
display_factorial_planes(X_proj[:N], x_y, clusters=df.loc[:N-1,'nutrition_grade_fr'])

In [None]:
# Percentage of variance explained by the first three components together.
total_var = pca.explained_variance_ratio_[:3].sum() * 100

# Interactive 3D scatter of the first N samples on the first three components,
# coloured by nutrition grade.
fig = px.scatter_3d(
    X_proj[:N], x=0, y=1, z=2, color=df.loc[:N-1,'nutrition_grade_fr'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'},
)
fig.show()

In [None]:
# Static matplotlib version of the same 3D scatter.
clusters = df.loc[:N-1,'nutrition_grade_fr']
display_factorial_planes_3D(
    X_proj[:N], 
    clusters=clusters, 
    title=f'Total Explained Variance: {total_var:.2f}%',
    )