In [64]:
import pandas as pd
import numpy as np
import ast

from nltk import FreqDist, bigrams

import plotly.express as px


# Source work: https://github.com/nonlocal-lia/sentiment-analysis-project/blob/main/main_notebook.ipynb

In [4]:
# Read the ad dataset (the file name suggests ARCOM records joined with Nutri-Score data).
arcom_df = pd.read_csv('data/cross_arcom_off/arcom_data_with_nutriscore.csv')

# 'clean_script' is stored in the CSV as the text of a Python literal;
# parse every cell back into the corresponding Python object.
arcom_df['clean_script'] = arcom_df['clean_script'].map(ast.literal_eval)

In [67]:
def clean_themes(themes):
    """Split a comma-separated theme string into a list of normalised labels.

    Parameters
    ----------
    themes : str or any
        Raw cell value. Anything that is not a string (e.g. NaN or None for
        a missing value) is treated as "no themes".

    Returns
    -------
    list of str
        Whitespace-stripped, lower-cased labels in their original order.
        Empty fragments (blank input, doubled or trailing commas) are dropped
        so they cannot show up later as an empty-string theme.
    """
    # isinstance also accepts str subclasses, which `type(x) == str` rejects.
    if not isinstance(themes, str):
        return []
    labels = (label.strip().lower() for label in themes.split(','))
    return [label for label in labels if label]

# Turn every raw 'Thème' cell into a list of normalised labels, keep the
# resulting Series as `themes`, and attach it to the frame as 'clean_theme'.
themes = arcom_df['Thème'].map(clean_themes)
arcom_df['clean_theme'] = themes

In [54]:
# Display the column labels of the loaded frame.
arcom_df.columns

Index(['Unnamed: 0', 'N° Fiche', 'year', 'week', 'N° Version', 'Annonceur',
       'Marque', 'Produit', 'Date', 'Date VP', 'Média', 'Medium', 'Catégorie',
       'Support', 'Format', 'Secteur', 'Classe', 'Groupe', 'Variété',
       'N° Groupe/Variété', 'Signature', 'Script', 'Incrustation', 'Titre',
       'Libellé Version', 'Visuel', 'Mots Clés', 'Thème',
       'Musique / Auteur compositeur', 'Musique / Titre', 'Agence',
       'Réalisateur', 'Producteur', 'Récompense', 'Musique / Version',
       'Type de campagne', 'Musique / Interprète', 'Partenaires',
       'Titre émission', 'Hashtag', 'clean_script', 'mots_clefs_themes',
       'mots_clefs_themes_str', 'visuel_str', 'visuel_clean',
       'visuel_clean_bigram', 'clean_bigram_total', 'date_iso', 'month_nb',
       'Produit_cleaned', 'nutriscore_inferred', 'script_decoded',
       'date_formatted', '6_month_interval'],
      dtype='object')

In [5]:
# Print the frame summary: index range plus per-column non-null counts and dtypes.
arcom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5564 entries, 0 to 5563
Data columns (total 54 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    5564 non-null   int64  
 1   N° Fiche                      5564 non-null   int64  
 2   year                          5564 non-null   int64  
 3   week                          5564 non-null   int64  
 4   N° Version                    5564 non-null   float64
 5   Annonceur                     5564 non-null   object 
 6   Marque                        5564 non-null   object 
 7   Produit                       5564 non-null   object 
 8   Date                          5564 non-null   object 
 9   Date VP                       5061 non-null   object 
 10  Média                         5564 non-null   object 
 11  Medium                        5564 non-null   object 
 12  Catégorie                     5564 non-null   object 
 13  Sup

# Value counts per Nutri-Score grade

In [7]:
# One histogram per grade: which product varieties carry that grade,
# coloured by advertiser, with a rug plot in the margin.
for score in 'abcde':
    sub_score_df = arcom_df.loc[arcom_df['nutriscore_inferred'] == score]
    fig = px.histogram(
        sub_score_df,
        x='Variété',
        color='Annonceur',
        marginal='rug',
        title=f'Type of products graded {score.upper()}',
    )
    fig.show()

In [38]:
# Grade order 'a' through 'e', passed to the charts so grades are always laid out the same way.
nutriscore_order = list('abcde')

# Number of ads for each inferred grade, split by product variety.
fig = px.histogram(
    arcom_df,
    x='nutriscore_inferred',
    color='Variété',
    category_orders={'nutriscore_inferred': nutriscore_order},
    title='Count of ads per nutriscore',
)
fig.show()

In [47]:
# Product varieties in the order produced by value_counts (most frequent first by default).
category_counts = arcom_df['Variété'].value_counts()
sorted_categories = list(category_counts.index)

# Number of ads for each variety, split by inferred grade.
fig = px.histogram(
    arcom_df,
    x='Variété',
    color='nutriscore_inferred',
    color_discrete_sequence=px.colors.sequential.Jet,
    category_orders={
        'Variété': sorted_categories,
        'nutriscore_inferred': nutriscore_order,
    },
    title='Count of nutriscore per category',
)
fig.show()

In [63]:
# Varieties to drill into: one product-level histogram is drawn for each.
kw_list = [
    'CEREALES',
    'DESSERTS FRAIS',
    'YAOURTS',
    'BISCUITS FOURRES-TARTINES',
    'ALIMENTS DIETETIQUES',
]

for kw in kw_list:
    # Keep only the ads whose variety matches the current keyword.
    sub_score_df = arcom_df.loc[arcom_df['Variété'].eq(kw)]
    fig = px.histogram(
        sub_score_df,
        x='Produit',
        color='nutriscore_inferred',
        color_discrete_sequence=px.colors.sequential.Jet,
        category_orders={'nutriscore_inferred': nutriscore_order},
        title=f'Product for {kw}',
    )
    fig.show()

In [48]:
# Advertisers in the order produced by value_counts (most frequent first by default).
# Note: this rebinds category_counts / sorted_categories, previously computed on 'Variété'.
category_counts = arcom_df['Annonceur'].value_counts()
sorted_categories = list(category_counts.index)

# Number of ads for each advertiser, split by inferred grade.
fig = px.histogram(
    arcom_df,
    x='Annonceur',
    color='nutriscore_inferred',
    color_discrete_sequence=px.colors.sequential.Jet,
    category_orders={
        'Annonceur': sorted_categories,
        'nutriscore_inferred': nutriscore_order,
    },
    title='Count of nutriscore per brand',
)
fig.show()

In [20]:
# Display the current category ordering.
# NOTE(review): the saved output lists 'Variété' values (e.g. 'CEREALES', 'YAOURTS'),
# whereas the cell directly above rebinds sorted_categories to the 'Annonceur'
# ordering. The output looks stale; re-run top to bottom to confirm.
sorted_categories

['TABLETTES DE CHOCOLAT',
 'CEREALES',
 'BISCUITS FOURRES-TARTINES',
 'BISCUITS CHOCOLATES',
 'DESSERTS FRAIS',
 'VIENNOISERIES',
 'DOSETTES-CAPSULES A CAFE',
 'PAINS',
 'YAOURTS',
 'CONFISERIES DE CHOCOLAT',
 'BONBONS-SUCETTES',
 'PETITE CONFISERIE POCHE',
 'PATES SECHES',
 'PATES PRESSEES NON CUITES',
 'GLACES INDIVIDUELLES',
 'PATISSERIE INDUSTRIELLE',
 'PATES PRESSEES CUITES',
 'SNACKS SALES',
 'FROMAGES FRAIS SALES AROMATISES',
 'LAITS AROMATISES',
 'PATES A TARTINER',
 'FROMAGES PATES MOLLES',
 'FERMENTS ACTIFS',
 'FROMAGES FONDUS',
 'FROMAGES POUR APERITIF',
 'PATES FRAICHES',
 'POTAGES UHT',
 'BOITES DE CHOCOLATS',
 'BEURRES',
 'ALIMENTS DIETETIQUES',
 'BISCUITS SUCRES',
 'HUILES D OLIVE',
 'BARRES CHOCOLATEES',
 'BISCUITS SECS',
 'SAUCES CUISINEES',
 'CHEVRES',
 'GAMME ALIMENTAIRE',
 'VIANDES',
 'CAFE GRAIN MOULU NORMAL',
 'CREMES UHT',
 'COMPOTES',
 'CHIPS',
 'CAFE + MACHINE A CAFE',
 'BISCUITS SALES',
 'YAOURTS A BOIRE',
 'CHOCOLATS GAMME',
 'JAMBONS CUITS',
 'AUTRES CUISINE

# Top word frequencies

In [69]:
# Count how often each theme label occurs across all ads.
# explode() yields NaN for rows whose theme list is empty, so drop those
# first; otherwise NaN is counted as if it were a theme.
freq_dist_themes = FreqDist(arcom_df['clean_theme'].explode().dropna())