# Dashboard final — visualisations avancées 

Ce notebook lit les artefacts produits par l'analyse (dans `outputs/analysis_results`) et génère :

- Projection UMAP
- Barplot de couverture par thème
- Heatmap lexical vs sémantique
- Distribution de similarité par thème
- Wordcloud des entités Forbes
- Tableaux filtrables
- Interprétations automatiques

Aucune écriture sur disque n'est faite par défaut ; exécutez le notebook cellule par cellule.

In [None]:
# 1. Imports & chemins
import os, json
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from IPython.display import display, HTML
sns.set(style='whitegrid')


# Folder that holds the CSV artefacts read by the cells below.
RES = './outputs/analysis_results'

# Maps every entry name found in RES to its path; stays empty when RES is
# missing. isdir (rather than exists) keeps os.listdir from raising when the
# path exists but is not a directory.
ARTIFACTS = {}
if os.path.isdir(RES):
    ARTIFACTS = {fn: os.path.join(RES, fn) for fn in os.listdir(RES)}

print('Results folder:', RES)
print('Found artifacts:', sorted(ARTIFACTS))

## Charger les fichiers produits

In [None]:
def load_if(fn):
    """Read the CSV named ``fn`` from the results folder.

    Args:
        fn: File name, relative to ``RES``.

    Returns:
        A ``pandas.DataFrame`` when the file exists, otherwise ``None`` so
        that each downstream cell can skip its plot.
    """
    p = os.path.join(RES, fn)
    return pd.read_csv(p) if os.path.exists(p) else None


coords = load_if('umap_coords_with_meta.csv')
cov = load_if('coverage_combined_forbes.csv')
sim = load_if('semantic_similarity_forbes.csv')
lex = load_if('lexical_coverage_forbes.csv')
sent = load_if('document_sentiment.csv')
aspect = load_if('aspect_sentiment_forbes.csv')
fr = load_if('framing_forbes.csv')
ents = load_if('forbes_entities.csv')
pairs = load_if('oms_to_forbes_pairs.csv')

# The previous version looked the articles file up under OUT_DIR, a name that
# is never defined in this notebook (NameError), and its elif repeated the
# very same test. Search the parent of the results folder first, then the
# results folder itself.
# NOTE(review): assumes the processed articles live next to (or inside)
# `analysis_results` -- confirm against the step that writes this file.
art = None
for folder in (os.path.dirname(RES), RES):
    articles_path = os.path.join(folder, 'all_articles_processed.csv')
    if os.path.exists(articles_path):
        art = pd.read_csv(articles_path)
        break

print('Loaded: coords', coords is not None, 'cov', cov is not None, 'sim', sim is not None, 'lex', lex is not None)
print('Loaded: sent', sent is not None, 'aspect', aspect is not None, 'fr', fr is not None, 'ents', ents is not None, 'pairs', pairs is not None)
print('Articles DF loaded:', art is not None)

## Projection UMAP — Visualisation des articles

In [None]:
# Interactive scatter of the 2-D UMAP coordinates, one point per row of `coords`.
if coords is not None:
    # Only offer hover fields that are actually present in the file.
    hover_cols = [name for name in ('preview', 'title', 'source', 'date') if name in coords.columns]
    # Colour by source when that column is available, otherwise use a single colour.
    color_col = 'source' if 'source' in coords.columns else None
    fig = px.scatter(
        coords,
        x='umap_x',
        y='umap_y',
        color=color_col,
        hover_data=hover_cols,
        height=700,
        title='Projection UMAP — Visualisation des articles',
    )
    fig.update_traces(marker={'size': 6, 'opacity': 0.8})
    fig.show()
else:
    print('UMAP coordinates not found. Skipping UMAP plot.')

## Couverture des thèmes OMS par Forbes

In [None]:
# Bar chart: per-topic sum of the `covered_topic_<id>` columns of `cov`.
if cov is not None:
    topic_cols = [name for name in cov.columns if name.startswith('covered_topic_')]
    column_totals = cov[topic_cols].sum()
    cover_counts = column_totals.reset_index()
    cover_counts.columns = ['metric', 'count']
    # Strip the column prefix so the x axis shows the bare integer topic id.
    topic_labels = cover_counts['metric'].str.replace('covered_topic_', '')
    cover_counts['topic'] = topic_labels.astype(int)
    fig = px.bar(
        cover_counts,
        x='topic',
        y='count',
        title="Nombre d’articles Forbes couvrant chaque thème de l’OMS",
    )
    fig.show()
else:
    print('Coverage file missing — skipping coverage bar.')

## Heatmap — Moyenne lexicale et sémantique par thématique OMS

In [None]:
import numpy as np

# Heatmap of the per-topic column means of the lexical (`lex_topic_<id>`) and
# semantic (`sim_topic_<id>`) tables; either table may be missing.
if lex is None and sim is None:
    print('Lexical and semantic files missing — skipping heatmap.')
else:
    metric_frames = {'lex': lex, 'sim': sim}

    # Union of the topic ids found in whichever tables were loaded.
    topic_ids = sorted({
        int(col.replace(f'{kind}_topic_', ''))
        for kind, frame in metric_frames.items() if frame is not None
        for col in frame.columns if col.startswith(f'{kind}_topic_')
    })

    rows = []
    for t in topic_ids:
        record = {'topic': t}
        for kind, frame in metric_frames.items():
            col = f'{kind}_topic_{t}'
            available = frame is not None and col in frame.columns
            # NaN marks a topic that has no column in this table.
            record[f'mean_{kind}'] = frame[col].mean() if available else np.nan
        rows.append(record)

    df_metrics = pd.DataFrame(rows).set_index('topic')

    # Grow the figure with the number of topics, with a minimum height of 4.
    fig_height = max(4, len(df_metrics) * 0.5)
    plt.figure(figsize=(6, fig_height))
    # Missing means are drawn as 0 (fillna) rather than left blank.
    sns.heatmap(df_metrics.fillna(0), annot=True, fmt='.2f', cmap='viridis')
    plt.title('Moyenne lexicale et sémantique par thématique OMS')
    plt.tight_layout()
    plt.show()

## Distribution de la similarité sémantique par thématique OMS

In [None]:
# Box plot of the `sim_topic_<id>` values, one box per topic.
if sim is not None:
    sim_cols = [name for name in sim.columns if name.startswith('sim_topic_')]
    # Wide -> long: one row per (global_index, topic column) pair.
    sim_long = sim.melt(
        id_vars=['global_index'],
        value_vars=sim_cols,
        var_name='topic',
        value_name='sim',
    )
    # Keep only the integer id of each topic column name.
    topic_numbers = sim_long['topic'].str.replace('sim_topic_', '')
    sim_long['topic'] = topic_numbers.astype(int)
    fig = px.box(
        sim_long,
        x='topic',
        y='sim',
        points='outliers',
        title='Distribution de la similarité sémantique par thématique OMS',
    )
    fig.update_layout(xaxis_title='ID thématique', yaxis_title='Similarité cosinus')
    fig.show()
else:
    print('Semantic similarity file missing — skipping similarity distributions.')

## Wordcloud / Top entités (Forbes)

In [None]:
# Table of the most frequent (entity, label) pairs, plus a word cloud of entity
# frequencies when the optional `wordcloud` package can be imported.
if ents is None:
    print('Entities file missing — skipping wordcloud.')
else:
    try:
        from wordcloud import WordCloud
        have_wordcloud = True
    except Exception:
        # Kept broad on purpose: any failure while importing the optional
        # package falls back to the table-only display below.
        have_wordcloud = False
        print('WordCloud lib not installé — affichage du top entities à la place.')

    ent_counts = (
        ents.groupby(['entity', 'label'])
        .size()
        .reset_index(name='count')
        .sort_values('count', ascending=False)
    )
    display(ent_counts.head(30))

    if have_wordcloud:
        # The cloud is keyed by entity alone, so sum the counts over labels.
        # Building the dict straight from the (entity, label) rows let a
        # lower count for a second label overwrite the higher one.
        entity_totals = (
            ent_counts.groupby('entity')['count']
            .sum()
            .sort_values(ascending=False)
            .head(500)
        )
        freq = {str(entity): int(total) for entity, total in entity_totals.items()}
        if freq:
            wc = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(freq)
            plt.figure(figsize=(12, 6))
            plt.imshow(wc, interpolation='bilinear')
            plt.axis('off')
            plt.title('Top entités (Forbes)')
            plt.show()
        else:
            # generate_from_frequencies raises on an empty mapping.
            print('No entities found — skipping wordcloud.')

## Tableaux filtrables (Source / Mot-clé / Sentiment / Framing)

In [None]:
from IPython.display import display

# Articles table enriched with the document sentiment label and the framing,
# when those files were loaded.
if art is None:
    print('Articles CSV non trouvé — impossible d\'afficher la table interactive.')
else:
    df_articles = art.copy()
    if 'preview' not in df_articles.columns:
        # Fall back to the title, or to an empty preview when there is no
        # title either (DataFrame.get would return a plain str with no
        # .astype in that case).
        if 'title' in df_articles.columns:
            df_articles['preview'] = df_articles['title'].astype(str)
        else:
            df_articles['preview'] = ''

    # Side tables keyed on `global_index`, renamed to match the `index`
    # column that reset_index() creates below.
    # NOTE(review): presumably `global_index` is the row position in the
    # articles CSV -- confirm against the analysis step.
    extras = []
    if sent is not None:
        extras.append(sent[['global_index', 'label']].rename(columns={'global_index': 'index', 'label': 'sent_label'}))
    if fr is not None:
        extras.append(fr[['global_index', 'framing']].rename(columns={'global_index': 'index'}))

    if extras:
        df_articles = df_articles.reset_index()
        for extra in extras:
            df_articles = df_articles.merge(extra, on='index', how='left')
        df_articles = df_articles.set_index('index')

    # Show the merged sentiment/framing columns as well, and only ask for
    # columns that exist so a missing one cannot raise KeyError.
    wanted = ('source', 'title', 'preview', 'sent_label', 'framing')
    shown_cols = [name for name in wanted if name in df_articles.columns]
    display(df_articles[shown_cols].head(50))

## Interprétations automatiques (brouillon)

In [None]:
from html import escape

from IPython.display import HTML, display

# Builds a list of short draft sentences summarising each loaded artefact and
# renders them as HTML paragraphs.
interpretations = []

# Coverage: the three topics with the largest column sums.
if cov is not None:
    topic_cols = [c for c in cov.columns if c.startswith('covered_topic_')]
    cover_counts = cov[topic_cols].sum().rename(lambda x: int(x.replace('covered_topic_', '')))
    top_topics = cover_counts.sort_values(ascending=False).head(3)
    top_desc = ', '.join(f'Theme {t} ({int(c)})' for t, c in top_topics.items())
    interpretations.append(f"Top thèmes couverts par Forbes : {top_desc}.")
else:
    interpretations.append('Aucune donnée de couverture disponible.')

# Sentiment: raw counts of labels containing POS / NEG, case-insensitively.
# The previous version detected labels case-insensitively but then counted
# with a case-sensitive Series.filter(like=...), so e.g. 'positive' gave 0.
if sent is not None:
    s = sent['label'].value_counts()
    total = s.sum()
    pos = sum(int(n) for lab, n in s.items() if 'POS' in str(lab).upper())
    neg = sum(int(n) for lab, n in s.items() if 'NEG' in str(lab).upper())
    interpretations.append(f"Sentiment documents : POS ~{int(pos)}/{int(total)}, NEG ~{int(neg)}/{int(total)} (comptes bruts).")
else:
    interpretations.append('Pas de fichier de sentiment disponible.')

# Framing: most frequent value. An empty count series (no rows, or only
# missing values) is treated like a missing file because idxmax would raise.
framing_counts = fr['framing'].value_counts() if fr is not None else None
if framing_counts is not None and not framing_counts.empty:
    dominant = framing_counts.idxmax()
    interpretations.append(f"Angle dominant parmi les articles couverts : {dominant} (counts: {framing_counts.to_dict()}).")
else:
    interpretations.append('Pas de fichier de framing disponible.')

# Entities: the ten most frequent entity strings.
if ents is not None:
    top_ent = ents.groupby('entity').size().sort_values(ascending=False).head(10)
    interpretations.append('Top entités mentionnées : ' + ', '.join([f"{e} ({c})" for e, c in top_ent.items()]) + '.')
else:
    interpretations.append('Pas de fichier d\'entités disponible.')

display(HTML('<h3>Interprétations automatiques (brouillon)</h3>'))
for p in interpretations:
    # The sentences embed text read from CSV files; escape it so characters
    # such as '<' or '&' are shown literally instead of parsed as markup.
    display(HTML(f"<p>{escape(p, quote=False)}</p>"))