# Analyse de Sentiment des News Hybrides

Ce notebook r√©cup√®re les news collect√©es via le syst√®me hybride et applique l'analyse de sentiment avec le mod√®le FinBERT fine-tun√©.

**Workflow:**
1. Chargement des news depuis `hybrid_news_mapped.csv`
2. Initialisation du mod√®le FinBERT de sentiment
3. Analyse de sentiment pour chaque article
4. Enrichissement des donn√©es avec les scores de sentiment
5. Export des r√©sultats

## 1. Import des biblioth√®ques

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("‚úì Biblioth√®ques import√©es avec succ√®s")
print(f"‚úì PyTorch version: {torch.__version__}")
print(f"‚úì Device disponible: {'GPU' if torch.cuda.is_available() else 'CPU'}")

  from .autonotebook import tqdm as notebook_tqdm


‚úì Biblioth√®ques import√©es avec succ√®s
‚úì PyTorch version: 2.5.1+cu121
‚úì Device disponible: GPU


## 2. Chargement des donn√©es de news

In [2]:
# Chemins des fichiers
NEWS_CSV_PATH = Path("../Pipeline_Recup_Donnees/data/raw/news/hybrid_news_mapped.csv")

# Chargement du fichier CSV
print(f"üìÇ Chargement des news depuis: {NEWS_CSV_PATH}")
df_news = pd.read_csv(NEWS_CSV_PATH)

# Affichage des informations de base
print(f"\n‚úì {len(df_news)} lignes charg√©es")
print(f"‚úì Colonnes: {list(df_news.columns)}")
print(f"\nüìä Dimensions: {df_news.shape}")
print(f"üì∞ News uniques: {df_news['url'].nunique()}")
print(f"üìà Actifs concern√©s: {df_news['asset'].nunique()}")

# Affichage d'un √©chantillon
print("\nüìã Aper√ßu des donn√©es:")
df_news.head()

üìÇ Chargement des news depuis: ..\Pipeline_Recup_Donnees\data\raw\news\hybrid_news_mapped.csv

‚úì 4280 lignes charg√©es
‚úì Colonnes: ['date', 'title', 'url', 'source', 'language', 'event_type', 'event_category', 'base_impact_score', 'affects', 'asset', 'relevance_score', 'matched_events']

üìä Dimensions: (4280, 12)
üì∞ News uniques: 377
üìà Actifs concern√©s: 18

üìã Aper√ßu des donn√©es:


Unnamed: 0,date,title,url,source,language,event_type,event_category,base_impact_score,affects,asset,relevance_score,matched_events
0,2026-01-09 06:15:00,Mercosur - kauppasopimus ratkaisevassa nestyks...,https://yle.fi/a/74-20203286,yle.fi,Finnish,trade_policy,macro,8,,TESLA,19.2,"technology, automotive"
1,2026-01-09 06:15:00,Mercosur - kauppasopimus ratkaisevassa nestyks...,https://yle.fi/a/74-20203286,yle.fi,Finnish,trade_policy,macro,8,,APPLE,9.6,technology
2,2026-01-09 06:15:00,Mercosur - kauppasopimus ratkaisevassa nestyks...,https://yle.fi/a/74-20203286,yle.fi,Finnish,trade_policy,macro,8,,AMAZON,9.6,technology
3,2026-01-09 06:15:00,Mercosur - kauppasopimus ratkaisevassa nestyks...,https://yle.fi/a/74-20203286,yle.fi,Finnish,trade_policy,macro,8,,STELLANTIS,9.6,automotive
4,2026-01-09 06:15:00,Mercosur - kauppasopimus ratkaisevassa nestyks...,https://yle.fi/a/74-20203286,yle.fi,Finnish,trade_policy,macro,8,,CASIC,9.6,technology


In [3]:
# V√©rification des donn√©es manquantes dans les colonnes cl√©s
print("üîç V√©rification des valeurs manquantes:")
missing_data = df_news[['title', 'url', 'asset', 'event_type']].isnull().sum()
print(missing_data)

# D√©dupliquer par URL pour avoir des news uniques
print(f"\nüì∞ Avant d√©duplication: {len(df_news)} entr√©es")
df_news_unique = df_news.drop_duplicates(subset=['url']).copy()
print(f"üì∞ Apr√®s d√©duplication: {len(df_news_unique)} news uniques")

# Afficher quelques exemples de titres
print("\nüìå Exemples de titres de news:")
for idx, title in enumerate(df_news_unique['title'].head(5), 1):
    print(f"  {idx}. {title[:80]}...")

üîç V√©rification des valeurs manquantes:
title         0
url           0
asset         0
event_type    0
dtype: int64

üì∞ Avant d√©duplication: 4280 entr√©es
üì∞ Apr√®s d√©duplication: 377 news uniques

üìå Exemples de titres de news:
  1. Mercosur - kauppasopimus ratkaisevassa nestyksess√§ 25 vuoden neuvottelujen j√§lke...
  2. Iran in 2026 : Isolated in a Turbulent Ocean...
  3. R√ºstung im W√ºrgegriff : Warum Lockheed und Boeing ohne Antimon am Boden bleiben ...
  4. Tariff shock could weigh on jobs while easing inflation , Fed Research suggests...
  5. Aktien von Advantest , Renesas und Softbank brechen ein : Politischer Schock aus...


## 3. Initialisation du mod√®le FinBERT

In [10]:
# Configuration du device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üñ•Ô∏è  Device: {device}")

# Chemin du mod√®le
MODEL_PATH = "./news_finbert_sentiment_model"

# Chargement du mod√®le et du tokenizer
print(f"\nüì• Chargement du mod√®le depuis: {MODEL_PATH}")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=2,
    use_safetensors=True
)
model.to(device)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

print("‚úì Mod√®le FinBERT charg√© et pr√™t")
print(f"‚úì Configuration: {model.config.num_labels} labels (0=Negative, 1=Positive)")

üñ•Ô∏è  Device: cuda

üì• Chargement du mod√®le depuis: ./news_finbert_sentiment_model


OSError: Error no file named model.safetensors found in directory ./news_finbert_sentiment_model.

In [None]:
# Test rapide du mod√®le
test_text = "Stock market reaches new all-time high as economic growth surges"
print(f"\nüß™ Test du mod√®le avec: '{test_text}'")

# Tokenization de test
test_encoding = tokenizer(
    test_text,
    add_special_tokens=True,
    max_length=512,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

# Pr√©diction de test
with torch.no_grad():
    test_outputs = model(
        input_ids=test_encoding['input_ids'].to(device),
        attention_mask=test_encoding['attention_mask'].to(device)
    )
    test_probs = F.softmax(test_outputs.logits, dim=1)[0]
    test_prediction = torch.argmax(test_outputs.logits, dim=1).item()

print(f"‚úì Pr√©diction: {'Positive' if test_prediction == 1 else 'Negative'}")
print(f"‚úì Prob Negative: {test_probs[0].item():.2%}")
print(f"‚úì Prob Positive: {test_probs[1].item():.2%}")

## 4. Fonction d'analyse de sentiment

In [None]:
def analyze_sentiment(text, model, tokenizer, device, max_length=512):
    """
    Analyse le sentiment d'un texte avec le mod√®le FinBERT
    
    Args:
        text: Texte √† analyser (titre de la news)
        model: Mod√®le FinBERT
        tokenizer: Tokenizer
        device: CPU ou GPU
        max_length: Longueur maximale (512 pour FinBERT)
    
    Returns:
        dict avec sentiment, confiance, et probabilit√©s
    """
    
    # Gestion des textes vides ou null
    if not text or pd.isna(text) or len(str(text).strip()) == 0:
        return {
            'sentiment': 'Unknown',
            'confidence': 0.0,
            'prob_negative': 0.5,
            'prob_positive': 0.5
        }
    
    # Tokenization
    encoding = tokenizer(
        str(text),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    # Pr√©diction
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=1)[0]
        
        prediction = torch.argmax(logits, dim=1).item()
        confidence = probabilities[prediction].item()
    
    # Labels: 0=Negative, 1=Positive
    sentiment_label = "Positive" if prediction == 1 else "Negative"
    
    return {
        'sentiment': sentiment_label,
        'confidence': confidence,
        'prob_negative': probabilities[0].item(),
        'prob_positive': probabilities[1].item()
    }

print("‚úì Fonction analyze_sentiment d√©finie")

## 5. Analyse de sentiment sur toutes les news

In [None]:
print("="*80)
print("ü§ñ D√âBUT DE L'ANALYSE DE SENTIMENT")
print("="*80)

# Pr√©parer les listes pour stocker les r√©sultats
sentiments = []
confidences = []
prob_negatives = []
prob_positives = []

total_news = len(df_news_unique)
print(f"\nüì∞ Analyse de {total_news} news uniques...\n")

# Analyser chaque news
for idx, row in df_news_unique.iterrows():
    # Utiliser le titre pour l'analyse
    title = row['title']
    
    # Analyser le sentiment
    result = analyze_sentiment(title, model, tokenizer, device)
    
    # Stocker les r√©sultats
    sentiments.append(result['sentiment'])
    confidences.append(result['confidence'])
    prob_negatives.append(result['prob_negative'])
    prob_positives.append(result['prob_positive'])
    
    # Afficher la progression tous les 10%
    progress = (len(sentiments) / total_news) * 100
    if len(sentiments) % max(1, total_news // 10) == 0:
        print(f"   ‚è≥ Progression: {progress:.0f}% ({len(sentiments)}/{total_news})")

# Ajouter les colonnes au DataFrame
df_news_unique['sentiment'] = sentiments
df_news_unique['confidence'] = confidences
df_news_unique['prob_negative'] = prob_negatives
df_news_unique['prob_positive'] = prob_positives

print(f"\n‚úì Analyse termin√©e pour {total_news} news")
print("="*80)

In [None]:
# Afficher un aper√ßu des r√©sultats
print("\nüìä Aper√ßu des r√©sultats:")
print(df_news_unique[['title', 'sentiment', 'confidence', 'asset']].head(10))

## 6. Statistiques et analyse des r√©sultats

In [None]:
print("="*80)
print("üìà STATISTIQUES GLOBALES")
print("="*80)

# Statistiques de sentiment
sentiment_counts = df_news_unique['sentiment'].value_counts()
print(f"\nüé≠ Distribution des sentiments:")
for sentiment, count in sentiment_counts.items():
    percentage = (count / len(df_news_unique)) * 100
    print(f"  {sentiment:10s}: {count:4d} news ({percentage:.1f}%)")

# Statistiques de confiance
print(f"\nüéØ Confiance du mod√®le:")
print(f"  Moyenne     : {df_news_unique['confidence'].mean():.2%}")
print(f"  M√©diane     : {df_news_unique['confidence'].median():.2%}")
print(f"  Min         : {df_news_unique['confidence'].min():.2%}")
print(f"  Max         : {df_news_unique['confidence'].max():.2%}")

# Distribution par niveau de confiance
high_conf = (df_news_unique['confidence'] > 0.8).sum()
medium_conf = ((df_news_unique['confidence'] > 0.6) & (df_news_unique['confidence'] <= 0.8)).sum()
low_conf = (df_news_unique['confidence'] <= 0.6).sum()

print(f"\nüìä Distribution par confiance:")
print(f"  Haute (>80%)    : {high_conf:4d} news ({(high_conf/len(df_news_unique)*100):.1f}%)")
print(f"  Moyenne (60-80%): {medium_conf:4d} news ({(medium_conf/len(df_news_unique)*100):.1f}%)")
print(f"  Faible (<60%)   : {low_conf:4d} news ({(low_conf/len(df_news_unique)*100):.1f}%)")

print("="*80)

In [None]:
# Analyse par type d'√©v√©nement
print("\nüì∞ Sentiment par type d'√©v√©nement:")
event_sentiment = df_news_unique.groupby(['event_type', 'sentiment']).size().unstack(fill_value=0)
print(event_sentiment)

# Sentiment moyen par actif
print("\nüìà Top 10 actifs par nombre de news:")
top_assets = df_news_unique['asset'].value_counts().head(10)
for rank, (asset, count) in enumerate(top_assets.items(), 1):
    asset_data = df_news_unique[df_news_unique['asset'] == asset]
    positive_pct = (asset_data['sentiment'] == 'Positive').sum() / len(asset_data) * 100
    avg_conf = asset_data['confidence'].mean()
    print(f"  {rank:2d}. {asset:15s}: {count:3d} news | {positive_pct:.1f}% positive | Conf: {avg_conf:.2%}")

In [None]:
# Exemples de news avec sentiment positif et confiance √©lev√©e
print("\n‚úÖ Top 5 news POSITIVES (haute confiance):")
positive_high = df_news_unique[
    (df_news_unique['sentiment'] == 'Positive') & 
    (df_news_unique['confidence'] > 0.8)
].nlargest(5, 'confidence')

for idx, row in positive_high.iterrows():
    print(f"\n  üìå {row['title'][:70]}...")
    print(f"     Asset: {row['asset']} | Confiance: {row['confidence']:.2%}")

# Exemples de news avec sentiment n√©gatif et confiance √©lev√©e
print("\n‚ùå Top 5 news N√âGATIVES (haute confiance):")
negative_high = df_news_unique[
    (df_news_unique['sentiment'] == 'Negative') & 
    (df_news_unique['confidence'] > 0.8)
].nlargest(5, 'confidence')

for idx, row in negative_high.iterrows():
    print(f"\n  üìå {row['title'][:70]}...")
    print(f"     Asset: {row['asset']} | Confiance: {row['confidence']:.2%}")

## 7. R√©int√©gration avec toutes les lignes du dataset original

In [None]:
# Merger les r√©sultats de sentiment avec le dataset complet (avec duplicatas)
# Cela permet d'avoir le sentiment pour chaque association news-actif

print("üîÑ Fusion des r√©sultats avec le dataset complet...")

# S√©lectionner les colonnes de sentiment
sentiment_cols = ['url', 'sentiment', 'confidence', 'prob_negative', 'prob_positive']
df_sentiment = df_news_unique[sentiment_cols].copy()

# Fusionner avec le dataset original
df_news_enriched = df_news.merge(df_sentiment, on='url', how='left')

print(f"‚úì Dataset enrichi: {len(df_news_enriched)} lignes")
print(f"‚úì Colonnes ajout√©es: sentiment, confidence, prob_negative, prob_positive")

# V√©rifier les valeurs manquantes
missing_sentiment = df_news_enriched['sentiment'].isna().sum()
if missing_sentiment > 0:
    print(f"‚ö†Ô∏è  {missing_sentiment} lignes sans sentiment (probablement des URLs manquantes)")

# Afficher un aper√ßu
print("\nüìã Aper√ßu du dataset enrichi:")
df_news_enriched[['date', 'title', 'asset', 'sentiment', 'confidence', 'relevance_score']].head()

## 8. Export des r√©sultats

In [None]:
# G√©n√©rer un timestamp pour les fichiers de sortie
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Chemin de sortie
output_dir = Path("../Pipeline_Recup_Donnees/data/raw/news")
output_dir.mkdir(parents=True, exist_ok=True)

# Export CSV complet (avec tous les duplicatas actif-news)
output_csv_full = output_dir / f"hybrid_news_sentiment_full_{timestamp}.csv"
df_news_enriched.to_csv(output_csv_full, index=False)
print(f"‚úì Dataset complet sauvegard√©: {output_csv_full}")
print(f"  ({len(df_news_enriched)} lignes)")

# Export CSV news uniques uniquement
output_csv_unique = output_dir / f"hybrid_news_sentiment_unique_{timestamp}.csv"
df_news_unique.to_csv(output_csv_unique, index=False)
print(f"\n‚úì News uniques sauvegard√©es: {output_csv_unique}")
print(f"  ({len(df_news_unique)} news uniques)")

# Export JSON pour int√©gration API
output_json = output_dir / f"hybrid_news_sentiment_{timestamp}.json"
df_news_unique.to_json(output_json, orient='records', indent=2)
print(f"\n‚úì Format JSON sauvegard√©: {output_json}")

In [None]:
# Cr√©er un rapport r√©capitulatif
summary_report = {
    "analysis_date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    "total_news": len(df_news_unique),
    "total_associations": len(df_news_enriched),
    "sentiment_distribution": {
        "positive": int(sentiment_counts.get('Positive', 0)),
        "negative": int(sentiment_counts.get('Negative', 0)),
        "unknown": int(sentiment_counts.get('Unknown', 0))
    },
    "confidence_stats": {
        "mean": float(df_news_unique['confidence'].mean()),
        "median": float(df_news_unique['confidence'].median()),
        "min": float(df_news_unique['confidence'].min()),
        "max": float(df_news_unique['confidence'].max())
    },
    "top_assets": df_news_enriched['asset'].value_counts().head(10).to_dict(),
    "event_types": df_news_enriched['event_type'].value_counts().to_dict()
}

# Export du rapport en JSON
output_summary = output_dir / f"sentiment_analysis_summary_{timestamp}.json"
import json
with open(output_summary, 'w', encoding='utf-8') as f:
    json.dump(summary_report, f, indent=2, ensure_ascii=False)

print(f"\n‚úì Rapport r√©capitulatif sauvegard√©: {output_summary}")

print("\n" + "="*80)
print("‚úÖ ANALYSE DE SENTIMENT TERMIN√âE")
print("="*80)
print("\nüìÅ Fichiers g√©n√©r√©s:")
print(f"  1. {output_csv_full.name}")
print(f"  2. {output_csv_unique.name}")
print(f"  3. {output_json.name}")
print(f"  4. {output_summary.name}")

## 9. Visualisation des r√©sultats (Optionnel)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration du style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 10)

# Cr√©er une figure avec plusieurs sous-graphiques
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Distribution des sentiments
sentiment_counts.plot(kind='bar', ax=axes[0, 0], color=['green', 'red', 'gray'])
axes[0, 0].set_title('Distribution des Sentiments', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Sentiment')
axes[0, 0].set_ylabel('Nombre de News')
axes[0, 0].tick_params(axis='x', rotation=0)

# 2. Distribution de la confiance
axes[0, 1].hist(df_news_unique['confidence'], bins=30, color='skyblue', edgecolor='black')
axes[0, 1].set_title('Distribution de la Confiance', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Confiance')
axes[0, 1].set_ylabel('Fr√©quence')
axes[0, 1].axvline(df_news_unique['confidence'].mean(), color='red', linestyle='--', label='Moyenne')
axes[0, 1].legend()

# 3. Top 10 actifs
top_10_assets = df_news_enriched['asset'].value_counts().head(10)
top_10_assets.plot(kind='barh', ax=axes[1, 0], color='coral')
axes[1, 0].set_title('Top 10 Actifs les Plus Mentionn√©s', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Nombre de News')
axes[1, 0].set_ylabel('Actif')

# 4. Sentiment par type d'√©v√©nement
event_sentiment_pct = df_news_unique.groupby('event_type')['sentiment'].value_counts(normalize=True).unstack()
event_sentiment_pct.plot(kind='bar', stacked=True, ax=axes[1, 1], color=['green', 'red', 'gray'])
axes[1, 1].set_title('Sentiment par Type d\'√âv√©nement', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Type d\'√âv√©nement')
axes[1, 1].set_ylabel('Proportion')
axes[1, 1].legend(title='Sentiment')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(output_dir / f"sentiment_analysis_viz_{timestamp}.png", dpi=300, bbox_inches='tight')
print(f"\n‚úì Visualisations sauvegard√©es: sentiment_analysis_viz_{timestamp}.png")
plt.show()