## 1. Installation & Configuration

In [1]:
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from newsapi.newsapi_client import NewsApiClient
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

print("‚úì Biblioth√®ques charg√©es")

  from .autonotebook import tqdm as notebook_tqdm


‚úì Biblioth√®ques charg√©es


In [2]:
import os
from dotenv import load_dotenv
from pathlib import Path
load_dotenv(Path(".env"))
newsapi = NewsApiClient(api_key=os.getenv("NEWSAPI_API_KEY"))

# Device (CPU ou GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


## 2. Chargement de votre mod√®le FinBERT

In [3]:
# Charger votre mod√®le fine-tun√©
MODEL_PATH = "./news_finbert_sentiment_model"

print("Chargement du mod√®le FinBERT...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=2,
    use_safetensors=True
)
model.to(device)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
print("‚úì Mod√®le FinBERT charg√© et pr√™t")

Chargement du mod√®le FinBERT...
‚úì Mod√®le FinBERT charg√© et pr√™t


## 3.1. Configuration des actifs et mots-cl√©s

### Configuration des actifs avec leurs mots-cl√©s sp√©cifiques

In [4]:
ASSETS_CONFIG = {
    # Indices
    'SP500': {
        'type': 'Index',
        'keywords': ['S&P 500', 'SP500', 'Wall Street', 'Dow Jones', 'Nasdaq', 'Trump', 'Fed', 'Federal Reserve', 'US economy', 'American stocks']
    },
    'CAC40': {
        'type': 'Index',
        'keywords': ['CAC 40', 'CAC40', 'Paris Bourse', 'Euronext Paris', 'Macron', 'French economy', 'France stocks']
    },
    'GER30': {
        'type': 'Index',
        'keywords': ['DAX', 'DAX 30', 'GER30', 'Frankfurt', 'German economy', 'Scholz', 'Germany stocks']
    },
    
    # Entreprises Tech
    'AAPL': {
        'type': 'Company',
        'keywords': ['Apple', 'AAPL', 'iPhone', 'iPad', 'Mac', 'Tim Cook', 'iOS', 'App Store']
    },
    'AMZN': {
        'type': 'Company',
        'keywords': ['Amazon', 'AMZN', 'AWS', 'Bezos', 'Prime', 'e-commerce', 'cloud computing']
    },
    'TSLA': {
        'type': 'Company',
        'keywords': ['Tesla', 'TSLA', 'Elon Musk', 'electric vehicle', 'EV', 'Model 3', 'Model Y', 'Cybertruck']
    },
    
    # Entreprises Fran√ßaises
    'SAN': {
        'type': 'Company',
        'keywords': ['Sanofi', 'SAN', 'pharmaceutical', 'vaccine', 'healthcare']
    },
    'HO': {
        'type': 'Company',
        'keywords': ['Thales', 'HO', 'defense', 'aerospace', 'cybersecurity']
    },
    'MC': {
        'type': 'Company',
        'keywords': ['LVMH', 'Louis Vuitton', 'Bernard Arnault', 'luxury', 'Dior', 'Mo√´t Hennessy']
    },
    'ENGI': {
        'type': 'Company',
        'keywords': ['Engie', 'ENGI', 'energy', 'renewable', 'gas', 'electricity']
    },
    'TTE': {
        'type': 'Company',
        'keywords': ['TotalEnergies', 'Total', 'TTE', 'oil', 'gas', 'energy', 'petroleum']
    },
    'AIR': {
        'type': 'Company',
        'keywords': ['Airbus', 'AIR', 'aircraft', 'A320', 'A350', 'aviation', 'aerospace']
    },
    
    # Commodit√©s
    'OIL': {
        'type': 'Commodity',
        'keywords': ['crude oil', 'oil price', 'petroleum', 'Brent', 'WTI', 'OPEC', 'energy prices']
    },
    'GOLD': {
        'type': 'Commodity',
        'keywords': ['gold', 'gold price', 'precious metal', 'bullion', 'gold market']
    }
}

def build_query_from_keywords(keywords):
    """Construit une requ√™te OR pour NewsAPI"""
    return ' OR '.join([f'"{kw}"' for kw in keywords])

print(f"‚úì {len(ASSETS_CONFIG)} actifs configur√©s")

‚úì 14 actifs configur√©s


## 3.2 Fonction de r√©cup√©ration de news

In [5]:
def fetch_financial_news(query=None, 
                         country=None, 
                         language="en",
                         page_size=100,
                         categories=None,
                         days_back=7,
                         use_asset_keywords=False):
    """
    R√©cup√®re des news financi√®res via NewsAPI
    
    Args:
        query: Mots-cl√©s personnalis√©s (ex: "Apple OR Tesla")
        country: Code pays ISO (ex: "us", "fr", "gb")
        language: Code langue ("en", "fr", etc.)
        page_size: Nombre d'articles (max 100)
        categories: Liste de cat√©gories ['business', 'health', 'science', 'technology']
        days_back: Nombre de jours en arri√®re
        use_asset_keywords: Si True, utilise les mots-cl√©s des actifs configur√©s
    
    Returns:
        Liste de dictionnaires avec les news + asset mapping
    """
    
    from_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')
    
    if categories is None:
        categories = ['business', 'health', 'science', 'technology']
    
    all_news = []
    
    # PARTIE 1: R√©cup√©ration par CAT√âGORIES (top-headlines)
    print("\n" + "="*80)
    print("üì∞ R√âCUP√âRATION PAR CAT√âGORIES (Top Headlines)")
    print("="*80)
    
    for category in categories:
        try:
            print(f"\nüîç Cat√©gorie: {category}...")
            
            articles = newsapi.get_top_headlines(
                q=query if query and not use_asset_keywords else None,
                language=language,
                page_size=page_size,
                category=category,
                country=country
            )
            
            for article in articles.get('articles', []):
                all_news.append({
                    'title': article.get('title', ''),
                    'description': article.get('description', ''),
                    'content': article.get('content', ''),
                    'source': article.get('source', {}).get('name', 'Unknown'),
                    'url': article.get('url', ''),
                    'published_at': article.get('publishedAt', ''),
                    'author': article.get('author', 'Unknown'),
                    'news_category': category,
                    'query_keyword': 'CATEGORY_' + category.upper(),
                    'asset_ticker': None,  # Sera rempli apr√®s
                    'asset_type': None
                })
            
            print(f"   ‚úì {len(articles.get('articles', []))} articles")
        
        except Exception as e:
            print(f"   ‚ùå Erreur: {e}")
            continue
    
    # PARTIE 2: R√©cup√©ration par MOTS-CL√âS des actifs (everything)
    if use_asset_keywords:
        print("\n" + "="*80)
        print("üéØ R√âCUP√âRATION PAR ACTIFS (Mots-cl√©s sp√©cifiques)")
        print("="*80)
        
        for ticker, config in ASSETS_CONFIG.items():
            try:
                asset_query = build_query_from_keywords(config['keywords'])
                print(f"\nüîç Actif: {ticker} ({config['type']})...")
                print(f"   Query: {asset_query[:100]}...")
                
                articles = newsapi.get_everything(
                    q=asset_query,
                    language=language,
                    from_param=from_date,
                    page_size=page_size,
                    sort_by='publishedAt'
                )
                
                for article in articles.get('articles', []):
                    all_news.append({
                        'title': article.get('title', ''),
                        'description': article.get('description', ''),
                        'content': article.get('content', ''),
                        'source': article.get('source', {}).get('name', 'Unknown'),
                        'url': article.get('url', ''),
                        'published_at': article.get('publishedAt', ''),
                        'author': article.get('author', 'Unknown'),
                        'news_category': 'KEYWORD_SEARCH',
                        'query_keyword': ticker,
                        'asset_ticker': ticker,
                        'asset_type': config['type']
                    })
                
                print(f"   ‚úì {len(articles.get('articles', []))} articles")
            
            except Exception as e:
                print(f"   ‚ùå Erreur pour {ticker}: {e}")
                continue
    
    # Supprimer les doublons bas√©s sur l'URL
    unique_news = []
    seen_urls = set()
    for news in all_news:
        if news['url'] not in seen_urls:
            seen_urls.add(news['url'])
            unique_news.append(news)
    
    print(f"\n{'='*80}")
    print(f"‚úÖ TOTAL: {len(unique_news)} articles uniques (sur {len(all_news)} r√©cup√©r√©s)")
    print(f"{'='*80}\n")
    
    return unique_news

In [6]:
# Ajouter apr√®s la cellule fetch_financial_news

## 3.2. Fonction de matching automatique des actifs

def match_news_to_assets(news_text):
    """
    D√©tecte automatiquement les actifs mentionn√©s dans un texte
    
    Args:
        news_text: Titre + description de la news
    
    Returns:
        Liste de tuples (ticker, type) des actifs trouv√©s
    """
    text_lower = news_text.lower()
    matched_assets = []
    
    for ticker, config in ASSETS_CONFIG.items():
        for keyword in config['keywords']:
            if keyword.lower() in text_lower:
                matched_assets.append((ticker, config['type']))
                break  # Un seul match par actif suffit
    
    return matched_assets

# Test
test_text = "Apple's iPhone sales surge as Tesla announces new Model Y"
matches = match_news_to_assets(test_text)
print(f"Test: '{test_text}'")
print(f"Actifs d√©tect√©s: {matches}")

Test: 'Apple's iPhone sales surge as Tesla announces new Model Y'
Actifs d√©tect√©s: [('AAPL', 'Company'), ('TSLA', 'Company'), ('HO', 'Company')]


In [7]:
# √Ä ajouter apr√®s la cellule "3.2 Fonction de matching automatique des actifs"

## 3.3 Filtrage des News Non-Financi√®res

# Cat√©gories et sources √† exclure
EXCLUDED_SOURCES = {
    'DogTime', 'Parade', 'AOL.com', 'Yahoo Entertainment',
    'Slate Magazine', 'Gizmodo.com', 'Tom\'s Guide', 'Mashable',
    'Variety', 'The Verge', 'Wired', 'Gaming', 'Kotaku',
    'IGN', 'PC Gamer', 'Polygon'
}

EXCLUDED_KEYWORDS = {
    'gaming', 'video game', 'esports', 'game', 'player',
    'celebrity', 'entertainment', 'movie', 'music', 'actor', 'actress',
    'sports', 'football', 'basketball', 'soccer', 'tennis',
    'health tips', 'diet', 'workout', 'fitness', 'recipe',
    'pet', 'animal', 'dog', 'cat', 'puppy',
    'dating', 'relationship', 'advice', 'love',
    'celebrity gossip', 'reality tv', 'streaming service'
}

def filter_non_financial_news(df):
    """
    Filtre les news non-financi√®res (Niveau 1 & 2)
    
    Args:
        df: DataFrame avec les news
    
    Returns:
        DataFrame filtr√© (news financi√®res uniquement)
    """
    if df.empty:
        return df
    
    before_count = len(df)
    
    # NIVEAU 1: Exclure par source
    print("\nüîç FILTRAGE NIVEAU 1 - Sources exclues")
    print(f"   Avant: {len(df)} news")
    
    df_filtered = df[~df['source'].isin(EXCLUDED_SOURCES)].copy()
    removed_by_source = before_count - len(df_filtered)
    print(f"   Apr√®s: {len(df_filtered)} news (-{removed_by_source})")
    
    # NIVEAU 2: Exclure par mots-cl√©s
    print("\nüîç FILTRAGE NIVEAU 2 - Mots-cl√©s non-financiers")
    before_keywords = len(df_filtered)
    
    title_description = (
        df_filtered['title'].str.lower().fillna('') + ' ' + 
        df_filtered['description'].str.lower().fillna('')
    )
    
    # Exclure si un mot-cl√© non-financier est pr√©sent
    mask = ~title_description.str.contains(
        '|'.join(EXCLUDED_KEYWORDS),
        regex=True,
        na=False
    )
    df_filtered = df_filtered[mask].copy()
    removed_by_keywords = before_keywords - len(df_filtered)
    print(f"   Avant: {before_keywords} news")
    print(f"   Apr√®s: {len(df_filtered)} news (-{removed_by_keywords})")
    
    print(f"\n‚úÖ TOTAL FILTR√â: {before_count} ‚Üí {len(df_filtered)} news")
    print(f"   √âlimin√©es: {before_count - len(df_filtered)} ({(before_count - len(df_filtered)) / before_count * 100:.1f}%)")
    
    return df_filtered


# Test du filtre
print("="*80)
print("TEST DU FILTRE NON-FINANCIER")
print("="*80)

test_articles = pd.DataFrame({
    'title': [
        'Apple Stock Hits Record High',
        'Top 10 Gaming Trends in 2026',
        'Tesla Announces New Factory',
        'Celebrity Gossip: Brangelina Reunion?',
        'Fed Raises Interest Rates to 5.5%',
        'Best Workout Tips for Summer'
    ],
    'description': [
        'AAPL closes at $250',
        'Best video games to play now',
        'Gigafactory expansion plans',
        'Hollywood news update',
        'Federal Reserve monetary policy decision',
        'Fitness advice for beginners'
    ],
    'source': ['Bloomberg', 'IGN', 'Reuters', 'Variety', 'CNBC', 'Fitness Magazine'],
    'sentiment': ['Positive', 'Positive', 'Positive', 'Neutral', 'Negative', 'Positive'],
    'confidence': [0.9, 0.95, 0.88, 0.92, 0.91, 0.87],
    'url': [f'http://example.com/{i}' for i in range(6)]
})

print("\nüì∞ AVANT FILTRAGE:")
print(test_articles[['title', 'source']])

test_filtered = filter_non_financial_news(test_articles)

print("\nüì∞ APR√àS FILTRAGE:")
print(test_filtered[['title', 'source']])

TEST DU FILTRE NON-FINANCIER

üì∞ AVANT FILTRAGE:
                                   title            source
0           Apple Stock Hits Record High         Bloomberg
1           Top 10 Gaming Trends in 2026               IGN
2            Tesla Announces New Factory           Reuters
3  Celebrity Gossip: Brangelina Reunion?           Variety
4      Fed Raises Interest Rates to 5.5%              CNBC
5           Best Workout Tips for Summer  Fitness Magazine

üîç FILTRAGE NIVEAU 1 - Sources exclues
   Avant: 6 news
   Apr√®s: 4 news (-2)

üîç FILTRAGE NIVEAU 2 - Mots-cl√©s non-financiers
   Avant: 4 news
   Apr√®s: 2 news (-2)

‚úÖ TOTAL FILTR√â: 6 ‚Üí 2 news
   √âlimin√©es: 4 (66.7%)

üì∞ APR√àS FILTRAGE:
                               title     source
0       Apple Stock Hits Record High  Bloomberg
4  Fed Raises Interest Rates to 5.5%       CNBC


## 4. Fonction d'analyse de sentiment

In [8]:
def analyze_sentiment(text, model, tokenizer, device, max_length=512):
    """
    Analyse le sentiment d'un texte avec votre mod√®le FinBERT
    
    Args:
        text: Texte √† analyser
        model: Mod√®le FinBERT
        tokenizer: Tokenizer
        device: CPU ou GPU
        max_length: Longueur max (512 pour news)
    
    Returns:
        dict avec sentiment, confiance, probabilit√©s
    """
    
    if not text or len(text.strip()) == 0:
        return {
            'sentiment': 'Unknown',
            'confidence': 0.0,
            'prob_negative': 0.5,
            'prob_positive': 0.5
        }
    
    # Tokenization
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    # Pr√©diction
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=1)[0]
        
        prediction = torch.argmax(logits, dim=1).item()
        confidence = probabilities[prediction].item()
    
    sentiment_label = "Positive" if prediction == 1 else "Negative"
    
    return {
        'sentiment': sentiment_label,
        'confidence': confidence,
        'prob_negative': probabilities[0].item(),
        'prob_positive': probabilities[1].item()
    }

## 5. Pipeline complet : Fetch + Analyze

In [9]:
# Remplacer la cellule "5. Pipeline complet : Fetch + Analyze"

def analyze_news_pipeline(query=None, country=None, page_size=25, categories=None, use_asset_keywords=True, apply_filter=True):
    """
    Pipeline complet : r√©cup√®re les news et analyse le sentiment
    
    Args:
        query: Mots-cl√©s de recherche personnalis√©s
        country: Code pays ISO
        page_size: Nombre d'articles par cat√©gorie/actif
        categories: Liste de cat√©gories
        use_asset_keywords: Si True, boucle sur tous les actifs configur√©s
        apply_filter: Si True, applique le filtre non-financier
    
    Returns:
        DataFrame avec news + analyse + mapping actifs
    """
    
    print("="*80)
    print("üöÄ PIPELINE COMPLET - NEWS + SENTIMENT + ASSET MAPPING")
    print("="*80)
    
    # 1. R√©cup√©rer les news
    news_list = fetch_financial_news(
        query=query, 
        country=country, 
        page_size=page_size,
        categories=categories,
        use_asset_keywords=use_asset_keywords
    )
    
    if not news_list:
        print("‚ùå Aucune news r√©cup√©r√©e")
        return pd.DataFrame()
    
    # 2. Analyser chaque news + mapper aux actifs
    print("\nü§ñ ANALYSE DE SENTIMENT + MAPPING ACTIFS")
    print("="*80)
    
    results = []
    
    for i, news in enumerate(news_list, 1):
        full_text = f"{news['title']} {news['description']}"
        
        # Analyse de sentiment
        sentiment_result = analyze_sentiment(full_text, model, tokenizer, device)
        
        # Si asset_ticker n'est pas d√©j√† d√©fini (cas des cat√©gories), on le d√©tecte
        if news['asset_ticker'] is None:
            matched_assets = match_news_to_assets(full_text)
            
            # Si aucun actif d√©tect√©, on cr√©e une seule ligne "Unknown"
            if not matched_assets:
                matched_assets = [('UNKNOWN', 'Unknown')]
            
            # Cr√©er une ligne pour chaque actif d√©tect√©
            for ticker, asset_type in matched_assets:
                results.append({
                    'asset_ticker': ticker,
                    'asset_type': asset_type,
                    'query_keyword': news['query_keyword'],
                    'news_category': news['news_category'],
                    'title': news['title'],
                    'description': news.get('description', ''),
                    'source': news['source'],
                    'published_at': news['published_at'],
                    'sentiment': sentiment_result['sentiment'],
                    'confidence': sentiment_result['confidence'],
                    'prob_negative': sentiment_result['prob_negative'],
                    'prob_positive': sentiment_result['prob_positive'],
                    'url': news['url']
                })
        else:
            # Asset d√©j√† d√©fini (cas des mots-cl√©s)
            results.append({
                'asset_ticker': news['asset_ticker'],
                'asset_type': news['asset_type'],
                'query_keyword': news['query_keyword'],
                'news_category': news['news_category'],
                'title': news['title'],
                'description': news.get('description', ''),
                'source': news['source'],
                'published_at': news['published_at'],
                'sentiment': sentiment_result['sentiment'],
                'confidence': sentiment_result['confidence'],
                'prob_negative': sentiment_result['prob_negative'],
                'prob_positive': sentiment_result['prob_positive'],
                'url': news['url']
            })
        
        if i % 25 == 0:
            print(f"   ‚úì Analys√© {i}/{len(news_list)} news...")
    
    df = pd.DataFrame(results)
    
    # 3. Appliquer le filtre non-financier (NOUVEAU)
    if apply_filter and not df.empty:
        print(f"\n{'='*80}")
        print("üßπ FILTRAGE NON-FINANCIER")
        print(f"{'='*80}")
        df = filter_non_financial_news(df)
    
    print(f"\n{'='*80}")
    print(f"‚úÖ {len(df)} associations news-actifs analys√©es")
    print(f"{'='*80}\n")
    
    return df

## 6. Exemples d'utilisation

In [10]:
# Remplacer la cellule "6. Exemples d'utilisation"

## 6. Exemples d'utilisation avec filtrage

# EXEMPLE 1: R√©cup√©ration compl√®te avec FILTRAGE ACTIF
df_finance = analyze_news_pipeline(
    page_size=100,
    categories=['business', 'health', 'science', 'technology'],
    use_asset_keywords=True,
    apply_filter=True  # ‚úÖ Active le filtrage
)

# Aper√ßu des r√©sultats
print("\nüìä APER√áU DES R√âSULTATS")
print(df_finance[['asset_ticker', 'asset_type', 'query_keyword', 'title', 'sentiment', 'confidence']].head(20))

# Statistiques globales
print("\nüìà STATISTIQUES GLOBALES:")
print(f"Total news-actifs: {len(df_finance)}")
print(f"Actifs uniques: {df_finance['asset_ticker'].nunique()}")
print(f"Sources uniques: {df_finance['source'].nunique()}")

# Statistiques par actif
print("\nüìä STATISTIQUES PAR ACTIF:")
asset_stats = df_finance.groupby('asset_ticker').agg({
    'title': 'count',
    'sentiment': lambda x: (x == 'Positive').sum() / len(x) if len(x) > 0 else 0,
    'confidence': 'mean'
}).rename(columns={
    'title': 'count',
    'sentiment': 'positive_ratio',
    'confidence': 'avg_confidence'
}).sort_values('count', ascending=False)

print(asset_stats.head(15))

# Statistiques par type d'actif
print("\nüìä STATISTIQUES PAR TYPE:")
type_stats = df_finance.groupby('asset_type').agg({
    'title': 'count',
    'confidence': 'mean'
})
print(type_stats)

üöÄ PIPELINE COMPLET - NEWS + SENTIMENT + ASSET MAPPING

üì∞ R√âCUP√âRATION PAR CAT√âGORIES (Top Headlines)

üîç Cat√©gorie: business...
   ‚úì 30 articles

üîç Cat√©gorie: health...
   ‚úì 55 articles

üîç Cat√©gorie: science...
   ‚úì 55 articles

üîç Cat√©gorie: technology...
   ‚úì 68 articles

üéØ R√âCUP√âRATION PAR ACTIFS (Mots-cl√©s sp√©cifiques)

üîç Actif: SP500 (Index)...
   Query: "S&P 500" OR "SP500" OR "Wall Street" OR "Dow Jones" OR "Nasdaq" OR "Trump" OR "Fed" OR "Federal Res...
   ‚úì 87 articles

üîç Actif: CAC40 (Index)...
   Query: "CAC 40" OR "CAC40" OR "Paris Bourse" OR "Euronext Paris" OR "Macron" OR "French economy" OR "France...
   ‚úì 93 articles

üîç Actif: GER30 (Index)...
   Query: "DAX" OR "DAX 30" OR "GER30" OR "Frankfurt" OR "German economy" OR "Scholz" OR "Germany stocks"...
   ‚úì 95 articles

üîç Actif: AAPL (Company)...
   Query: "Apple" OR "AAPL" OR "iPhone" OR "iPad" OR "Mac" OR "Tim Cook" OR "iOS" OR "App Store"...
   ‚úì 94 articles

ü

In [None]:
# Exemple 2 : News sur des entreprises sp√©cifiques
df_tech = analyze_news_pipeline(
    query="Apple OR Tesla OR Microsoft OR Google",
    page_size=100
)

# Filtrer par confiance √©lev√©e (> 80%)
df_high_confidence = df_tech[df_tech['confidence'] > 0.8]

print(f"\nüéØ News avec confiance > 80% : {len(df_high_confidence)} articles")
print(df_high_confidence[['title', 'sentiment', 'confidence']])

üîÑ D√âBUT DU PIPELINE

üì° R√©cup√©ration des news (query='Apple OR Tesla OR Microsoft OR Google')...
‚úì 0 articles r√©cup√©r√©s
‚ùå Aucune news r√©cup√©r√©e


KeyError: 'confidence'

In [None]:
# Exemple 3 : Analyse par sentiment
print("\nüìà STATISTIQUES")
print(f"Sentiment positif : {(df_tech['sentiment'] == 'Positive').sum()} articles")
print(f"Sentiment n√©gatif : {(df_tech['sentiment'] == 'Negative').sum()} articles")
print(f"\nConfiance moyenne : {df_tech['confidence'].mean():.2%}")


üìà STATISTIQUES
Sentiment positif : 52 articles
Sentiment n√©gatif : 46 articles

Confiance moyenne : 87.96%


In [11]:
# Nouvelle cellule apr√®s les exemples

## BONUS: Comparaison avec/sans filtre

# Sans filtre
df_no_filter = analyze_news_pipeline(
    page_size=50,
    categories=['business'],
    use_asset_keywords=False,
    apply_filter=False  # D√©sactive le filtre
)

# Avec filtre
df_with_filter = analyze_news_pipeline(
    page_size=50,
    categories=['business'],
    use_asset_keywords=False,
    apply_filter=True  # Active le filtre
)

print("\n" + "="*80)
print("üìä COMPARAISON AVANT/APR√àS FILTRE")
print("="*80)
print(f"\nSans filtre: {len(df_no_filter)} articles")
print(f"Avec filtre: {len(df_with_filter)} articles")
print(f"Bruit √©limin√©: {len(df_no_filter) - len(df_with_filter)} ({(len(df_no_filter) - len(df_with_filter)) / len(df_no_filter) * 100:.1f}%)")

print("\nüì∞ Exemples d'articles √âLIMIN√âS par le filtre:")
removed = df_no_filter[~df_no_filter['url'].isin(df_with_filter['url'])]
print(removed[['title', 'source', 'asset_ticker']].head(10))

üöÄ PIPELINE COMPLET - NEWS + SENTIMENT + ASSET MAPPING

üì∞ R√âCUP√âRATION PAR CAT√âGORIES (Top Headlines)

üîç Cat√©gorie: business...
   ‚úì 46 articles

‚úÖ TOTAL: 46 articles uniques (sur 46 r√©cup√©r√©s)


ü§ñ ANALYSE DE SENTIMENT + MAPPING ACTIFS
   ‚úì Analys√© 25/46 news...

‚úÖ 67 associations news-actifs analys√©es

üöÄ PIPELINE COMPLET - NEWS + SENTIMENT + ASSET MAPPING

üì∞ R√âCUP√âRATION PAR CAT√âGORIES (Top Headlines)

üîç Cat√©gorie: business...
   ‚úì 46 articles

‚úÖ TOTAL: 46 articles uniques (sur 46 r√©cup√©r√©s)


ü§ñ ANALYSE DE SENTIMENT + MAPPING ACTIFS
   ‚úì Analys√© 25/46 news...

üßπ FILTRAGE NON-FINANCIER

üîç FILTRAGE NIVEAU 1 - Sources exclues
   Avant: 67 news
   Apr√®s: 64 news (-3)

üîç FILTRAGE NIVEAU 2 - Mots-cl√©s non-financiers
   Avant: 64 news
   Apr√®s: 58 news (-6)

‚úÖ TOTAL FILTR√â: 67 ‚Üí 58 news
   √âlimin√©es: 9 (13.4%)

‚úÖ 58 associations news-actifs analys√©es


üìä COMPARAISON AVANT/APR√àS FILTRE

Sans filtre: 67 articles
Av

## 7. Export des r√©sultats (pour int√©gration √©quipe)

In [11]:
# Sauvegarder en CSV
output_file = f"sentiment_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
df_finance.to_csv(output_file, index=False)
print(f"‚úì R√©sultats sauvegard√©s dans : {output_file}")

# Ou en JSON pour API
output_json = output_file.replace('.csv', '.json')
df_finance.to_json(output_json, orient='records', indent=2)
print(f"‚úì R√©sultats JSON dans : {output_json}")

‚úì R√©sultats sauvegard√©s dans : sentiment_analysis_20260204_113632.csv
‚úì R√©sultats JSON dans : sentiment_analysis_20260204_113632.json


## 8. Classification Financi√®re des News (Gatekeeper DistilBERT)

### Chargement du mod√®le de classification binaire Financial/Non-Financial

In [13]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

class FinancialNewsClassifier:
    """Classification binaire: Financi√®re vs Non-Financi√®re"""
    
    def __init__(self, model_path, device='cpu'):
        self.device = device
        print(f"üì¶ Chargement du classificateur DistilBERT depuis {model_path}...")
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_path)
        self.model = DistilBertForSequenceClassification.from_pretrained(model_path)
        self.model = self.model.to(device)
        self.model.eval()
        print("‚úì Classificateur charg√© et pr√™t")
    
    def predict(self, text, threshold=0.5):
        """
        Pr√©dit si le texte est une news financi√®re
        
        Args:
            text: Texte √† classifier
            threshold: Seuil de d√©cision (d√©faut: 0.5)
        
        Returns:
            dict avec is_financial (bool), confidence (float), label (str)
        """
        if not text or len(text.strip()) == 0:
            return {
                'is_financial': 0,
                'confidence': 0.5,
                'label': 'Unknown'
            }
        
        # Tokenization
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=256,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        
        # Pr√©diction
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            probabilities = torch.softmax(outputs.logits, dim=1)
            financial_prob = probabilities[0][1].item()
        
        is_financial = int(financial_prob >= threshold)
        
        return {
            'is_financial': is_financial,
            'confidence': financial_prob,
            'label': 'Financial' if is_financial else 'Non-Financial'
        }
    
    def batch_predict(self, texts, threshold=0.5, batch_size=16):
        """Pr√©diction par batch pour optimiser les performances"""
        results = []
        
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            
            # Tokenization du batch
            encodings = self.tokenizer(
                batch_texts,
                add_special_tokens=True,
                max_length=256,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            )
            
            input_ids = encodings['input_ids'].to(self.device)
            attention_mask = encodings['attention_mask'].to(self.device)
            
            # Pr√©diction
            with torch.no_grad():
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                probabilities = torch.softmax(outputs.logits, dim=1)
                financial_probs = probabilities[:, 1].cpu().numpy()
            
            # Conversion en r√©sultats
            for prob in financial_probs:
                is_financial = int(prob >= threshold)
                results.append({
                    'is_financial': is_financial,
                    'confidence': float(prob),
                    'label': 'Financial' if is_financial else 'Non-Financial'
                })
        
        return results

# Charger le classificateur
CLASSIFIER_MODEL_PATH = "./financial_news_classifier/best_model"

financial_classifier = FinancialNewsClassifier(
    CLASSIFIER_MODEL_PATH, 
    device=device
)

print("\n‚úÖ Classificateur DistilBERT pr√™t pour filtrage Financial/Non-Financial")

üì¶ Chargement du classificateur DistilBERT depuis ./financial_news_classifier/best_model...
‚úì Classificateur charg√© et pr√™t

‚úÖ Classificateur DistilBERT pr√™t pour filtrage Financial/Non-Financial


### Test du classificateur sur quelques exemples

In [14]:
## Test du classificateur

test_texts = [
    "Apple stock hits record high on strong iPhone sales",
    "Best video games to play this summer",
    "Federal Reserve raises interest rates to combat inflation",
    "Celebrity gossip: Hollywood stars reunite",
    "Tesla announces new Gigafactory in Texas",
    "Top 10 workout tips for beginners"
]

print("="*80)
print("üß™ TEST DU CLASSIFICATEUR FINANCIER")
print("="*80)

for text in test_texts:
    result = financial_classifier.predict(text)
    emoji = "üí∞" if result['is_financial'] else "üö´"
    print(f"\n{emoji} [{result['label']}] (confiance: {result['confidence']:.2%})")
    print(f"   '{text}'")

üß™ TEST DU CLASSIFICATEUR FINANCIER

üí∞ [Financial] (confiance: 99.54%)
   'Apple stock hits record high on strong iPhone sales'

üö´ [Non-Financial] (confiance: 0.05%)
   'Best video games to play this summer'

üí∞ [Financial] (confiance: 99.95%)
   'Federal Reserve raises interest rates to combat inflation'

üö´ [Non-Financial] (confiance: 0.87%)
   'Celebrity gossip: Hollywood stars reunite'

üö´ [Non-Financial] (confiance: 0.68%)
   'Tesla announces new Gigafactory in Texas'

üö´ [Non-Financial] (confiance: 0.06%)
   'Top 10 workout tips for beginners'


### Application du classificateur sur le CSV existant

In [15]:
## Charger le CSV existant (dernier fichier g√©n√©r√©)
import glob
import os

# Trouver le dernier CSV de sentiment
csv_files = glob.glob("sentiment_analysis_*.csv")
if csv_files:
    latest_csv = max(csv_files, key=os.path.getctime)
    print(f"üìÇ Chargement du CSV: {latest_csv}")
    df_existing = pd.read_csv(latest_csv)
    print(f"   {len(df_existing)} news charg√©es")
else:
    print("‚ùå Aucun CSV trouv√©. Utilisez 'df_finance' du notebook.")
    df_existing = df_finance.copy()

print(f"\nüìä Aper√ßu des colonnes:")
print(df_existing.columns.tolist())

üìÇ Chargement du CSV: sentiment_analysis_20260204_113632.csv
   975 news charg√©es

üìä Aper√ßu des colonnes:
['asset_ticker', 'asset_type', 'query_keyword', 'news_category', 'title', 'description', 'source', 'published_at', 'sentiment', 'confidence', 'prob_negative', 'prob_positive', 'url']


In [16]:
## Application du classificateur sur toutes les news

print("\n" + "="*80)
print("ü§ñ CLASSIFICATION FINANCI√àRE DE TOUTES LES NEWS")
print("="*80)

# Cr√©er le texte complet (titre + description)
df_existing['full_text'] = (
    df_existing['title'].fillna('') + ' ' + 
    df_existing['description'].fillna('')
)

# Classification par batch (plus rapide)
print(f"\nüîÑ Classification de {len(df_existing)} news...")
texts_to_classify = df_existing['full_text'].tolist()

classification_results = financial_classifier.batch_predict(
    texts_to_classify, 
    threshold=0.5,
    batch_size=32
)

# Ajouter les r√©sultats au DataFrame
df_existing['is_financial'] = [r['is_financial'] for r in classification_results]
df_existing['financial_confidence'] = [r['confidence'] for r in classification_results]
df_existing['financial_label'] = [r['label'] for r in classification_results]

# Supprimer la colonne temporaire
df_existing = df_existing.drop('full_text', axis=1)

print(f"‚úÖ Classification termin√©e!")

# Statistiques
print(f"\nüìä R√âSULTATS DE CLASSIFICATION:")
print(f"   News financi√®res (is_financial=1): {(df_existing['is_financial'] == 1).sum()} ({(df_existing['is_financial'] == 1).sum() / len(df_existing) * 100:.1f}%)")
print(f"   News non-financi√®res (is_financial=0): {(df_existing['is_financial'] == 0).sum()} ({(df_existing['is_financial'] == 0).sum() / len(df_existing) * 100:.1f}%)")
print(f"   Confiance moyenne: {df_existing['financial_confidence'].mean():.2%}")

# Aper√ßu
print(f"\nüì∞ APER√áU DES R√âSULTATS:")
print(df_existing[['title', 'is_financial', 'financial_label', 'financial_confidence', 'sentiment']].head(15))


ü§ñ CLASSIFICATION FINANCI√àRE DE TOUTES LES NEWS

üîÑ Classification de 975 news...
‚úÖ Classification termin√©e!

üìä R√âSULTATS DE CLASSIFICATION:
   News financi√®res (is_financial=1): 254 (26.1%)
   News non-financi√®res (is_financial=0): 721 (73.9%)
   Confiance moyenne: 26.46%

üì∞ APER√áU DES R√âSULTATS:
                                                title  is_financial  \
0   S&P 500 futures climb after index starts Febru...             1   
1   S&P 500 futures climb after index starts Febru...             1   
2   China Bans Hidden Car Door Handles in World-Fi...             0   
3   Switch has now outsold DS, becomes best-sellin...             0   
4   Switch has now outsold DS, becomes best-sellin...             0   
5   Powerball: See the winning numbers in Monday‚Äôs...             1   
6   Asian Stocks Set to Climb After US Data Lifts ...             1   
7   Elon Musk's SpaceX confirms it is taking over ...             0   
8   Elon Musk's SpaceX confirms it is ta

### Analyse des r√©sultats par cat√©gorie

In [17]:
## Analyse d√©taill√©e par actif et cat√©gorie

print("="*80)
print("üìä ANALYSE PAR ACTIF")
print("="*80)

asset_analysis = df_existing.groupby('asset_ticker').agg({
    'title': 'count',
    'is_financial': 'sum',
    'financial_confidence': 'mean'
}).rename(columns={
    'title': 'total_news',
    'is_financial': 'financial_news',
    'financial_confidence': 'avg_confidence'
})
asset_analysis['financial_ratio'] = (asset_analysis['financial_news'] / asset_analysis['total_news'] * 100).round(1)
asset_analysis = asset_analysis.sort_values('total_news', ascending=False)

print(asset_analysis.head(15))

print("\n" + "="*80)
print("üìä ANALYSE PAR TYPE D'ACTIF")
print("="*80)

type_analysis = df_existing.groupby('asset_type').agg({
    'title': 'count',
    'is_financial': 'sum',
    'financial_confidence': 'mean'
}).rename(columns={
    'title': 'total_news',
    'is_financial': 'financial_news',
    'financial_confidence': 'avg_confidence'
})
type_analysis['financial_ratio'] = (type_analysis['financial_news'] / type_analysis['total_news'] * 100).round(1)

print(type_analysis)

# Exemples de news NON-financi√®res d√©tect√©es
print("\n" + "="*80)
print("üö´ EXEMPLES DE NEWS NON-FINANCI√àRES D√âTECT√âES")
print("="*80)

non_financial = df_existing[df_existing['is_financial'] == 0].sort_values('financial_confidence')
print(non_financial[['title', 'source', 'financial_label', 'financial_confidence']].head(10))

üìä ANALYSE PAR ACTIF
              total_news  financial_news  avg_confidence  financial_ratio
asset_ticker                                                             
TSLA                 130              20        0.166121             15.4
HO                   124              11        0.090774              8.9
AAPL                  80              10        0.134594             12.5
CAC40                 77              21        0.277580             27.3
SP500                 75              35        0.466191             46.7
OIL                   64              48        0.743126             75.0
GER30                 63              14        0.223973             22.2
AMZN                  60              18        0.305016             30.0
MC                    59              16        0.263810             27.1
ENGI                  48              11        0.249496             22.9
GOLD                  48              20        0.417234             41.7
SAN            

### Export du CSV avec classification financi√®re

In [18]:
## Sauvegarder le nouveau CSV avec classification financi√®re

output_filename = f"hybrid_news_financial_classified_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

df_existing.to_csv(output_filename, index=False)

print("="*80)
print("üíæ EXPORT TERMIN√â")
print("="*80)
print(f"\n‚úÖ CSV sauvegard√©: {output_filename}")
print(f"   Total: {len(df_existing)} news")
print(f"   Colonnes: {len(df_existing.columns)}")
print(f"\nüìã Colonnes incluses:")
for col in df_existing.columns:
    print(f"   ‚Ä¢ {col}")

# Export JSON √©galement
output_json = output_filename.replace('.csv', '.json')
df_existing.to_json(output_json, orient='records', indent=2)
print(f"\n‚úÖ JSON sauvegard√©: {output_json}")

üíæ EXPORT TERMIN√â

‚úÖ CSV sauvegard√©: hybrid_news_financial_classified_20260204_114901.csv
   Total: 975 news
   Colonnes: 16

üìã Colonnes incluses:
   ‚Ä¢ asset_ticker
   ‚Ä¢ asset_type
   ‚Ä¢ query_keyword
   ‚Ä¢ news_category
   ‚Ä¢ title
   ‚Ä¢ description
   ‚Ä¢ source
   ‚Ä¢ published_at
   ‚Ä¢ sentiment
   ‚Ä¢ confidence
   ‚Ä¢ prob_negative
   ‚Ä¢ prob_positive
   ‚Ä¢ url
   ‚Ä¢ is_financial
   ‚Ä¢ financial_confidence
   ‚Ä¢ financial_label

‚úÖ JSON sauvegard√©: hybrid_news_financial_classified_20260204_114901.json


### üìå Pipeline complet recommand√© : Filtrage en 2 √©tapes

**√âTAPE 1** : Classification binaire DistilBERT (is_financial)  
**√âTAPE 2** : Analyse sentiment FinBERT (uniquement si is_financial=1)

Ceci permet d'√©conomiser des ressources en √©vitant d'analyser le sentiment de news non-financi√®res.

In [None]:
## BONUS : Filtrer uniquement les news financi√®res pour analyse approfondie

df_financial_only = df_existing[df_existing['is_financial'] == 1].copy()

print("="*80)
print("üí∞ DATASET FILTR√â - NEWS FINANCI√àRES UNIQUEMENT")
print("="*80)
print(f"\nAvant filtrage: {len(df_existing)} news")
print(f"Apr√®s filtrage: {len(df_financial_only)} news financi√®res")
print(f"√âlimin√©es: {len(df_existing) - len(df_financial_only)} news non-financi√®res ({(len(df_existing) - len(df_financial_only)) / len(df_existing) * 100:.1f}%)")

# Statistiques sentiment sur news financi√®res uniquement
print(f"\nüìä SENTIMENT DES NEWS FINANCI√àRES:")
print(f"   Positive: {(df_financial_only['sentiment'] == 'Positive').sum()} ({(df_financial_only['sentiment'] == 'Positive').sum() / len(df_financial_only) * 100:.1f}%)")
print(f"   Negative: {(df_financial_only['sentiment'] == 'Negative').sum()} ({(df_financial_only['sentiment'] == 'Negative').sum() / len(df_financial_only) * 100:.1f}%)")
print(f"   Confiance sentiment moyenne: {df_financial_only['confidence'].mean():.2%}")
print(f"   Confiance classification moyenne: {df_financial_only['financial_confidence'].mean():.2%}")

# Export du dataset filtr√©
output_filtered = f"financial_news_only_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
df_financial_only.to_csv(output_filtered, index=False)
print(f"\n‚úÖ Dataset filtr√© sauvegard√©: {output_filtered}")