# In [None]:
"""
Pipeline minimal pour le projet "Santé et Innovation" (OMS + Forbes Afrique).
- Récupère des articles (exemples d'URLs ou via newspaper3k)
- Pré-traitement (lower, punctuation, stopwords, tokenization, lemmatize optionnel)
- TF-IDF vectorization
- Similarité Cosine entre sources
- Sentiment (optionnel / basic)
- Clustering simple (KMeans) ou topic modeling (NMF)
"""

from newspaper import Article
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import unicodedata
from unidecode import unidecode
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF
from tqdm import tqdm

# --- CONFIG ---
# Seed list of example URLs: fill in / extend with real WHO and Forbes Africa article pages.
URLS = [
    # placeholders (replace with the real URLs of WHO and Forbes Africa articles)
    "https://www.who.int/news/item/2025-01-10-example-article",   # replace
    "https://www.forbesafrica.com/technology/2025/01/05/example-article"  # replace
]

LANG = "fr"   # "fr" when most of the content is French, otherwise "en" (automatic detection is not implemented here)
# Requests spaCy lemmatisation; reset to False further down if the spaCy model cannot be loaded.
USE_SPACY_LEMMA = True

# --- Language resources initialisation ---
# NOTE: these downloads run at import time and need network access on first use.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# request both so word_tokenize works whichever NLTK version is installed.
nltk.download('punkt_tab')
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('french' if LANG.startswith('fr') else 'english'))
# normalize_text() strips accents before stop-word filtering, so accented entries
# ("été", "même", ...) would never match the normalised tokens; add their
# ASCII forms while keeping the original spellings.
STOPWORDS |= {unidecode(word) for word in STOPWORDS}

# spaCy pipeline used for the optional lemmatisation. Always defined so later
# code can test `nlp is not None`; stays None when lemmatisation is disabled
# or the model is not installed.
nlp = None
if USE_SPACY_LEMMA:
    try:
        nlp = spacy.load("fr_core_news_sm" if LANG.startswith('fr') else "en_core_web_sm")
    except Exception:
        # Deliberately best-effort: any loading problem just disables lemmatisation.
        print("spaCy model non trouvé. Désactiver la lemmatisation ou installer le modèle spaCy.")
        USE_SPACY_LEMMA = False
        nlp = None

# --- Fonctions utilitaires ---
def fetch_article(url):
    """
    Fetch one article and return its URL, title and body text.

    newspaper3k is tried first; if it raises, a plain requests + BeautifulSoup
    extraction is used instead. This function never raises: when both
    strategies fail, the error is printed and empty title/text are returned.

    Args:
        url: Address of the article page.

    Returns:
        dict with the keys "url", "title" and "text".
    """
    try:
        art = Article(url, language=LANG)
        art.download()
        art.parse()
        return {"url": url, "title": art.title, "text": art.text}
    except Exception:
        # Best-effort scraping: whatever newspaper3k failed on, try the manual fallback.
        try:
            res = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
            # Treat HTTP errors (404, 500, ...) as failures instead of parsing
            # the error page as if it were the article.
            res.raise_for_status()
            # Hand over raw bytes so BeautifulSoup detects the charset itself;
            # res.text falls back to ISO-8859-1 when the server omits the
            # charset, which garbles accented characters.
            soup = BeautifulSoup(res.content, "html.parser")
            # Heuristic: prefer paragraphs inside <article>, otherwise (or if
            # that tag holds no <p>) use every <p> of the page.
            article_tag = soup.find("article")
            paragraphs = article_tag.find_all("p") if article_tag else []
            if not paragraphs:
                paragraphs = soup.find_all("p")
            txt = " ".join(p.get_text(separator=" ", strip=True) for p in paragraphs)
            # get_text() always yields a string, whereas .string is None when
            # <title> is empty or contains nested tags.
            title = soup.title.get_text(strip=True) if soup.title else ""
            return {"url": url, "title": title, "text": txt}
        except Exception as e:
            print(f"Erreur fetch {url} : {e}")
            return {"url": url, "title": "", "text": ""}

def normalize_text(text):
    """
    Reduce a text to lowercase ASCII letters separated by single spaces.

    Steps: lowercase, transliterate to ASCII (drops accents), replace every
    character that is not a-z or whitespace by a space, then collapse runs
    of whitespace and trim both ends.

    Returns "" for empty or None input.
    """
    if not text:
        return ""
    ascii_lowered = unidecode(text.lower())
    # Punctuation and digits become spaces so that neighbouring words stay separated.
    letters_only = re.sub(r'[^a-z\s]', ' ', ascii_lowered)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()

def tokenize_and_clean(text, stopwords_set=STOPWORDS, lemmatize=False):
    """
    Tokenize a text, drop noise tokens and stop words, optionally lemmatize.

    Args:
        text: Text to tokenize (the pipeline passes normalize_text output).
        stopwords_set: Tokens to discard; defaults to the module STOPWORDS.
        lemmatize: When true, and a spaCy pipeline is loaded, replace each
            remaining token by its spaCy lemma.

    Returns:
        The kept tokens joined by single spaces ("" for empty input).
    """
    if not text:
        return ""
    # Keep purely alphabetic tokens longer than one character that are not stop words.
    kept = [
        tok
        for tok in word_tokenize(text)
        if tok.isalpha() and len(tok) > 1 and tok not in stopwords_set
    ]
    if not (lemmatize and USE_SPACY_LEMMA and nlp is not None):
        return " ".join(kept)
    return " ".join(tok.lemma_ for tok in nlp(" ".join(kept)))

# --- Pipeline principal ---
def build_corpus(urls):
    """
    Download every URL and gather the results in a DataFrame.

    Args:
        urls: Iterable of article URLs.

    Returns:
        DataFrame with one row per URL and the columns "url", "title" and
        "text" (the keys returned by fetch_article).
    """
    rows = [fetch_article(u) for u in tqdm(urls, desc="Téléchargement articles")]
    # Name the columns explicitly so an empty URL list still yields a frame
    # with the expected schema instead of a column-less one.
    return pd.DataFrame(rows, columns=["url", "title", "text"])

def preprocess_dataframe(df):
    """
    Add the text pre-processing columns to *df* and return it.

    The frame is modified in place. Columns written:
      - "text_raw":   "text" with missing values replaced by "".
      - "text_norm":  normalize_text applied to "text_raw".
      - "text_clean": tokenize_and_clean applied to "text_norm", using the
                      module STOPWORDS and the USE_SPACY_LEMMA switch.
    """
    def _clean(normalized):
        return tokenize_and_clean(normalized, STOPWORDS, lemmatize=USE_SPACY_LEMMA)

    raw_text = df["text"].fillna("")
    df["text_raw"] = raw_text
    normalized_text = df["text_raw"].apply(normalize_text)
    df["text_norm"] = normalized_text
    df["text_clean"] = df["text_norm"].apply(_clean)
    return df

def vectorize_tfidf(corpus, max_features=5000, ngram_range=(1,2)):
    """
    Fit a TF-IDF vectorizer on *corpus* and transform it.

    Args:
        corpus: Iterable of documents (strings).
        max_features: Passed to TfidfVectorizer as max_features.
        ngram_range: Passed to TfidfVectorizer as ngram_range.

    Returns:
        Tuple (matrix, vectorizer): the result of fit_transform and the
        fitted TfidfVectorizer.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return tfidf_matrix, vectorizer

def compute_similarity(X):
    """Return the pairwise cosine-similarity matrix between the rows of X."""
    pairwise_scores = cosine_similarity(X)
    return pairwise_scores

def sentiment_basic(text):
    """
    Score a text with a tiny hand-made sentiment lexicon.

    The text is split on whitespace and each distinct token is counted once:
    the score is the number of distinct positive words present minus the
    number of distinct negative words present.

    This is only a rough heuristic; for serious use, prefer a multilingual
    or French transformer model.
    """
    # Minimal example lexicon.
    # NOTE(review): "amelior" looks like a stem, but matching is exact, so it
    # only matches that literal token - confirm the intent.
    pos_words = {"bon","positif","amelior","utile","important","prometteur"}
    neg_words = {"risque","mort","critique","probleme","negatif","alerte","grave"}
    distinct_tokens = set(text.split())
    positives = len(distinct_tokens & pos_words)
    negatives = len(distinct_tokens & neg_words)
    return positives - negatives

# --- Exécution exemple ---
if __name__ == "__main__":
    # Example run of the whole pipeline: fetch, pre-process, vectorize,
    # compare sources, cluster, extract topics and save the result.

    # 1) Build the corpus
    df = build_corpus(URLS)
    print(f"{len(df)} articles récupérés.")
    if df.empty:
        # No URL configured: stop before the column lookups below fail.
        raise SystemExit("Aucun article à traiter.")

    # 2) Pre-processing
    df = preprocess_dataframe(df)
    df["sentiment_score"] = df["text_clean"].apply(sentiment_basic)

    if not (df["text_clean"].str.len() > 0).any():
        # Every download failed or produced no usable token: TF-IDF would
        # raise "empty vocabulary", so stop with a clear message instead.
        raise SystemExit("Aucun texte exploitable : analyse TF-IDF impossible.")

    # 3) TF-IDF
    X, tfidf = vectorize_tfidf(df["text_clean"].fillna(""), max_features=3000, ngram_range=(1,2))
    sim = compute_similarity(X)

    # 4) Mean WHO-vs-Forbes similarity.
    # This step only runs when the frame carries a 'source' column holding
    # 'OMS' / 'Forbes' labels, e.g. df['source'] = ['OMS', 'Forbes', ...]
    if 'source' in df.columns:
        import numpy as np
        sources = df['source'].values
        mask_oms = (sources == 'OMS')
        mask_forbes = (sources == 'Forbes')
        if mask_oms.sum() and mask_forbes.sum():
            # Rows = WHO articles, columns = Forbes articles.
            sub = sim[np.ix_(mask_oms, mask_forbes)]
            print("Similarité moyenne OMS<->Forbes:", sub.mean())

    # 5) Clustering (KMeans) on the TF-IDF matrix.
    # KMeans refuses n_clusters > n_samples, so cap k at the number of
    # documents (the two example URLs would otherwise crash with k = 5).
    k = min(5, X.shape[0])
    kmeans = KMeans(n_clusters=k, random_state=42)
    df["cluster"] = kmeans.fit_predict(X)

    # 6) Topic modelling with NMF (optional)
    nmf_k = 6
    nmf = NMF(n_components=nmf_k, random_state=42)
    W = nmf.fit_transform(X)
    H = nmf.components_
    # Top terms per topic
    feature_names = tfidf.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(H):
        # Indices of the 10 highest-weighted terms, in descending order.
        top_features_ind = topic.argsort()[:-11:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        topics.append(top_features)
        print(f"Topic {topic_idx} : {', '.join(top_features)}")

    # 7) Save
    df.to_csv("articles_processed.csv", index=False)
    print("Pipeline terminé. Fichier articles_processed.csv créé.")
