In [1]:
from datetime import datetime
from urllib.parse import urljoin

import newspaper
import pandas as pd
import requests
from bs4 import BeautifulSoup
from newspaper import Article

In [2]:
# Extract the details of a single article.
def extract_article_info(article_url, site_name, keyword="agriculture"):
    """Download one article and return its details if its text mentions ``keyword``.

    Args:
        article_url: URL of the article to download and parse.
        site_name: Label stored under the ``"site_url"`` key of the result.
        keyword: Case-insensitive substring the article text must contain.
            Defaults to ``"agriculture"``.

    Returns:
        A dict with the keys ``site_url``, ``title``, ``content``, ``author``,
        ``date``, ``link`` and ``image_url``, or ``None`` when the text does
        not contain ``keyword`` or when downloading/parsing raised.
        ``date`` is always a ``YYYY-MM-DD`` string.
    """
    article = Article(article_url)
    try:
        article.download()
        article.parse()

        content = article.text or ""
        # Only keep articles whose body mentions the keyword.
        if keyword.lower() not in content.lower():
            return None

        # Normalise the date to a plain string. The original stored either a
        # datetime object or a formatted string, which gave a mixed-type
        # column; timezone-aware datetimes additionally cannot be written to
        # Excel by pandas. Today's date remains the fallback when the
        # publication date is unknown.
        publish_date = article.publish_date or datetime.now()
        if hasattr(publish_date, "strftime"):
            date = publish_date.strftime("%Y-%m-%d")
        else:
            date = str(publish_date)

        return {
            "site_url": site_name,
            "title": article.title,
            "content": content,
            "author": article.authors[0] if article.authors else "Inconnu",
            "date": date,
            "link": article_url,
            "image_url": article.top_image or "",
        }
    except Exception as e:
        # Best effort: one broken article must not stop the whole crawl.
        print(f"Erreur lors de l'extraction de l'article : {e}")

    return None

In [3]:
# Collect the article links found on a site's page.
def get_article_links(site_url, timeout=10):
    """Return the article URLs listed on ``site_url``.

    Only sites with a known selector are supported: the site is recognised by
    a substring of ``site_url`` ("senego.com" or "dakaractu.com"). Any other
    URL yields an empty list.

    Args:
        site_url: URL of the page to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of absolute article URLs, without duplicates, in page order.
        An empty list is returned when the site is not supported or when the
        request or parsing fails (the error is printed).
    """
    # (substring identifying the site, container tag name, container CSS class)
    site_selectors = (
        ("senego.com", "article", "main-post"),
        ("dakaractu.com", "h3", "entry-title"),
    )
    try:
        # A timeout keeps an unresponsive server from blocking the crawl.
        response = requests.get(site_url, timeout=timeout)
        # Do not parse HTTP error pages as if they were article listings.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        links = []
        for domain, tag_name, css_class in site_selectors:
            if domain not in site_url:
                continue
            for container in soup.find_all(tag_name, class_=css_class):
                # Skip containers without a usable anchor instead of raising,
                # which would discard every link already collected.
                anchor = container.find("a", href=True)
                if anchor is None:
                    continue
                # Resolve relative hrefs against the page URL.
                link = urljoin(site_url, anchor["href"])
                if link not in links:
                    links.append(link)
            break  # only the first matching site applies

        return links
    except Exception as e:
        print(f"Erreur lors de l'obtention des liens d'articles pour {site_url}: {e}")
        return []




In [4]:
# Main crawling entry point.
def crawl_articles(sites, output_path="agriculture_articles.xlsx"):
    """Crawl every site in ``sites`` and save the matching articles to Excel.

    For each site, the article links are obtained with ``get_article_links``
    and each link is passed to ``extract_article_info``; every non-empty
    result becomes one row of the spreadsheet.

    Args:
        sites: Mapping of site display name to the URL of the page to scan.
        output_path: Path of the Excel file to write. Defaults to
            ``"agriculture_articles.xlsx"``.

    Returns:
        None. The result is written to ``output_path``.
    """
    # Fixed column order so the workbook has a header row even when no
    # article matched.
    columns = ["site_url", "title", "content", "author", "date", "link", "image_url"]

    all_articles = []
    seen_urls = set()
    for site_name, site_url in sites.items():
        print(f"Scraping {site_name}...")
        for article_url in get_article_links(site_url):
            # Avoid downloading and storing the same article twice.
            if article_url in seen_urls:
                continue
            seen_urls.add(article_url)

            article_info = extract_article_info(article_url, site_name)
            if article_info:
                all_articles.append(article_info)

    if not all_articles:
        # Make an empty result visible instead of silently writing a blank file.
        print("Aucun article correspondant trouvé ; le fichier ne contiendra que les en-têtes.")

    # Save the articles to an Excel file.
    df = pd.DataFrame(all_articles, columns=columns)
    df.to_excel(output_path, index=False)
    print(f"Articles enregistrés dans '{output_path}'.")

# Sites to crawl, as (display name, home page URL) pairs.
# Add further sites here if needed.
sites = dict(
    [
        ("Senego", "https://www.senego.com/"),
        ("Dakar Actu", "https://www.dakaractu.com/"),
    ]
)

# Run the crawler.
crawl_articles(sites)

Scraping Senego...
Scraping Dakar Actu...
Articles enregistrés dans 'agriculture_articles.xlsx'.
