In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time
import csv
from datetime import datetime
import pandas as pd
import json
import logging
from pathlib import Path
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [2]:
# Listing page fetched first to obtain the initial batch of interview articles.
BASE_URL = "https://ukraina.ru/interview/"
# Endpoint queried with `id` and `date` parameters to load further list items.
LOAD_MORE_URL = "https://ukraina.ru/services/interview/more.html"

def parse_articles(soup):
    """Extract article metadata from a parsed listing page or fragment.

    Args:
        soup: Parsed HTML exposing ``select`` (e.g. a BeautifulSoup object);
            each ``div.list-item`` element is treated as one article entry.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``author`` and
        ``date_str``. ``author`` and ``date_str`` are empty strings when the
        corresponding element is absent. Entries without a title anchor, or
        whose anchor carries no ``href``, are skipped.
    """
    # Local import: keeps this function self-contained without touching the
    # notebook's import cell.
    from urllib.parse import urljoin

    site_root = "https://ukraina.ru/"

    articles = []
    for item in soup.select('div.list-item'):
        title_tag = item.select_one('a.list-item__title')
        if not title_tag:
            continue

        # An anchor without href cannot be followed later, so skip it instead
        # of raising KeyError and aborting the whole page.
        href = (title_tag.get('href') or '').strip()
        if not href:
            continue

        author_tag = item.select_one('a.list-item__author b.list-item__author-title')
        date_tag = item.select_one('div.list-item__date')

        articles.append({
            'title': title_tag.text.strip(),
            # urljoin resolves site-relative paths and leaves absolute URLs
            # untouched, unlike plain string concatenation with the host.
            'link': urljoin(site_root, href),
            'author': author_tag.text.strip() if author_tag else '',
            'date_str': date_tag.text.strip() if date_tag else '',
        })

    return articles

def get_last_article_id_date(articles):
    """Derive the pagination cursor from the last article of a batch.

    The link of the final entry is searched for an 8-digit path segment
    followed by a file name ending in digits and ``.html`` (optionally
    preceded by a hyphenated slug).

    Args:
        articles: Sequence of article dicts, each holding a ``link`` key.

    Returns:
        ``(id_str, date_str)`` as strings on success, ``(None, None)`` when
        the sequence is empty or the link does not match the pattern.
    """
    if articles:
        url = articles[-1]['link']
        print("Extraction id/date du lien :", url)

        match = re.search(r'/(\d{8})/(?:[^/]+-)?(\d+)\.html', url)
        if match is None:
            print("Échec extraction id/date pour ce lien.")
            return None, None

        # Group 1 is the date segment, group 2 the numeric article id.
        date_str, id_str = match.groups()
        print(f"Extrait id={id_str}, date={date_str}")
        return id_str, date_str

    print("Liste d'articles vide dans get_last_article_id_date.")
    return None, None

def save_csv(filename, articles):
    """Write the article metadata to a CSV file, replacing any existing file.

    The file is encoded as UTF-8 with a BOM (``utf-8-sig``) and has the
    columns ``title``, ``link``, ``author`` and ``date_str``.

    Args:
        filename: Destination path of the CSV file.
        articles: List of dicts containing exactly the columns listed above.
    """
    columns = ["title", "link", "author", "date_str"]
    with open(filename, "w", encoding="utf-8-sig", newline="") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        out.writerows(articles)
    print(f"Articles sauvegardés dans {filename}, total : {len(articles)}")


def main():
    """Scrape the interview listing and save the article metadata to CSV.

    The first page is loaded from ``BASE_URL``; further batches are requested
    from ``LOAD_MORE_URL`` using the id/date extracted from the last article
    link as cursor. Paging stops when the endpoint returns an empty body, no
    parsable articles, only already-seen links, a link the cursor cannot be
    extracted from, or after several consecutive network failures.

    The collected articles are written to ``articles_interview.csv`` each
    time the total crosses a multiple of 100 and once more when paging ends,
    so the file always reflects everything that was collected.
    """
    output_file = "articles_interview.csv"
    # Upper bound on back-to-back failed "load more" requests; without it a
    # permanent error (e.g. HTTP 404) would be retried forever.
    max_consecutive_failures = 5

    # The context manager closes pooled connections when scraping ends.
    with requests.Session() as session:
        retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
        adapter = HTTPAdapter(max_retries=retries)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        headers = {
            "User-Agent": "Mozilla/5.0",
            "x-requested-with": "XMLHttpRequest",
        }

        print("Chargement initial...")
        try:
            # Timeout and status check: never hang indefinitely and never
            # parse an error page as if it were the listing.
            r = session.get(BASE_URL, headers=headers, timeout=10)
            r.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Erreur réseau lors du chargement initial : {e}, arrêt.")
            return

        soup = BeautifulSoup(r.text, 'html.parser')
        all_articles = parse_articles(soup)
        print(f"Articles initiaux récupérés : {len(all_articles)}")

        if all_articles:
            print("Dernier lien article initial :", all_articles[-1]['link'])

        id_param, date_param = get_last_article_id_date(all_articles)
        if not id_param or not date_param:
            print("Impossible d'extraire id/date du dernier article initial, arrêt.")
            return

        params = {"id": id_param, "date": date_param}

        # Maintained incrementally instead of being rebuilt from the whole
        # result list on every page.
        links_seen = {a['link'] for a in all_articles}
        consecutive_failures = 0

        while True:
            print(f"Chargement articles suivants avec id={params['id']} date={params['date']} ...")
            try:
                resp = session.get(LOAD_MORE_URL, headers=headers, params=params, timeout=10)
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures:
                    print(f"Erreur réseau : {e}, abandon après {consecutive_failures} échecs consécutifs.")
                    break
                print(f"Erreur réseau : {e}, tentative de nouvelle requête après pause...")
                time.sleep(5)
                continue
            consecutive_failures = 0

            if not resp.text.strip():
                print("Plus d'articles trouvés via l'API ou réponse vide, fin.")
                break

            soup_next = BeautifulSoup(resp.text, 'html.parser')
            new_articles = parse_articles(soup_next)
            if not new_articles:
                print("Aucun nouvel article récupéré, fin.")
                break

            # Keep only links not collected so far (also drops duplicates
            # that appear twice within the same batch).
            new_articles_filtered = []
            for art in new_articles:
                if art['link'] not in links_seen:
                    links_seen.add(art['link'])
                    new_articles_filtered.append(art)
            if not new_articles_filtered:
                print("Tous les articles suivants sont déjà récupérés, fin.")
                break

            all_articles.extend(new_articles_filtered)
            print(f"Total articles récupérés : {len(all_articles)}")

            # True exactly when this batch made the total cross a multiple
            # of 100: periodic checkpoint of the results.
            if len(all_articles) % 100 < len(new_articles_filtered):
                save_csv(output_file, all_articles)

            new_id, new_date = get_last_article_id_date(new_articles_filtered)
            if not new_id or not new_date:
                print("Impossible d'extraire id/date du dernier article reçu, fin.")
                break

            params['id'] = new_id
            params['date'] = new_date

            # Be polite to the server between page requests.
            time.sleep(1)

    # Final write: previously the closing message claimed the file was saved
    # although articles collected after the last checkpoint were never
    # written. save_csv prints the confirmation itself.
    save_csv(output_file, all_articles)

# Entry-point guard: start the listing scraper only when this code runs as the
# top-level program, not when it is imported as a module.
if __name__ == "__main__":
    main()

Chargement initial...
Articles initiaux récupérés : 10
Dernier lien article initial : https://ukraina.ru/20250527/vladimir-orlov-gidre-kotoraya-bet-bespilotnikami-po-rossii-pora-rubit-golovy-a-ne-schupaltsa-1063446375.html
Extraction id/date du lien : https://ukraina.ru/20250527/vladimir-orlov-gidre-kotoraya-bet-bespilotnikami-po-rossii-pora-rubit-golovy-a-ne-schupaltsa-1063446375.html
Extrait id=1063446375, date=20250527
Chargement articles suivants avec id=1063446375 date=20250527 ...
Total articles récupérés : 18
Extraction id/date du lien : https://ukraina.ru/20250525/radiovolny-vmesto-ratsii-kak-vengerskaya-grafinya-olga-batogova-smenila-razvedku-na-inoveschanie-1062771726.html
Extrait id=1062771726, date=20250525
Chargement articles suivants avec id=1062771726 date=20250525 ...
Total articles récupérés : 27
Extraction id/date du lien : https://ukraina.ru/20250523/ruslan-pankratov-kak-tolko-rossiya-razberetsya-s-ukrainoy-ona-zaymetsya-pribaltikoy-1063346519.html
Extrait id=1063346

In [3]:
# Output file holding the scraped article contents (list of JSON objects).
JSON_FILE = "articles_content_interview.json"
# Log file receiving the scraper's info/warning/error records.
LOG_FILE = "scraper_content_interview.log"
# Progress is written to JSON_FILE every BATCH_SIZE-th processed article.
BATCH_SIZE = 50

# Route log records of level INFO and above to LOG_FILE, prefixed with
# timestamp and level name.
logging.basicConfig(filename=LOG_FILE, level=logging.INFO,
                    format='%(asctime)s %(levelname)s: %(message)s')

# HTTP headers sent with every article request.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "x-requested-with": "XMLHttpRequest",
}

def fetch_article_content(url, session):
    """Download one article page and extract its textual content.

    Args:
        url: Address of the article page.
        session: Object providing ``get`` (e.g. a ``requests.Session``) used
            to perform the HTTP request.

    Returns:
        A dict with the keys ``url``, ``abstract`` (empty string when the
        announce element is missing), ``full_text`` (text blocks joined by
        blank lines) and ``tags`` (list of strings); ``None`` when any
        exception occurs, in which case the error is logged.
    """
    try:
        response = session.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        announce = page.select_one('div.article__announce-text')
        paragraphs = page.select('div.article__block[data-type="text"] > div.article__text')
        tag_links = page.select('div.article__tags a.article__tags-item')

        return {
            "url": url,
            "abstract": announce.text.strip() if announce else '',
            "full_text": "\n\n".join(paragraph.text.strip() for paragraph in paragraphs),
            "tags": [link.text.strip() for link in tag_links],
        }
    except Exception as err:
        # Best-effort by design: a single failing article must not stop the run.
        logging.error(f"Erreur fetch_article_content {url}: {err}")
        return None

def load_existing_data(json_file):
    """Load previously saved data from a JSON file.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty list when the file does not
        exist.
    """
    source = Path(json_file)
    if not source.exists():
        return []
    with source.open('r', encoding='utf-8') as handle:
        return json.load(handle)

def save_data(json_file, data):
    """Serialise ``data`` to ``json_file`` without risking the existing file.

    The JSON is first written to a sibling ``<json_file>.tmp`` file and then
    moved over the target with ``os.replace``. An interruption or a
    serialisation error therefore leaves the previously saved file intact
    instead of a truncated one.

    Args:
        json_file: Destination path of the JSON file.
        data: JSON-serialisable object; ``len(data)`` is reported in the log.

    Raises:
        TypeError, ValueError, OSError: Propagated from serialisation or file
            handling; the temporary file is removed before re-raising.
    """
    # Local import: the notebook's import cell does not provide ``os``.
    import os

    tmp_file = f"{json_file}.tmp"
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_file, json_file)
    except BaseException:
        # Do not leave a half-written temporary file behind.
        try:
            os.remove(tmp_file)
        except OSError:
            pass
        raise
    logging.info(f"Données sauvegardées dans {json_file}, total articles: {len(data)}")

# --- Main scraping function ---
def scrape_articles_content_from_csv(csv_file):
    """Fetch the content of every article listed in ``csv_file``.

    Articles whose URL is already present in ``JSON_FILE`` are skipped, so an
    interrupted run can be resumed. Results are written to ``JSON_FILE``
    after every ``BATCH_SIZE`` newly fetched articles and once more when the
    loop ends (including on interruption), so no fetched article is lost.

    Args:
        csv_file: Path of a CSV file with at least the columns ``title`` and
            ``link``; ``author`` and ``date_str`` are used when present.
    """
    # dtype=str / keep_default_na=False: empty cells stay empty strings.
    # With the defaults they become NaN, which json.dump emits as the
    # non-standard token ``NaN``.
    df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

    all_articles = load_existing_data(JSON_FILE)
    existing_urls = {a['url'] for a in all_articles}

    print(f"Articles déjà scrappés : {len(existing_urls)}")

    articles_to_fetch = [row for row in df.to_dict('records')
                         if row['link'] not in existing_urls]

    print(f"Articles à scraper : {len(articles_to_fetch)}")

    # Number of fetched articles not yet written to JSON_FILE.
    unsaved = 0
    try:
        # The context manager closes pooled connections when scraping ends.
        with requests.Session() as session:
            for idx, row in enumerate(articles_to_fetch, 1):
                url = row['link']
                print(f"Scraping article {idx}/{len(articles_to_fetch)} : {url}")

                content = fetch_article_content(url, session)
                if content:
                    all_articles.append({
                        "title": row['title'],
                        "link": row['link'],
                        "author": row.get('author', ''),
                        "date_str": row.get('date_str', ''),
                        **content
                    })
                    unsaved += 1

                    if unsaved >= BATCH_SIZE:
                        save_data(JSON_FILE, all_articles)
                        unsaved = 0
                else:
                    logging.warning(f"Contenu non récupéré pour {url}")

                # Pause after every request, failed ones included, so a
                # struggling server is not hit in a tight loop.
                if idx < len(articles_to_fetch):
                    time.sleep(1)
    finally:
        # Flush the trailing partial batch. Previously the final save was
        # tied to the last article succeeding, so a failure there dropped
        # everything fetched since the last batch boundary.
        if unsaved:
            save_data(JSON_FILE, all_articles)

    print("Scraping terminé.")

# Fetch the content of the articles listed in the CSV written by main().
scrape_articles_content_from_csv("articles_interview.csv")


Articles déjà scrappés : 0
Articles à scraper : 1603
Scraping article 1/1603 : https://ukraina.ru/20250529/1063490016.html
Scraping article 2/1603 : https://ukraina.ru/20250529/sdelka-s-trampom-i-usluga-turtsii-dmitriy-vydrin-o-tom-kak-rf-zakonchit-voynu-na-ukraine-za-paru-1063477232.html
Scraping article 3/1603 : https://ukraina.ru/20250529/1063466844.html
Scraping article 4/1603 : https://ukraina.ru/20250529/maykl-bom-tramp-ne-verit-v-uspeshnost-peregovorov-rf-i-ukrainy-no-zolotaya-likhoradka-ne-daet-emu-1063477629.html
Scraping article 5/1603 : https://ukraina.ru/20250528/timofey-borisov-merts-yulit-kogda-rech-idet-o-taurusakh-no-vsu-budut-poluchat-drugoe-nemetskoe-oruzhie-1063474166.html
Scraping article 6/1603 : https://ukraina.ru/20250528/pavel-leonov-o-tom-chto-voennoe-delo---iskusstvo-poetomu-nasha-zadacha-peredumat-protivnika-1063458482.html
Scraping article 7/1603 : https://ukraina.ru/20250528/1063448125.html
Scraping article 8/1603 : https://ukraina.ru/20250528/larisa-shesle