In [1]:
import csv
import json
import logging
import re
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [2]:
# Landing page of the "opinion" section; main() fetches it once to seed the pagination.
BASE_URL = "https://ukraina.ru/opinion/"
# Endpoint that main() queries repeatedly with `id`/`date` parameters to load further articles.
LOAD_MORE_URL = "https://ukraina.ru/services/opinion/more.html"

def parse_articles(soup, base_url="https://ukraina.ru"):
    """Extract article metadata from a parsed listing page or "load more" fragment.

    Parameters
    ----------
    soup : object
        Parsed HTML exposing ``select()`` (e.g. a ``BeautifulSoup`` document).
    base_url : str, optional
        URL against which relative ``href`` values are resolved.

    Returns
    -------
    list of dict
        One dict per ``div.list-item`` that has a title link with a target,
        with keys ``title``, ``link`` (absolute URL), ``author`` and
        ``date_str``. ``author`` and ``date_str`` are ``''`` when the
        corresponding element is absent.
    """
    articles = []
    for item in soup.select('div.list-item'):
        title_tag = item.select_one('a.list-item__title')
        if title_tag is None:
            continue

        # An anchor without a usable href cannot be followed later, so skip it
        # instead of crashing the whole page with a KeyError.
        href = (title_tag.get('href') or '').strip()
        if not href:
            continue

        author_tag = item.select_one('a.list-item__author b.list-item__author-title')
        date_tag = item.select_one('div.list-item__date')

        articles.append({
            'title': title_tag.text.strip(),
            # urljoin leaves absolute URLs untouched and resolves relative ones,
            # whereas plain concatenation would produce a doubled or broken host.
            'link': urljoin(base_url, href),
            'author': author_tag.text.strip() if author_tag else '',
            'date_str': date_tag.text.strip() if date_tag else '',
        })

    return articles

def get_last_article_id_date(articles):
    """Return the ``(id, date)`` pair encoded in the last article's link.

    The link is searched for ``/<8 digits>/<optional-slug->"<digits>.html"``;
    the 8-digit path segment is the date and the trailing digits are the id.

    Parameters
    ----------
    articles : list of dict
        Articles carrying a ``'link'`` key; only the last one is inspected.

    Returns
    -------
    tuple
        ``(id_str, date_str)`` as strings, or ``(None, None)`` when the list
        is empty or the link does not match the expected pattern.
    """
    if not articles:
        print("Liste d'articles vide dans get_last_article_id_date.")
        return None, None

    url = articles[-1]['link']
    print("Extraction id/date du lien :", url)

    match = re.search(r'/(\d{8})/(?:[^/]+-)?(\d+)\.html', url)
    if match is None:
        print("Échec extraction id/date pour ce lien.")
        return None, None

    # Group 1 is the date segment, group 2 the numeric id; callers expect (id, date).
    date_part, id_part = match.groups()
    print(f"Extrait id={id_part}, date={date_part}")
    return id_part, date_part

def save_csv(filename, articles):
    """Write the article dicts to ``filename`` as CSV, replacing any existing file.

    The file is encoded as UTF-8 with a BOM and has the columns
    ``title``, ``link``, ``author``, ``date_str``. A summary line is printed
    once the file has been written.
    """
    columns = ["title", "link", "author", "date_str"]
    with open(filename, "w", encoding="utf-8-sig", newline="") as out:
        sink = csv.DictWriter(out, fieldnames=columns)
        sink.writeheader()
        sink.writerows(articles)
    print(f"Articles sauvegardés dans {filename}, total : {len(articles)}")


def main(output_file="articles_opinion.csv", max_consecutive_errors=5, checkpoint_every=100):
    """Collect the article listing of the opinion section and save it as CSV.

    The landing page (``BASE_URL``) is fetched once, then ``LOAD_MORE_URL`` is
    queried repeatedly with the id/date of the last article received until the
    endpoint returns nothing new. The accumulated list is written with
    ``save_csv`` at checkpoints and once more when pagination ends.

    Parameters
    ----------
    output_file : str, optional
        Path of the CSV file to write.
    max_consecutive_errors : int, optional
        Number of consecutive failed "load more" requests after which
        pagination is abandoned (what was collected so far is still saved).
    checkpoint_every : int, optional
        A checkpoint save is made each time the total number of articles
        crosses a multiple of this value.

    Raises
    ------
    requests.exceptions.RequestException
        If the initial page cannot be fetched.
    """
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    headers = {
        "User-Agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest",
    }

    print("Chargement initial...")
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status surfaces an HTTP error instead of parsing an error page.
    r = session.get(BASE_URL, headers=headers, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    all_articles = parse_articles(soup)
    print(f"Articles initiaux récupérés : {len(all_articles)}")

    if all_articles:
        print("Dernier lien article initial :", all_articles[-1]['link'])

    id_param, date_param = get_last_article_id_date(all_articles)
    if not id_param or not date_param:
        print("Impossible d'extraire id/date du dernier article initial, arrêt.")
        return

    params = {"id": id_param, "date": date_param}

    # Maintained incrementally so each page costs O(page size), not O(total).
    links_seen = {a['link'] for a in all_articles}
    consecutive_errors = 0

    while True:
        print(f"Chargement articles suivants avec id={params['id']} date={params['date']} ...")
        try:
            resp = session.get(LOAD_MORE_URL, headers=headers, params=params, timeout=10)
            resp.raise_for_status()
        except requests.exceptions.RequestException as e:
            consecutive_errors += 1
            if consecutive_errors >= max_consecutive_errors:
                # Give up instead of looping forever on a persistent failure.
                print(f"Erreur réseau : {e}, abandon après {consecutive_errors} échecs consécutifs.")
                break
            print(f"Erreur réseau : {e}, tentative de nouvelle requête après pause...")
            time.sleep(5)
            continue
        consecutive_errors = 0

        if not resp.text.strip():
            print("Plus d'articles trouvés via l'API ou réponse vide, fin.")
            break

        soup_next = BeautifulSoup(resp.text, 'html.parser')
        new_articles = parse_articles(soup_next)
        if not new_articles:
            print("Aucun nouvel article récupéré, fin.")
            break

        new_articles_filtered = []
        for article in new_articles:
            if article['link'] not in links_seen:
                links_seen.add(article['link'])
                new_articles_filtered.append(article)
        if not new_articles_filtered:
            print("Tous les articles suivants sont déjà récupérés, fin.")
            break

        all_articles.extend(new_articles_filtered)
        print(f"Total articles récupérés : {len(all_articles)}")

        # True exactly when this page made the total reach or cross a multiple
        # of `checkpoint_every`.
        if len(all_articles) % checkpoint_every < len(new_articles_filtered):
            save_csv(output_file, all_articles)

        new_id, new_date = get_last_article_id_date(new_articles_filtered)
        if not new_id or not new_date:
            print("Impossible d'extraire id/date du dernier article reçu, fin.")
            break

        params['id'] = new_id
        params['date'] = new_date

        time.sleep(1)

    # Final write: without it, everything fetched after the last checkpoint
    # would be lost even though a "saved" message was printed.
    save_csv(output_file, all_articles)

# Run the listing scraper when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Chargement initial...
Articles initiaux récupérés : 12
Dernier lien article initial : https://ukraina.ru/20250526/tramp-po-prezhnemu-vybiraet-voynu-mirnymi-rechami-nikogo-obmanut-emu-ne-udalos-1063417343.html
Extraction id/date du lien : https://ukraina.ru/20250526/tramp-po-prezhnemu-vybiraet-voynu-mirnymi-rechami-nikogo-obmanut-emu-ne-udalos-1063417343.html
Extrait id=1063417343, date=20250526
Chargement articles suivants avec id=1063417343 date=20250526 ...
Total articles récupérés : 23
Extraction id/date du lien : https://ukraina.ru/20250521/chto-meshaet-peregovoram-esch-raz-o-zaprete-i-legitimnosti-1063305685.html
Extrait id=1063305685, date=20250521
Chargement articles suivants avec id=1063305685 date=20250521 ...
Total articles récupérés : 33
Extraction id/date du lien : https://ukraina.ru/20250515/andrey-ganzha-on-zhil-v-drugom-mire-a-umer-v-etom-1063149751.html
Extrait id=1063149751, date=20250515
Chargement articles suivants avec id=1063149751 date=20250515 ...
Total articles 

In [2]:
# File in which the scraped article contents are accumulated (read and rewritten by the scraper below).
JSON_FILE = "articles_content_opinion.json"
# Destination of the log records emitted through the `logging` module.
LOG_FILE = "scraper_content_opinion.log"
# The JSON file is rewritten every BATCH_SIZE processed articles (see scrape_articles_content_from_csv).
BATCH_SIZE = 50

# Route INFO-and-above records to LOG_FILE with a timestamped format.
# NOTE: logging.basicConfig does nothing if the root logger already has handlers configured.
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s'
)

# HTTP headers sent with every article request made by fetch_article_content.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "x-requested-with": "XMLHttpRequest",
}

def fetch_article_content(url, session):
    """Download one article page and extract its abstract, body, tags and views.

    Parameters
    ----------
    url : str
        Address of the article page.
    session : requests.Session
        Session used to perform the GET request (sent with ``HEADERS``).

    Returns
    -------
    dict or None
        Dict with keys ``url``, ``abstract``, ``full_text``, ``tags`` and
        ``views``. ``views`` is an ``int`` or ``None`` when the counter is
        absent or not a plain integer. ``None`` is returned when the request
        or the parsing raises; the error is logged and not propagated.
    """
    try:
        resp = session.get(url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')

        abstract_tag = soup.select_one('div.article__announce-text')
        abstract = abstract_tag.text.strip() if abstract_tag else ''

        full_text_blocks = soup.select('div.article__block[data-type="text"] > div.article__text')
        full_text = "\n\n".join(block.text.strip() for block in full_text_blocks)

        tags = [tag.text.strip() for tag in soup.select('div.article__tags a.article__tags-item')]

        views = None
        views_tag = soup.select_one('span.statistic__item.m-views')
        if views_tag:
            # str.split() without arguments splits on every Unicode whitespace
            # character, so thousands separators such as U+00A0, U+202F or
            # U+2009 are all removed, not just NBSP and the ASCII space.
            views_text = "".join(views_tag.get_text().split())
            try:
                views = int(views_text)
            except ValueError:
                if views_text:
                    # Leave a trace so an unexpected counter format is visible in the log.
                    logging.warning(f"Unparseable views counter {views_text!r} for {url}")

        return {
            "url": url,
            "abstract": abstract,
            "full_text": full_text,
            "tags": tags,
            "views": views
        }
    except Exception as e:
        # Deliberate best-effort: one failing article must not stop the batch.
        logging.error(f"Error in fetch_article_content {url}: {e}")
        return None

def load_existing_data(json_file):
    """Return the parsed content of ``json_file``, or ``[]`` if it does not exist."""
    if not Path(json_file).exists():
        return []
    with open(json_file, 'r', encoding='utf-8') as handle:
        stored = json.load(handle)
    return stored

def save_data(json_file, data):
    """Serialise ``data`` to ``json_file`` as indented UTF-8 JSON.

    The content is first written to a sibling ``<name>.tmp`` file and then
    renamed over the target, so a failure or interruption during the dump
    leaves the previously saved file intact instead of truncated.

    Parameters
    ----------
    json_file : str or Path
        Destination file.
    data : list
        JSON-serialisable collection; its length is reported in the log.

    Raises
    ------
    TypeError, ValueError, OSError
        Propagated from ``json.dump`` or the file system; the temporary file
        is removed before re-raising.
    """
    target = Path(json_file)
    tmp_path = target.with_name(target.name + ".tmp")
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Rename over the destination only once the dump has fully succeeded.
        tmp_path.replace(target)
    except BaseException:
        # Also covers KeyboardInterrupt from an interrupted notebook cell.
        if tmp_path.exists():
            tmp_path.unlink()
        raise
    logging.info(f"Saved data to {json_file}, total articles: {len(data)}")

def scrape_articles_content_from_csv(csv_file):
    """Fetch the content of every article listed in ``csv_file`` into ``JSON_FILE``.

    Articles already present in ``JSON_FILE`` with a non-null ``views`` value
    are skipped; all others are (re)fetched with ``fetch_article_content``.
    Results are merged by URL and written with ``save_data`` every
    ``BATCH_SIZE`` processed articles and after the last one.

    Parameters
    ----------
    csv_file : str or Path
        CSV with at least the ``title`` and ``link`` columns; ``author`` and
        ``date_str`` are copied when present.
    """
    print("Loading CSV file...")
    # dtype=str + keep_default_na=False keeps empty cells as '' instead of NaN,
    # which json.dump would otherwise emit as the non-standard token `NaN`.
    df = (
        pd.read_csv(csv_file, dtype=str, keep_default_na=False)
        .drop_duplicates(subset="link")
        .reset_index(drop=True)
    )

    all_articles = load_existing_data(JSON_FILE)
    existing_articles = {a['url']: a for a in all_articles}

    print(f"Already scraped articles: {len(existing_articles)}")

    articles_to_fetch = []
    for _, row in df.iterrows():
        url = row['link']
        existing = existing_articles.get(url)

        # dict.get covers both "no views key" and "views is None".
        if existing is None or existing.get('views') is None:
            articles_to_fetch.append((url, row))

    total = len(articles_to_fetch)
    print(f"Articles to (re)scrape (missing views or not scraped): {total}")

    unsaved = 0
    # The context manager closes the session's connections when scraping ends.
    with requests.Session() as session:
        for idx, (url, row) in enumerate(articles_to_fetch, 1):
            print(f"Scraping article {idx}/{total}: {url}")
            content = fetch_article_content(url, session)
            if content:
                existing_articles[url] = {
                    "title": row['title'],
                    "link": row['link'],
                    "author": row.get('author', ''),
                    "date_str": row.get('date_str', ''),
                    **content
                }
                unsaved += 1
            else:
                logging.warning(f"Content not retrieved for {url}")

            # The save check runs whatever the outcome of this fetch, so a
            # failure on a batch boundary or on the last article cannot
            # leave earlier results unsaved.
            if unsaved and (idx % BATCH_SIZE == 0 or idx == total):
                save_data(JSON_FILE, list(existing_articles.values()))
                unsaved = 0

            # Pause between requests after failures too; nothing to wait for after the last one.
            if idx < total:
                time.sleep(1)

    print("Scraping finished.")

# Fetch the content of every article listed in the CSV written by the listing scraper.
scrape_articles_content_from_csv("articles_opinion.csv")

Loading CSV file...
Already scraped articles: 7099
Articles to (re)scrape (missing views or not scraped): 7100
Scraping article 1/7100: https://ukraina.ru/20250529/si-vis-pacem-para-bellum-ot-es-i-odkb-protivniki-gotovyatsya-k-bezopasnoy-skhvatke--1063495085.html
Scraping article 2/7100: https://ukraina.ru/20250529/den-nezavisimosti-gruzii-myslennoe-puteshestvie-v-chudesnyy-mir-diplomaticheskikh-nedogovornnostey-1063492260.html
Scraping article 3/7100: https://ukraina.ru/20250529/master-klass-mezhdu-raundami-peregovorov-1063480420.html
Scraping article 4/7100: https://ukraina.ru/20250528/lev-gotovitsya-k-brosku--papa-rimskiy-opyat-zamyslil-globalnyy-katolitsizm-dlya-vsego-mira-1063476634.html
Scraping article 5/7100: https://ukraina.ru/20250528/insayd-iz-ssha-kto-novyy-kurator-ukrainy-v-vashingtone-i-kto-budet-podpisyvat-peremirie-ot-ukrainy-1063474906.html
Scraping article 6/7100: https://ukraina.ru/20250528/shakhtery-i-roboty-kak-iz-razrukhi-i-razoreniya-izvlech-donbassu-pribyl-10634

In [3]:
# Load the scraped articles back into a DataFrame and preview the first rows.
data = pd.read_json(JSON_FILE, encoding='utf-8')
data.head()

Unnamed: 0,title,link,author,date_str,url,abstract,full_text,tags,views
0,"""Si vis pacem, para bellum"" от ЕС и ОДКБ. Прот...",https://ukraina.ru/20250529/si-vis-pacem-para-...,Владимир Скачко,17:19,https://ukraina.ru/20250529/si-vis-pacem-para-...,"Главный редактор журнала ""Национальная оборона...","Например, когда Германия таки решится и постав...","[ЕС, Украина, Еврокомиссия, война, вооружения,...",
1,День независимости Грузии. Мысленное путешеств...,https://ukraina.ru/20250529/den-nezavisimosti-...,Василий Стоякин,15:35,https://ukraina.ru/20250529/den-nezavisimosti-...,24 мая президент Турции Реджеп Тайип Эрдоган п...,Общее правило при определении дней независимос...,"[Мнения, Грузия, США, Турция, Дональд Трамп, Э...",
2,Мастер-класс между раундами переговоров,https://ukraina.ru/20250529/master-klass-mezhd...,Ростислав Ищенко,10:02,https://ukraina.ru/20250529/master-klass-mezhd...,Второго июня в Стамбуле запланирован второй ра...,"Дело в том, что подобные делегации, состоящие ...","[Россия, Украина, Москва, Сергей Лавров, Влади...",
3,Лев готовится к броску. Папа Римский опять за...,https://ukraina.ru/20250528/lev-gotovitsya-k-b...,Владимир Скачко,"Вчера, 21:14",https://ukraina.ru/20250528/lev-gotovitsya-k-b...,Папа Римский Лев XIV в конце общей аудиенции 2...,Такое предложение – организовать встречу Росси...,"[Ватикан, Украина, Россия, Папа Римский, Влади...",
4,Инсайд из США. Кто новый куратор Украины в Ваш...,https://ukraina.ru/20250528/insayd-iz-ssha-kto...,Михаил Павлив,"Вчера, 19:54",https://ukraina.ru/20250528/insayd-iz-ssha-kto...,После задавших темп в международном кейсе пред...,"Состоялся обмен пленными в три захода, выполне...","[Украина, Россия, Вашингтон, Владимир Зеленски...",
