In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time
import csv
from datetime import datetime
import pandas as pd
import json
import logging
from pathlib import Path
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [2]:
def parse_articles(soup):
    """Extract article metadata from a listing page or a "load more" fragment.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML containing zero or more ``div.list-item`` elements.

    Returns
    -------
    list[dict]
        One dict per article with the keys ``title``, ``link`` (the anchor's
        href prefixed with ``https://ukraina.ru``), ``author`` and
        ``date_str``. ``author`` and ``date_str`` are empty strings when the
        corresponding tag is absent. Items without a title anchor, or whose
        anchor carries no ``href``, are skipped.
    """
    articles = []
    for item in soup.select('div.list-item'):
        title_tag = item.select_one('a.list-item__title')
        if not title_tag:
            continue
        # .get() rather than ['href']: one anchor without an href must not
        # abort parsing of the whole page with a KeyError.
        link = title_tag.get('href')
        if not link:
            continue
        title = title_tag.text.strip()
        author_tag = item.select_one('a.list-item__author b.list-item__author-title')
        author = author_tag.text.strip() if author_tag else ''
        date_tag = item.select_one('div.list-item__date')
        date_str = date_tag.text.strip() if date_tag else ''
        articles.append({
            'title': title,
            'link': "https://ukraina.ru" + link,
            'author': author,
            'date_str': date_str
        })
    return articles

def get_last_article_id_date(articles):
    """Return the ``(id, date)`` pair encoded in the smallest article link.

    The article whose ``link`` string compares lowest is selected, and its URL
    is expected to look like ``.../<8 digits>/<optional-slug->(<digits>).html``.

    Parameters
    ----------
    articles : list[dict]
        Article dicts, each carrying a ``link`` key.

    Returns
    -------
    tuple
        ``(id_str, date_str)`` taken from the selected link, or
        ``(None, None)`` when ``articles`` is empty or the link does not match
        the expected pattern.
    """
    if not articles:
        return None, None

    # Lexicographic minimum over the link strings; ties resolve to the first
    # such article in input order.
    lowest_link = min(articles, key=lambda entry: entry['link'])['link']

    match = re.search(r'/(\d{8})/(?:[^/]+-)?(\d+)\.html', lowest_link)
    if not match:
        return None, None

    date_part, id_part = match.groups()
    return id_part, date_part

def save_csv(filename, articles):
    """Write article dicts to ``filename`` as CSV and report how many were saved.

    The file is (over)written as UTF-8 with a BOM and has the columns
    ``title``, ``link``, ``author`` and ``date_str``.

    Parameters
    ----------
    filename : str or path-like
        Destination CSV path.
    articles : list[dict]
        Rows to write; each dict is keyed by the column names above.
    """
    columns = ["title", "link", "author", "date_str"]
    with open(filename, "w", encoding="utf-8-sig", newline="") as handle:
        table = csv.DictWriter(handle, fieldnames=columns)
        table.writeheader()
        table.writerows(articles)
    print(f"Saved {len(articles)} articles to {filename}")

def scrape_section(BASE_URL, LOAD_MORE_URL, output_csv, max_failures=5, timeout=10):
    """Scrape every article listed in one site section and save them to CSV.

    The landing page is fetched first; further batches are then requested from
    the "load more" endpoint, passing the ``id`` and ``date`` extracted from
    the previous batch, until the endpoint returns nothing new.

    Parameters
    ----------
    BASE_URL : str
        URL of the section landing page.
    LOAD_MORE_URL : str
        URL of the endpoint that returns further list items.
    output_csv : str
        Path of the CSV file written once pagination ends.
    max_failures : int, optional
        Number of consecutive failed "load more" requests tolerated before
        pagination is abandoned (what was collected so far is still saved).
    timeout : float, optional
        Per-request timeout in seconds.

    Returns
    -------
    list[dict]
        All collected article dicts. If no id/date can be extracted from the
        landing page, the initial articles are returned and no CSV is written.
    """
    print(f"--- Starting scraping section: {BASE_URL} ---")

    headers = {
        "User-Agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest",
    }

    # The context manager releases pooled connections even if scraping aborts.
    with requests.Session() as session:
        retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
        adapter = HTTPAdapter(max_retries=retries)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        # A timeout here keeps a stalled connection from hanging the cell forever.
        r = session.get(BASE_URL, headers=headers, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        all_articles = parse_articles(soup)
        print(f"Initial articles fetched: {len(all_articles)}")

        id_param, date_param = get_last_article_id_date(all_articles)
        if not id_param or not date_param:
            print("ID/date extraction failed. Skipping section.")
            return all_articles

        params = {"id": id_param, "date": date_param}

        # Maintained incrementally instead of being rebuilt from the full
        # article list on every batch.
        links_seen = {a['link'] for a in all_articles}
        consecutive_failures = 0

        while True:
            try:
                resp = session.get(LOAD_MORE_URL, headers=headers, params=params, timeout=timeout)
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                consecutive_failures += 1
                # A persistent error (e.g. a 4xx response) would otherwise be
                # retried forever with identical parameters.
                if consecutive_failures >= max_failures:
                    print(f"Network error: {e}. Giving up after {consecutive_failures} consecutive failures.")
                    break
                print(f"Network error: {e}. Retrying...")
                time.sleep(5)
                continue
            consecutive_failures = 0

            if not resp.text.strip():
                print("No more articles. Stopping.")
                break

            soup_next = BeautifulSoup(resp.text, 'html.parser')
            new_articles = parse_articles(soup_next)

            if not new_articles:
                print("No new articles found.")
                break

            new_articles_filtered = [a for a in new_articles if a['link'] not in links_seen]

            if not new_articles_filtered:
                print("No unseen articles. Ending.")
                break

            all_articles.extend(new_articles_filtered)
            links_seen.update(a['link'] for a in new_articles_filtered)
            print(f"Total collected so far: {len(all_articles)}")

            new_id, new_date = get_last_article_id_date(new_articles_filtered)
            if not new_id or not new_date:
                print("ID/date extraction failed in new batch.")
                break

            params['id'] = new_id
            params['date'] = new_date
            time.sleep(0.5)  # small pause between pagination requests

    save_csv(output_csv, all_articles)
    return all_articles


In [3]:
# Crawl the "history" section: landing page first, then its load-more endpoint.
historia_articles = scrape_section(
    "https://ukraina.ru/history/",
    "https://ukraina.ru/services/history/more.html",
    output_csv="articles_historia.csv",
)

--- Starting scraping section: https://ukraina.ru/history/ ---
Initial articles fetched: 9
Total collected so far: 16
Total collected so far: 23
Total collected so far: 30
Total collected so far: 38
Total collected so far: 45
Total collected so far: 51
Total collected so far: 57
Total collected so far: 62
Total collected so far: 70
Total collected so far: 78
Total collected so far: 86
Total collected so far: 92
Total collected so far: 99
Total collected so far: 105
Total collected so far: 113
Total collected so far: 121
Total collected so far: 127
Total collected so far: 134
Total collected so far: 141
Total collected so far: 149
Total collected so far: 155
Total collected so far: 162
Total collected so far: 169
Total collected so far: 175
Total collected so far: 182
Total collected so far: 189
Total collected so far: 196
Total collected so far: 202
Total collected so far: 207
Total collected so far: 213
Total collected so far: 220
Total collected so far: 227
Total collected so far: 23

In [4]:
# Crawl the "opinion" section: landing page first, then its load-more endpoint.
opinion_articles = scrape_section(
    "https://ukraina.ru/opinion/",
    "https://ukraina.ru/services/opinion/more.html",
    output_csv="articles_opinion.csv",
)

--- Starting scraping section: https://ukraina.ru/opinion/ ---
Initial articles fetched: 12
Total collected so far: 22
Total collected so far: 32
Total collected so far: 40
Total collected so far: 48
Total collected so far: 59
Total collected so far: 69
Total collected so far: 79
Total collected so far: 90
Total collected so far: 101
Total collected so far: 107
Total collected so far: 116
Total collected so far: 127
Total collected so far: 137
Total collected so far: 147
Total collected so far: 158
Total collected so far: 164
Total collected so far: 174
Total collected so far: 183
Total collected so far: 191
Total collected so far: 200
Total collected so far: 210
Total collected so far: 221
Total collected so far: 230
Total collected so far: 241
Total collected so far: 251
Total collected so far: 258
Total collected so far: 269
Total collected so far: 280
Total collected so far: 291
Total collected so far: 301
Total collected so far: 311
Total collected so far: 320
Total collected so f

In [5]:
# Settings for the browser-driven (infinite scroll) pass over the history section.
URL = "https://ukraina.ru/history/"                      # page opened in the browser
OUTPUT_CSV = "articles_historia_scroll_2014_2020.csv"   # where the scraped listing is written
SCROLL_DURATION = 5 * 60                                 # upper bound on scrolling time, in seconds


def parse_articles(soup):
    """Collect article metadata from every ``div.list-item`` in ``soup``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a listing page.

    Returns
    -------
    list[dict]
        One dict per article with the keys ``title``, ``link`` (the anchor's
        href prefixed with ``https://ukraina.ru``), ``author`` and
        ``date_str``. Missing author/date tags yield empty strings; items
        lacking a title anchor or an href are skipped.
    """
    def _stripped_text(node):
        # Absent tags are reported as an empty string.
        return node.get_text(strip=True) if node else ''

    collected = []
    for entry in soup.select("div.list-item"):
        anchor = entry.select_one("a.list-item__title")
        if not anchor:
            continue

        heading = anchor.get_text(strip=True)
        href = anchor.get("href")
        if not href:
            continue

        byline = _stripped_text(
            entry.select_one("a.list-item__author b.list-item__author-title")
        )
        published = _stripped_text(entry.select_one("div.list-item__date"))

        collected.append({
            "title": heading,
            "link": "https://ukraina.ru" + href,
            "author": byline,
            "date_str": published,
        })
    return collected

# Seconds to wait after the initial page load before scrolling starts.
# NOTE(review): presumably leaves time for the page to settle or for manual
# dismissal of overlays; confirm whether a full minute is really needed.
PAGE_LOAD_WAIT = 60
# Number of consecutive checks with an unchanged page height tolerated before
# concluding that no more content will load. A single check after a short
# pause ends the scroll before lazily loaded items have had time to arrive.
MAX_STALLED_CHECKS = 10

options = Options()
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")
options.add_argument("--disable-infobars")
options.add_argument("--disable-extensions")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)

driver = webdriver.Chrome(options=options)
try:
    driver.get(URL)
    time.sleep(PAGE_LOAD_WAIT)

    print("Scrolling started...")
    start_time = time.time()
    scroll_pause = 0.5
    last_height = driver.execute_script("return document.body.scrollHeight")
    stalled_checks = 0

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)
        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == last_height:
            stalled_checks += 1
            if stalled_checks >= MAX_STALLED_CHECKS:
                print("No new content loaded. Stopping scroll.")
                break
        else:
            # Page grew: reset the stall counter and keep scrolling.
            stalled_checks = 0
            last_height = new_height

        if time.time() - start_time > SCROLL_DURATION:
            print("Max scroll duration reached.")
            break

    print("Scroll complete. Parsing page...")

    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always shut the browser down, even if loading or scrolling raised.
    driver.quit()

articles = parse_articles(soup)

print(f"Total articles scraped: {len(articles)}")

with open(OUTPUT_CSV, "w", encoding="utf-8-sig", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "link", "author", "date_str"])
    writer.writeheader()
    writer.writerows(articles)

print(f"Saved to {OUTPUT_CSV}")


Scrolling started...
No new content loaded. Stopping scroll.
Scroll complete. Parsing page...
Total articles scraped: 9
Saved to articles_historia_scroll_2014_2020.csv


In [6]:
def fetch_article_content_basic(url, session):
    """Download one article page and pull out its abstract, body text and tags.

    Parameters
    ----------
    url : str
        Article URL to fetch.
    session : requests.Session
        Session used for the GET request (sent with the module-level
        ``HEADERS`` and a 10 second timeout).

    Returns
    -------
    dict or None
        ``{"url", "abstract", "full_text", "tags"}`` on success, where
        ``full_text`` joins the text blocks with blank lines and ``tags`` is a
        list of strings. Any exception is logged and ``None`` is returned.
    """
    try:
        response = session.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        announce = page.select_one('div.article__announce-text')
        paragraphs = page.select('div.article__block[data-type="text"] > div.article__text')
        tag_links = page.select('div.article__tags a.article__tags-item')

        return {
            "url": url,
            "abstract": announce.text.strip() if announce else '',
            "full_text": "\n\n".join(paragraph.text.strip() for paragraph in paragraphs),
            "tags": [tag_link.text.strip() for tag_link in tag_links],
        }
    except Exception as e:
        # Best effort: the caller treats None as "content not retrieved".
        logging.error(f"Error in fetch_article_content_basic {url}: {e}")
        return None

def load_existing_data(json_file):
    """Return the parsed content of ``json_file``, or ``[]`` if it does not exist.

    Parameters
    ----------
    json_file : str or path-like
        Path of a UTF-8 encoded JSON file.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file; an empty list when the
        file is missing.
    """
    if not Path(json_file).exists():
        return []
    with open(json_file, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def save_data(json_file, data):
    """Serialize ``data`` to ``json_file`` as UTF-8 JSON without risking a torn file.

    The JSON is first written to a temporary sibling file and then renamed over
    the target, so an interrupted or failed dump never leaves a half-written
    (undecodable) ``json_file`` behind.

    Parameters
    ----------
    json_file : str or path-like
        Destination path.
    data : list
        JSON-serializable collection of articles; its length is logged.

    Raises
    ------
    TypeError, OSError
        Propagated from serialization or file operations; the previous content
        of ``json_file`` is left untouched in that case.
    """
    target = Path(json_file)
    tmp_path = target.with_name(target.name + ".tmp")
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Rename only after the dump completed successfully.
        tmp_path.replace(target)
    except BaseException:
        # Do not leave the partial temporary file lying around.
        if tmp_path.exists():
            tmp_path.unlink()
        raise
    logging.info(f"Saved data to {json_file}, total articles: {len(data)}")

def scrape_articles_content_from_multiple_csv(csv_files, JSON_FILE):
    """Fetch the content of every listed article that is not yet in ``JSON_FILE``.

    The CSV listings are concatenated and de-duplicated on ``link``. Articles
    whose URL is already present in ``JSON_FILE`` are skipped, which makes the
    function resumable. Progress is written every ``BATCH_SIZE`` successfully
    fetched articles and once more at the end (also when the run is
    interrupted), so fetched content is not lost.

    Parameters
    ----------
    csv_files : list[str]
        Paths of CSV files with at least the columns ``title`` and ``link``
        (``author`` and ``date_str`` are used when present).
    JSON_FILE : str
        Path of the JSON file holding the accumulated article content.
    """
    print("Loading CSV files...")
    # dtype=str + keep_default_na=False: empty cells stay empty strings rather
    # than becoming NaN, which json.dump would emit as the non-standard token
    # ``NaN``.
    dfs = [pd.read_csv(f, dtype=str, keep_default_na=False) for f in csv_files]
    df = pd.concat(dfs).drop_duplicates(subset="link").reset_index(drop=True)

    all_articles = load_existing_data(JSON_FILE)
    existing_articles = {a['url']: a for a in all_articles}

    print(f"Already scraped articles: {len(existing_articles)}")

    articles_to_fetch = [
        row for row in df.to_dict("records")
        if existing_articles.get(row['link']) is None
    ]

    print(f"Articles to scrape : {len(articles_to_fetch)}")

    unsaved = 0  # successfully fetched articles not yet written to disk
    with requests.Session() as session:
        try:
            for idx, row in enumerate(articles_to_fetch, 1):
                url = row['link']
                print(f"Scraping article {idx}/{len(articles_to_fetch)}: {url}")
                content = fetch_article_content_basic(url, session)
                if content:
                    existing_articles[url] = {
                        "title": row['title'],
                        "link": row['link'],
                        "author": row.get('author', ''),
                        "date_str": row.get('date_str', ''),
                        **content
                    }
                    unsaved += 1
                    if unsaved >= BATCH_SIZE:
                        save_data(JSON_FILE, list(existing_articles.values()))
                        unsaved = 0
                else:
                    logging.warning(f"Content not retrieved for {url}")

                # Pause after every request, including failed ones, so a run
                # of errors does not turn into a burst of back-to-back hits.
                time.sleep(1)
        finally:
            # Flush the trailing batch even if the last article failed or the
            # run was interrupted; saving only inside the success branch would
            # drop it.
            if unsaved:
                save_data(JSON_FILE, list(existing_articles.values()))

    print("Scraping finished.")

In [7]:
# --- History section: input listings, content output, log file ---
CSV_FILES_HISTORIA = [
    "articles_historia.csv",
    "articles_historia_scroll_2014_2020.csv",
]
JSON_FILE_HISTORIA = "articles_content_historia.json"
LOG_FILE_HISTORIA = "scraper_content_historia.log"

# --- Opinion section: input listings, content output, log file ---
CSV_FILES_OPINION = ["articles_opinion.csv"]
JSON_FILE_OPINION = "articles_content_opinion.json"
LOG_FILE_OPINION = "scraper_content_opinion.log"

# Number of articles between two intermediate saves of the content JSON.
BATCH_SIZE = 50

# HTTP headers sent with every article request.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "x-requested-with": "XMLHttpRequest",
}

In [8]:
# logging.basicConfig() does nothing when the root logger already has
# handlers (e.g. this cell is re-run, or logging was configured earlier in the
# session), so detach any existing handlers first to make sure this run really
# logs to LOG_FILE_HISTORIA.
root_logger = logging.getLogger()
for old_handler in list(root_logger.handlers):
    root_logger.removeHandler(old_handler)
    old_handler.close()

logging.basicConfig(
    filename=LOG_FILE_HISTORIA,
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s'
)

scrape_articles_content_from_multiple_csv(CSV_FILES_HISTORIA, JSON_FILE_HISTORIA)

Loading CSV files...
Already scraped articles: 4371
Articles to scrape : 2
Scraping article 1/2: https://ukraina.ru/20250610/na-kievskom-napravlenii--edinstvennyy-film-o-kievskom-kotle-1063816565.html
Scraping article 2/2: https://ukraina.ru/20250610/1023859046.html
Scraping finished.


In [9]:
# logging.basicConfig() does nothing when the root logger already has
# handlers. Without detaching the handler installed by a previous
# configuration, the opinion run would keep writing to that earlier log file
# instead of LOG_FILE_OPINION.
root_logger = logging.getLogger()
for old_handler in list(root_logger.handlers):
    root_logger.removeHandler(old_handler)
    old_handler.close()

logging.basicConfig(
    filename=LOG_FILE_OPINION,
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s'
)

scrape_articles_content_from_multiple_csv(CSV_FILES_OPINION, JSON_FILE_OPINION)

Loading CSV files...
Already scraped articles: 7100
Articles to scrape : 45
Scraping article 1/45: https://ukraina.ru/20250610/knr-po-amerikanski-los-anzheles-na-puti-iz-ssha-v-kaliforniyskuyu-narodnuyu-respubliku-1063817273.html
Scraping article 2/45: https://ukraina.ru/20250609/zadacha-6-tysyach-tel-chto-pridumal-zelenskiy-dlya-mistifikatsii-obmena-telami-pogibshikh-soldat-1063797895.html
Scraping article 3/45: https://ukraina.ru/20250609/ukrainskiy-bunt-bessmyslennyy-besposchadnyy-nerealnyy-1063793638.html
Scraping article 4/45: https://ukraina.ru/20250609/neodnoznachnye-raschety-antirampistov-evropa-tyanet-trampa-na-voynu-pokornostyu-i-strakhom-zelenskogo-1063789836.html
Scraping article 5/45: https://ukraina.ru/20250609/mir-na-grani-na-grani-chego-1063767900.html
Scraping article 6/45: https://ukraina.ru/20250607/doroga-v-ad-uskorilas-ukrainskie-neonatsisty-uzhe-izbavilis-ot-chelovecheskogo-oblika-1063747742.html
Scraping article 7/45: https://ukraina.ru/20250605/vozmezdie-neminue

In [10]:
def get_views_from_dynamics(article_url):
    """Look up the view counter of an article via the site's "dynamics" service.

    The last two path segments of ``article_url`` (date folder and page name
    without ``.html``) are used to build the service URL; the counter is read
    from the ``span.statistic__item.m-views`` element of the response.

    Parameters
    ----------
    article_url : str
        Full article URL.

    Returns
    -------
    int or None
        The view count, or ``None`` when the element is missing or any error
        occurs (the error is printed).
    """
    try:
        segments = article_url.strip('/').split('/')
        page_id = segments[-1].replace('.html', '')
        day = segments[-2]
        dynamics_url = f"https://ukraina.ru/services/dynamics/{day}/{page_id}.html"

        request_headers = {
            "User-Agent": "Mozilla/5.0",
            "Referer": article_url,
            "Accept": "text/html, */*;q=0.01",
            "X-Requested-With": "XMLHttpRequest"
        }

        reply = requests.get(dynamics_url, headers=request_headers, timeout=10)
        reply.raise_for_status()

        counter = BeautifulSoup(reply.text, "html.parser").select_one('span.statistic__item.m-views')
        if not counter:
            return None

        # Drop non-breaking and regular spaces used as thousands separators.
        digits = counter.get_text(strip=True).replace('\xa0', '').replace(' ', '')
        return int(digits)
    except Exception as e:
        print(f"Error fetching views from dynamics URL: {e}")
        return None

In [20]:
def _write_json_atomic(json_file, data):
    """Dump ``data`` to a temporary sibling file, then rename it over ``json_file``."""
    tmp_path = Path(f"{json_file}.tmp")
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    tmp_path.replace(json_file)


def update_views_from_dynamics(json_file, save_every=50):
    """Fill in the missing ``views`` field of every article stored in ``json_file``.

    Articles that already have a non-``None`` ``views`` value are left alone.
    For the others the count is fetched with ``get_views_from_dynamics`` using
    the article's ``url`` (or ``link``) and stored, even when the fetch
    returned ``None``. The file is rewritten every ``save_every`` updates and
    once at the end; it is not touched at all when nothing was updated.

    Parameters
    ----------
    json_file : str
        Path of the JSON file containing a list of article dicts.
    save_every : int, optional
        Number of updates between two intermediate saves.

    Returns
    -------
    None
        Missing files and undecodable JSON are reported with a message.
    """
    if not Path(json_file).exists():
        print("JSON file not found.")
        return

    with open(json_file, encoding="utf-8") as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            return

    updated = 0
    pending = 0  # updates made since the last write to disk
    for i, article in enumerate(data, 1):
        if article.get('views') is not None:
            continue

        url = article.get("url") or article.get("link")
        if not url:
            continue

        views = get_views_from_dynamics(url)
        article['views'] = views
        updated += 1
        pending += 1
        print(f"[{i}] Updated views: {views} for {url}")

        # Checkpoint only right after an update. Checking the counter on every
        # iteration would rewrite the whole file for each already-complete
        # article while the counter sits at 0 or a multiple of the batch size.
        if pending >= save_every:
            _write_json_atomic(json_file, data)
            pending = 0
            print(f"Saved after {updated} updates.")

    # Final save
    if pending:
        _write_json_atomic(json_file, data)
    print(f"Total articles updated with views: {updated}")

In [21]:
# Use the configured path instead of repeating the file name literal, so this
# step always targets the file written by the history content scrape.
update_views_from_dynamics(JSON_FILE_HISTORIA)

JSONDecodeError: Expecting ',' delimiter: line 40186 column 11829 (char 18258757)


In [24]:
JSON_FILE = "articles_content_historia.json"

# Print the first ten raw lines of the file without parsing it as JSON.
with open(JSON_FILE, "r", encoding="utf-8") as raw_json:
    lines_left = 10
    while lines_left > 0:
        print(raw_json.readline().strip())
        lines_left -= 1


[
{
"title": "Жёлты воды: первая победа Богдана Хмельницкого",
"link": "https://ukraina.ru/20250517/1023584796.html",
"author": "Александр  Александров",
"date_str": "17 мая, 16:04",
"url": "https://ukraina.ru/20250517/1023584796.html",
"abstract": "16 мая 1648 года Войско Запорожское нанесло поражение отряду Стефана Потоцкого под Жёлтыми водами. Вся эпопея длилась 18 дней и потребовала существенного напряжения сил обоих сторон, однако Хмельницкий, сумевший реализовать несколько хитрых задумок и воспользовавшийся ошибками поляков, довёл дело до закономерного исхода",
"full_text": "Узнав о начале восстания казаков под руководством Хмельницкого, великий коронный гетман Николай Потоцкий, не дожидаясь помощи от могущественного магната Иеремии Вишневецкого, 21 апреля 1648 года направил против казацко-татарских отрядов два крупных соединения, надеясь подавить мятеж быстрым точечным ударом. Он писал: \"Этот безрассудный человек, Хмельницкий, не преклонится перед милостью\".\n\nПервым войском 

In [25]:
# Use the configured path instead of repeating the file name literal, so this
# step always targets the file written by the opinion content scrape.
update_views_from_dynamics(JSON_FILE_OPINION)

[1] Updated views: 1902 for https://ukraina.ru/20250529/si-vis-pacem-para-bellum-ot-es-i-odkb-protivniki-gotovyatsya-k-bezopasnoy-skhvatke--1063495085.html
[2] Updated views: 987 for https://ukraina.ru/20250529/den-nezavisimosti-gruzii-myslennoe-puteshestvie-v-chudesnyy-mir-diplomaticheskikh-nedogovornnostey-1063492260.html
[3] Updated views: 13724 for https://ukraina.ru/20250529/master-klass-mezhdu-raundami-peregovorov-1063480420.html
[4] Updated views: 1182 for https://ukraina.ru/20250528/lev-gotovitsya-k-brosku--papa-rimskiy-opyat-zamyslil-globalnyy-katolitsizm-dlya-vsego-mira-1063476634.html
[5] Updated views: 3984 for https://ukraina.ru/20250528/insayd-iz-ssha-kto-novyy-kurator-ukrainy-v-vashingtone-i-kto-budet-podpisyvat-peremirie-ot-ukrainy-1063474906.html
[6] Updated views: 1748 for https://ukraina.ru/20250528/shakhtery-i-roboty-kak-iz-razrukhi-i-razoreniya-izvlech-donbassu-pribyl-1063459026.html
[7] Updated views: 16010 for https://ukraina.ru/20250528/kharkov-ili-odessa-106345

In [26]:
# Load the opinion content file and inspect the collected view counts.
JSON_FILE = "articles_content_opinion.json"
data = pd.read_json(JSON_FILE, encoding='utf-8')
# A bare `data.head()` in the middle of the cell was dropped: only a cell's
# last expression is displayed, so its result was silently discarded.
print("Data length:", len(data))
data["views"]

Data length: 7145


0        1902
1         987
2       13724
3        1182
4        3984
        ...  
7140     1472
7141      598
7142    21853
7143     1477
7144      937
Name: views, Length: 7145, dtype: int64