In [1]:
import json
import cfscrape
from bs4 import BeautifulSoup
import feedparser
import os
import tarfile
from pathlib import Path
import typing as t
import logging
import tempfile
from datetime import datetime
from dateutil.parser import parse

In [2]:
# Let DEBUG records from the logger named 'scrapper' through.
logging.getLogger('scrapper').setLevel(logging.DEBUG)
# `logger` is the root logger; the functions defined below log through it.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Records are rendered as "<time> <LEVEL>:<message>". `%I` is the 12-hour clock
# hour and the format carries no AM/PM marker, so printed times repeat every
# twelve hours.
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

In [3]:
# RSS feed whose entries supply the article links to process.
RSS_FILE_URL = 'https://www.investing.com/rss/news_301.rss'

# Output directory for the per-article archives and the cache file.
# It is created as a side effect of running this cell if it does not exist yet.
PARSED_ARTICLES_DIR = os.path.join('data', 'parse_kostya_investing')
os.makedirs(PARSED_ARTICLES_DIR, exist_ok=True)

# Text file (one link per line) listing the articles that were already archived.
ALREADY_LOADED_ARTICLES_DB_FILE = os.path.join(PARSED_ARTICLES_DIR, 'list_of_articles.txt')

In [4]:
def get_article_links_from_rss(rss_url):
    """Return the ``link`` attribute of every entry of the feed, in feed order.

    :param rss_url: value handed to ``feedparser.parse``.
    :return: list with one link per feed entry (empty when the feed has no entries).
    """
    feed = feedparser.parse(rss_url)
    links = []
    for feed_entry in feed.entries:
        links.append(feed_entry.link)
    return links


class ArticleData(t.NamedTuple):
    """Data extracted from a single article page (built by ``get_data``)."""
    # Part of the article link after its last '/'; used as the archive file name.
    slug: str
    # Publication timestamp parsed from the page's "published" detail span.
    dt: datetime
    # Text of the first <h1> element of the page.
    title: str
    # Stripped text content of the article body container.
    text: str
    # Downloaded content of the first image of the article body; empty (falsy)
    # when the article has no image.
    picture_bytes: bytes
    # Link the article was downloaded from.
    href: str


def get_data(scrapper, article_link, soup: BeautifulSoup) -> ArticleData:
    """Extract title, text, publication date and header picture of an article.

    Side effect: ``soup`` is modified in place. Every ``<script>`` inside the
    article body and every ``div.relatedInstrumentsWrapper`` in the document is
    removed, so that their content does not end up in the article text.

    :param scrapper: session-like object; ``scrapper.get(url)`` is used to
        download the picture.
    :param article_link: URL of the article; the part after its last ``/``
        becomes the slug.
    :param soup: parsed article page.
    :return: the extracted ``ArticleData``. ``picture_bytes`` is ``b''`` when
        the article has no usable picture.
    :raises AttributeError: if the page has no ``<h1>`` or no article body div.
    :raises KeyError: if no "published" date could be read from the page.
    """
    article_title = soup.find('h1').text

    # Look the article body up once and reuse it for every query below.
    article_body = soup.find('div', class_="WYSIWYG articlePage")
    for script in article_body.find_all('script'):
        script.decompose()
    for div in soup.findAll('div', class_='relatedInstrumentsWrapper'):
        div.decompose()

    article_text = ''.join(article_body.findAll(string=True)).strip()

    # A missing image is an expected case, so it is tested for explicitly
    # instead of being detected through a caught AttributeError.
    img_tag = article_body.find('img')
    article_pic_href = img_tag.get('src') if img_tag is not None else None

    # Always bytes, so the value matches the ``picture_bytes: bytes`` field.
    picture_bytes = b''
    if article_pic_href:
        picture_response = scrapper.get(article_pic_href)
        try:
            # Do not store the body of an error response as if it were the picture.
            picture_response.raise_for_status()
        except Exception as e:
            logger.error(e)
        else:
            picture_bytes = picture_response.content
    else:
        logger.info(f'no picture found for article "{article_link}"')

    dates = {}
    for span in soup.select('div.contentSectionDetails span'):
        words = span.text.split(' ')
        # The first word is the label (e.g. "published"); the last word is
        # dropped and everything in between is parsed as the timestamp.
        label = words[0].strip().lower()
        try:
            dates[label] = parse(' '.join(words[1:-1]))
        except (ValueError, OverflowError):
            # A span whose text is not a date must not discard the whole article.
            logger.debug(f'skip detail span without a parsable date: "{span.text}"')

    return ArticleData(
        slug=article_link.rsplit('/', 1)[-1],
        title=article_title,
        dt=dates['published'],
        text=article_text,
        picture_bytes=picture_bytes,
        href=article_link,
    )


def archive_files(archive_path, archive_file_name, files):
    """Pack ``files`` into ``<archive_path>/<archive_file_name>.tar.xz``.

    Every file is stored under its base name only. Without an explicit
    ``arcname`` the archive members would carry the full source path, i.e. the
    directory the files happened to be written to.

    :param archive_path: existing directory that receives the archive.
    :param archive_file_name: archive name without the ``.tar.xz`` suffix.
    :param files: paths of the files to add. Files sharing a base name would
        produce duplicate member names, so base names are expected to be unique.
    """
    archive_name = os.path.join(archive_path, f'{archive_file_name}.tar.xz')
    with tarfile.open(archive_name, 'w:xz') as tar_obj:
        for file in files:
            tar_obj.add(file, arcname=os.path.basename(file))
    logger.info(f'saved to "{archive_name}"')


def save_cached_articles(links: t.Set[str]) -> None:
    """Overwrite the cache file with ``links``, one per line, in sorted order."""
    with open(ALREADY_LOADED_ARTICLES_DB_FILE, 'wt', encoding='utf-8') as db_file:
        for link in sorted(links):
            db_file.write(f'{link}\n')


def load_cached_articles() -> t.Set[str]:
    """Read the cache file and return its lines, stripped, as a set.

    An empty set is returned when the cache file does not exist.
    """
    if os.path.exists(ALREADY_LOADED_ARTICLES_DB_FILE):
        with open(ALREADY_LOADED_ARTICLES_DB_FILE, 'rt', encoding='utf-8') as db_file:
            return {raw_line.strip() for raw_line in db_file}

    return set()


def _picture_extension(picture_href):
    """Return the file extension (without the dot) of the picture URL, or ''.

    A query string or fragment is ignored and only the last path segment is
    inspected, so the result never contains '/' or '?'.
    """
    path_part = picture_href.split('?', 1)[0].split('#', 1)[0]
    return os.path.splitext(os.path.basename(path_part))[1].lstrip('.')


def _article_metadata(info, soup):
    """Build the dict that is stored as ``data.json`` for one article."""
    # The JSON-LD block is parsed once; author and section both come from it.
    ld_json = json.loads(soup.find('script', type='application/ld+json').text)
    return {
        'title': info.title,
        'text': info.text,
        'publication_dt': info.dt.strftime("%Y-%m-%d %H:%M:%S"),
        'parsing_dt': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'locale': soup.find("meta", attrs={"property": "og:locale"}).get("content"),
        'href': info.href,
        'author': ld_json['author']['name'],
        'article_section': ld_json['articleSection'],
    }


def _archive_article(article_html, info, soup):
    """Write page, metadata and picture to a temp dir and archive them."""
    metadata = _article_metadata(info, soup)

    with tempfile.TemporaryDirectory() as tmpdirname:
        html_path = os.path.join(tmpdirname, 'article.html')
        with open(html_path, 'w', encoding='utf-8') as html_file:
            html_file.write(article_html)

        json_path = os.path.join(tmpdirname, 'data.json')
        with open(json_path, 'w', encoding='utf-8') as json_file:
            json.dump(metadata, json_file, indent=4)

        files_to_archive = [html_path, json_path]

        if info.picture_bytes:
            article_pic_href = soup.find('div', class_="WYSIWYG articlePage").find('img').get('src')
            pic_format = _picture_extension(article_pic_href)
            pic_name = f'header_pic.{pic_format}' if pic_format else 'header_pic'
            pic_path = os.path.join(tmpdirname, pic_name)
            with open(pic_path, 'wb') as img_file:
                img_file.write(info.picture_bytes)
            files_to_archive.append(pic_path)

        # Every file has been closed, and therefore flushed, before it is
        # archived; the temp dir must still exist, so this stays inside `with`.
        archive_files(PARSED_ARTICLES_DIR, info.slug, files_to_archive)


def parsing(rss_file_url) -> None:
    """Download and archive every article of the feed that is not cached yet.

    For each new link the page is downloaded, parsed with ``get_data`` and
    stored as ``<slug>.tar.xz`` in ``PARSED_ARTICLES_DIR``. The archive holds
    ``article.html``, ``data.json`` and, when available, the header picture.

    An article that fails (download error, unexpected page layout, ...) is
    logged and skipped. Its link is not added to the cache, so it is picked up
    again on the next run. The cache file is rewritten when the function ends,
    including when it ends through an exception or a keyboard interrupt, so
    articles archived so far are not processed again.

    :param rss_file_url: URL of the RSS feed that lists the article links.
    """
    all_article_links = get_article_links_from_rss(rss_file_url)
    old_article_links = load_cached_articles()
    new_article_links = set(all_article_links) - old_article_links
    successfully_saved_links = set()

    scrapper = cfscrape.create_scraper()
    logger.info(f"Count articles for parsing: {len(new_article_links)}")

    log_prefix = ''
    try:
        for article_link in new_article_links:
            logger.info(f'process article "{article_link}"')

            try:
                article_response = scrapper.get(article_link)
                article_response.raise_for_status()
            except Exception as e:
                logger.error(e)
                continue

            try:
                soup = BeautifulSoup(article_response.text, 'lxml')
                info = get_data(scrapper, article_link, soup)
                _archive_article(article_response.text, info, soup)
            except Exception:
                # One malformed article must not abort the whole batch.
                logger.exception(f'failed to process article "{article_link}"')
                continue

            successfully_saved_links.add(article_link)
    except KeyboardInterrupt:
        # A manual interrupt stops the loop quietly; progress is saved below.
        log_prefix = 'Keyboard interrupt.'
    finally:
        save_cached_articles(old_article_links.union(successfully_saved_links))
        logger.info(f"{log_prefix}Count articles for saving: {len(successfully_saved_links)}")

# Run the scraper once for the configured feed whenever this cell is executed.
parsing(RSS_FILE_URL)

01:57:24 INFO:Count articles for parsing: 0
01:57:24 INFO:Count articles for saving: 0
