In [None]:
import json
import cfscrape
from bs4 import BeautifulSoup
import feedparser
import os
import tarfile
from pathlib import Path
import typing as t
import logging
import tempfile
from datetime import datetime
from dateutil.parser import parse
from dateutil.tz import gettz

In [None]:
# Configure the root logger: DEBUG and above, each record prefixed with a
# 12-hour clock time and the level name. Note that basicConfig() leaves
# existing handlers alone if the root logger already has some.
logging.basicConfig(
    level=logging.DEBUG,
    datefmt='%I:%M:%S',
    format='%(asctime)s %(levelname)s:%(message)s',
)

# All functions below log through this shared root logger.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [None]:
# RSS feed whose entries are the article links to download.
RSS_FILE_URL = 'https://www.livemint.com/rss/news'

# Directory that receives one archive per article, plus the cache file that
# lists the links already processed.
PARSED_ARTICLES_DIR = os.path.join('data', 'parse_livemint')
ALREADY_LOADED_ARTICLES_DB_FILE = os.path.join(PARSED_ARTICLES_DIR, 'list_of_articles.txt')

# Create the output directory up front so later writes cannot fail on it.
os.makedirs(PARSED_ARTICLES_DIR, exist_ok=True)

In [None]:
def get_article_links_from_rss(rss_url):
    """Return the ``link`` of every entry of the feed at ``rss_url``, in feed order."""
    feed = feedparser.parse(rss_url)
    links = []
    for entry in feed.entries:
        links.append(entry.link)
    return links

class ArticleData(t.NamedTuple):
    """Immutable record of everything extracted from one article page.

    Instances are built by ``get_data``; the field order below is also the
    positional order of the tuple.
    """

    # File-name stem cut out of the article URL; used to name the archive.
    slug: str
    # Publication time parsed from the page's ``article:published_time`` meta
    # tag. ``get_date`` returns '' instead when that tag is missing.
    dt: datetime
    # Text of the page's first <h1> ('' when missing).
    title: str
    # Stripped text of the <p> elements inside ``div.mainArea``, space-joined.
    text: str
    # Content of the ``keywords`` meta tag, split on commas.
    keywords: list
    # ``src`` of the header image (falsy when the page has none).
    picture_href: str
    # Body of the response fetched from ``picture_href``; empty when there is
    # no picture link.
    picture_bytes: bytes
    # The article URL itself.
    href: str

def check_element_exist(f):
    """Decorate an extractor so that a missing page element yields ``''``.

    The wrapped function is expected to be called as ``f(soup, article_link)``.
    When it raises ``AttributeError`` (typically because a ``find`` call
    returned ``None``), the error is logged and an empty string is returned
    instead of propagating the exception.

    Args:
        f: the extractor function to wrap.

    Returns:
        A wrapper with the same name, docstring and signature metadata as ``f``.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(f)  # preserve __name__/__doc__ of the decorated extractor
    def func(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except AttributeError:
            # The link is normally the second positional argument, but it may
            # also arrive by keyword; the error report itself must never raise.
            if len(args) > 1:
                article_link = args[1]
            else:
                article_link = kwargs.get('article_link', '<unknown>')
            logger.error(f'Article {article_link} has no searched element')
            return ''

    return func


@check_element_exist
def get_title(soup, article_link):
    """Return the text of the page's first ``<h1>``.

    ``article_link`` is unused here; the decorator reads it for its error log.
    """
    heading = soup.find('h1')
    return heading.text


@check_element_exist
def get_text(soup, article_link):
    """Return the stripped text of every ``<p>`` inside ``div.mainArea``, space-joined.

    ``article_link`` is unused here; the decorator reads it for its error log.
    """
    main_area = soup.find('div', class_='mainArea')
    paragraphs = main_area.findAll('p')
    return ' '.join(paragraph.text.strip() for paragraph in paragraphs)


@check_element_exist
def get_picture_href(soup, article_link):
    """Return the ``src`` of the ``<img>`` inside the header-picture ``<span>``.

    ``article_link`` is unused here; the decorator reads it for its error log.
    """
    picture_wrapper = soup.find('span', class_='pos-rel dblock imgmobalignment')
    image = picture_wrapper.find('img')
    return image.get('src')


@check_element_exist
def get_date(soup, article_link):
    """Return the ``article:published_time`` meta value parsed by ``dateutil``.

    ``article_link`` is unused here; the decorator reads it for its error log.
    """
    published_meta = soup.find('meta', attrs={'property': 'article:published_time'})
    published_raw = published_meta.get('content')
    return parse(published_raw)


@check_element_exist
def get_keywords(soup, article_link):
    """Return the raw ``content`` of the page's ``keywords`` meta tag.

    ``article_link`` is unused here; the decorator reads it for its error log.
    """
    keywords_meta = soup.find('meta', attrs={'name': 'keywords'})
    return keywords_meta.get('content')


def _slug_from_link(article_link):
    """Return the last path segment of ``article_link`` without its extension."""
    # Drop fragment and query first so a '/' or '.' inside them cannot move the cut.
    path = article_link.split('#', 1)[0].split('?', 1)[0]
    last_segment = path.rstrip('/').rsplit('/', 1)[-1]
    stem = last_segment.rpartition('.')[0]
    # No usable stem (segment has no dot, or starts with one): keep the segment.
    return stem or last_segment


def _download_picture(scrapper, picture_href):
    """Return the picture's bytes, or ``b''`` when there is no link or the download fails."""
    if not picture_href:
        return b''
    try:
        picture_response = scrapper.get(picture_href)
        picture_response.raise_for_status()
    except Exception as e:
        # A broken picture must not prevent the article itself from being saved.
        logger.error(e)
        return b''
    return picture_response.content


def get_data(scrapper, article_link, soup: BeautifulSoup) -> ArticleData:
    """Extract all fields of one article page into an ``ArticleData``.

    Args:
        scrapper: HTTP session used to download the header picture.
        article_link: URL of the article; also the source of the slug.
        soup: parsed HTML of the article page.

    Returns:
        ``ArticleData`` for the page. Elements that are missing from the page
        come back as the extractors' fallback value, ``keywords`` is a list of
        non-empty, whitespace-stripped strings, and ``picture_bytes`` is
        ``b''`` when no picture could be downloaded.
    """
    article_title = get_title(soup, article_link)
    article_text = get_text(soup, article_link)
    article_pic_href = get_picture_href(soup, article_link)
    article_date = get_date(soup, article_link)
    # The meta tag may be absent ('' fallback) or lack a content attribute (None).
    raw_keywords = get_keywords(soup, article_link) or ''
    stripped_keywords = (keyword.strip() for keyword in raw_keywords.split(','))

    return ArticleData(
        slug=_slug_from_link(article_link),
        title=article_title,
        dt=article_date,
        text=article_text,
        keywords=[keyword for keyword in stripped_keywords if keyword],
        picture_href=article_pic_href,
        picture_bytes=_download_picture(scrapper, article_pic_href),
        href=article_link,
    )


def archive_files(archive_path, archive_file_name, files):
    """Pack ``files`` into ``<archive_path>/<archive_file_name>.tar.xz``.

    Each file is stored under its base name, so the archive does not embed the
    directory the files happened to live in (e.g. a random temporary directory).

    Args:
        archive_path: directory in which the archive is created.
        archive_file_name: archive name without the ``.tar.xz`` suffix.
        files: paths of the files to add; their base names should be unique.
    """
    archive_name = os.path.join(archive_path, f'{archive_file_name}.tar.xz')
    with tarfile.open(archive_name, 'w:xz') as tar_obj:
        for file in files:
            tar_obj.add(file, arcname=os.path.basename(file))
    logger.info(f'saved to "{archive_name}"')


def save_cached_articles(links: t.Set[str]) -> None:
    """Overwrite the cache file with ``links``, sorted, one link per line."""
    with open(ALREADY_LOADED_ARTICLES_DB_FILE, 'wt', encoding='utf-8') as cache_file:
        for link in sorted(links):
            cache_file.write(f'{link}\n')


def load_cached_articles() -> t.Set[str]:
    """Return the links stored in the cache file, or an empty set if it does not exist."""
    cached_links: t.Set[str] = set()
    if os.path.exists(ALREADY_LOADED_ARTICLES_DB_FILE):
        with open(ALREADY_LOADED_ARTICLES_DB_FILE, 'rt', encoding='utf-8') as cache_file:
            for line in cache_file:
                cached_links.add(line.strip())
    return cached_links

def _picture_extension(picture_href):
    """Return the extension of the picture URL's last path segment ('' if it has none)."""
    # Ignore fragment and query so they cannot leak into the file name.
    path = picture_href.split('#', 1)[0].split('?', 1)[0]
    last_segment = path.rsplit('/', 1)[-1]
    _stem, dot, extension = last_segment.rpartition('.')
    return extension if dot else ''


def _write_article_files(target_dir, info, article_html):
    """Write the page HTML, its JSON metadata and the header picture into ``target_dir``.

    Every file is closed before this function returns, so the returned paths
    can be read back (e.g. archived) with their full content on disk.

    Returns:
        Paths of the written files: HTML first, then JSON, then the picture
        (only when ``info.picture_bytes`` is non-empty).
    """
    html_path = os.path.join(target_dir, 'article.html')
    with open(html_path, 'w', encoding='utf-8') as html_file:
        html_file.write(article_html)

    # The date extractor falls back to '' when the page has no publish-time
    # meta tag; only a real datetime can be formatted.
    if isinstance(info.dt, datetime):
        publication_dt = info.dt.isoformat(' ', 'seconds')
    else:
        publication_dt = ''

    json_path = os.path.join(target_dir, 'data.json')
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump({
            'title': info.title,
            'text': info.text,
            'publication_dt': publication_dt,
            'parsing_dt': datetime.now(gettz()).isoformat(' ', 'seconds'),
            'meta_keywords': info.keywords,
            'href': info.href,
        }, json_file, indent=4)

    written_paths = [html_path, json_path]

    if info.picture_bytes:
        pic_format = _picture_extension(info.picture_href)
        pic_name = f'header_pic.{pic_format}' if pic_format else 'header_pic'
        pic_path = os.path.join(target_dir, pic_name)
        with open(pic_path, 'wb') as img_file:
            img_file.write(info.picture_bytes)
        written_paths.append(pic_path)

    return written_paths


def parsing(rss_file_url) -> None:
    """Download and archive every article of the feed that is not cached yet.

    For each new link the page is fetched, parsed with ``get_data`` and stored
    as ``<slug>.tar.xz`` in ``PARSED_ARTICLES_DIR``. Articles whose download
    fails are logged and skipped. The cache of processed links is rewritten
    when the run ends for any reason (normal completion, Ctrl-C, or an
    unexpected exception), so already archived articles are never fetched twice.

    Args:
        rss_file_url: URL of the RSS feed listing the articles.
    """
    all_article_links = get_article_links_from_rss(rss_file_url)
    old_article_links = load_cached_articles()
    new_article_links = {link for link in all_article_links if link not in old_article_links}
    successfully_saved_links = set()

    scrapper = cfscrape.create_scraper()
    logger.info(f'Count articles for parsing: {len(new_article_links)}')

    try:
        for article_link in new_article_links:
            logger.info(f'Process article "{article_link}"')

            # Connection errors and HTTP error statuses are both "skip this article".
            try:
                article_response = scrapper.get(article_link)
                article_response.raise_for_status()
            except Exception as e:
                logger.error(e)
                continue

            soup = BeautifulSoup(article_response.text, 'lxml')
            info = get_data(scrapper, article_link, soup)

            with tempfile.TemporaryDirectory() as tmpdirname:
                # Files are written AND closed before archiving; archiving a
                # still-open file can miss data left in its write buffer.
                files_to_archive = _write_article_files(tmpdirname, info, article_response.text)
                archive_files(PARSED_ARTICLES_DIR, info.slug, files_to_archive)
            successfully_saved_links.add(article_link)

        logger.info(f'Count articles for saving: {len(successfully_saved_links)}')
    except KeyboardInterrupt:
        logger.info(f'Keyboard interrupt.Count articles for saving: {len(successfully_saved_links)}')
    finally:
        # Persist progress however the loop ended.
        save_cached_articles(old_article_links.union(successfully_saved_links))

# Run one scraping pass over the configured feed.
parsing(rss_file_url=RSS_FILE_URL)