In [1]:
import json
import cfscrape
from bs4 import BeautifulSoup
import feedparser
import os
import tarfile

RSS_FILE_URL = 'https://www.investing.com/rss/news_301.rss'


def get_links_from_rss_file(_link):
    entries = feedparser.parse(_link).entries
    rss_parsed_links = [entry.link for entry in entries]
    return rss_parsed_links


def create_soup(_url, scrapper):
    response = scrapper.get(_url)
    soup = BeautifulSoup(response.text, 'lxml')
    return soup


def get_data(_article, scrapper):
    soup = create_soup(_article, scrapper)
    article_date = soup.findAll('div', class_='contentSectionDetails')[1].text.lstrip()
    article_title = soup.find('h1').text
    del_article_scripts = soup.findAll('script')
    for script in del_article_scripts:
        script.decompose()

    del_article_div = soup.findAll('div', class_='relatedInstrumentsWrapper')
    for div in del_article_div:
        div.decompose()

    article_text = ''.join(soup.find('div', class_="WYSIWYG articlePage").find_all(text=True)).lstrip()
    article_pic = soup.find('div', class_="WYSIWYG articlePage").find('img').get('src')
    article_data = [article_title, article_date, article_text, article_pic]
    return article_data


def archive_files(link, files):
    slug = link[link.rfind(str('/')) + 1:]
    archive_name = f'{os.getcwd()}\\{slug}.tar.xz'
    print(slug)
    print(archive_name)
    with tarfile.open(archive_name, 'w:xz') as tar_obj:
        for _file in files:
            tar_obj.add(_file)


def parsing():
    scrapper = cfscrape.create_scraper()
    if not os.path.exists('parsed_articles'):
        os.mkdir('parsed_articles')
    os.chdir('parsed_articles')
    rss_links = get_links_from_rss_file(RSS_FILE_URL)

    new_articles = []
    if not os.path.exists('list_of_articles.txt'):
        with open('list_of_articles.txt', 'w', encoding='utf-8') as file_check:
            pass

    with open('list_of_articles.txt', 'r+', encoding='utf-8') as file:
        file_data = [line.strip() for line in file.readlines()]
        for url in rss_links:
            if url not in file_data:
                file.write(f'{url}')
                file.write('\n')
                new_articles.append(url)

    print(f"Count articles for parsing {len(new_articles)}")
    for article in new_articles:
        print(f"Proceed article: {article}")
        info = get_data(article, scrapper)
        with open('article_data.txt', 'w', encoding='utf-8') as text_file, open('article_pic.jpg',
                                                                                'wb') as img_file, open(
            'article_html.html', 'w', encoding='utf-8') as html_file, open('meta.json', 'w') as json_file:
            for data in info[:3]:
                text_file.write(data)
                text_file.write('\n')

            pic = scrapper.get(info[3])
            img_file.write(pic.content)

            article_html = scrapper.get(article).text
            html_file.write(article_html)

            metadata_tags = create_soup(article, scrapper).findAll('meta')
            metadata = {tag.get('name') or tag.get('property') or tag.get('http-equiv'): tag.get('content') for tag in
                        metadata_tags}
            json.dump(metadata, json_file, indent=3)

            files_for_archive = [text_file.name, img_file.name, html_file.name, json_file.name]

            archive_files(article, files_for_archive)

        for file in files_for_archive:
            os.remove(file)


parsing()


Count articles for parsing 10
Proceed article: https://www.investing.com/news/cryptocurrency-news/buying-a-car-with-bitcoin-gets-37m-fine-prison-time-in-morocco-3079992


  article_text = ''.join(soup.find('div', class_="WYSIWYG articlePage").find_all(text=True)).lstrip()


buying-a-car-with-bitcoin-gets-37m-fine-prison-time-in-morocco-3079992
C:\Users\User\Desktop\parcingProject\parsed_articles\buying-a-car-with-bitcoin-gets-37m-fine-prison-time-in-morocco-3079992.tar.xz
Proceed article: https://www.investing.com/news/cryptocurrency-news/shariahcompliant-coins-release-set-to-spur-crypto-adoption-in-muslim-nations-3079945
shariahcompliant-coins-release-set-to-spur-crypto-adoption-in-muslim-nations-3079945
C:\Users\User\Desktop\parcingProject\parsed_articles\shariahcompliant-coins-release-set-to-spur-crypto-adoption-in-muslim-nations-3079945.tar.xz
Proceed article: https://www.investing.com/news/cryptocurrency-news/binance-looks-to-the-uk-for-regulation-amid-us-crypto-crackdown-3079944
binance-looks-to-the-uk-for-regulation-amid-us-crypto-crackdown-3079944
C:\Users\User\Desktop\parcingProject\parsed_articles\binance-looks-to-the-uk-for-regulation-amid-us-crypto-crackdown-3079944.tar.xz
Proceed article: https://www.investing.com/news/cryptocurrency-news/can