## Install packages

In [1]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install lxml




In [4]:
pip install xml-python

Note: you may need to restart the kernel to use updated packages.


## Imports

In [5]:
import os
import requests
import json
import lzma
import zipfile
import tempfile
import re
from datetime import datetime
from typing import List, IO
from bs4 import BeautifulSoup

## ArticleInfo class

In [6]:
class ArticleInfo:
    """Plain container for the data scraped from a single article page.

    Only annotations are declared here; no attribute has a default value, so an
    attribute exists on an instance only after something assigns it.
    `save_to_disk` serialises the instance `__dict__`, i.e. exactly the
    attributes that were assigned.
    """
    # Headline text of the article.
    header: str
    # Sub-headline followed by the article body text.
    content: str
    # Publication timestamp parsed from the page.
    publication_dt: datetime
    # Moment the page was parsed (datetime.now() at parse time).
    parsing_dt: datetime
    # Raw HTML of the downloaded page.
    html: str
    # URL the article was downloaded from.
    href: str
    # NOTE(review): declared but not assigned by parse_article_coindeskcom in
    # this notebook - confirm whether it should be populated.
    meta_keywords: List[str]
    # Text of the language selector element in the page footer.
    language: str

## Make GET request and return text from response

In [7]:
def get_article_info(href: str, timeout: float = 30) -> str:
    """Download ``href`` with a browser-like User-Agent and return the body text.

    Despite its name this is a generic GET helper: it is used for article
    pages, tag listing pages and the RSS feed alike.

    Args:
        href: URL to request.
        timeout: Seconds `requests` waits for the server before giving up.
            Without a timeout a stalled connection would block the crawl
            indefinitely.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: if the status code is not 200
            (the failure is appended to ``pars_log.txt`` first), or if
            `requests` itself fails (connection error, timeout, ...).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
    }
    r = requests.get(href, headers=headers, cookies={}, timeout=timeout)
    if r.status_code != 200:
        with open('pars_log.txt', 'a') as f:
            # Trailing newline keeps consecutive log entries from running together.
            f.write(f"""Error getting info from URL.
                    URL:{href}
                    Status:{r.status_code}
                    Reason:{r.reason}
                    Time:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n""")
        # Give the exception a message so that callers printing it see what failed.
        raise requests.exceptions.RequestException(
            f'GET {href} returned {r.status_code} {r.reason}', response=r)
    return r.text

## Parse page with short news by tag and page number

In [8]:
def parse_coindeskcom_news_page(news_tag: str, num_page: int) -> List:
    """Parse one listing page of short news cards for a coindesk.com tag.

    Args:
        news_tag: Tag slug used in the URL (``/tag/<news_tag>/<num_page>``).
        num_page: Page number used in the URL.

    Returns:
        A list of dicts with the keys ``category``, ``title``, ``link``,
        ``description``, ``author`` and ``pub_datetime`` (a naive datetime),
        in page order. The list is empty when the page could not be
        downloaded or contains no cards, so the result always honours the
        ``List`` annotation.
    """
    page_url = f'https://www.coindesk.com/tag/{news_tag}/{num_page}'
    try:
        page_html = get_article_info(page_url)
    except requests.exceptions.RequestException as err:
        print(err)
        return []
    soup = BeautifulSoup(page_html, "html.parser")
    news_list = []
    for item in soup.find_all('div', class_='articleTextSection'):
        try:
            title = item.find('a', class_='card-title')
            pub_date = item.find('div', class_='timing-data').find(
                'span', class_='typography__StyledTypography-owin6q-0 fUOSEs').text
            tmp_news = {
                'category': item.find('a', class_='category').text,
                'title': title.text,
                'link': 'https://coindesk.com' + title.attrs['href'],
                'description': item.find('span', class_='content-text').text,
                'author': item.find('a', class_='ac-author').text,
                # Dots are dropped before parsing (e.g. "p.m." -> "pm") so %p can match.
                'pub_datetime': datetime.strptime(pub_date.replace('.', ''), '%b %d, %Y at %I:%M %p %Z')
            }
        except (AttributeError, KeyError, ValueError) as err:
            # A card with a missing element or an unexpected date format is
            # logged and skipped instead of aborting the whole page.
            with open('pars_log.txt', 'a') as f:
                f.write(f"""coindesk.com news page parsing error.
                        URL: {page_url}
                        Error: {err}
                        Time:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n""")
            continue
        news_list.append(tmp_news)

    return news_list

## Get news list from RSS

In [16]:
def get_coindeskcom_articles_from_rss(from_dt: datetime = None, to_dt: datetime = None) -> List:
    """Read the coindesk.com RSS feed and return the items inside a time window.

    Args:
        from_dt: Upper (newest) bound of the window, inclusive. Defaults to
            the time of the call. The default is resolved inside the function
            because a ``datetime.now()`` default would be frozen at the moment
            the function is defined.
        to_dt: Lower (oldest) bound of the window, inclusive. ``None`` means
            no lower bound.

    Returns:
        A list of dicts with the keys ``category``, ``guid``, ``title``,
        ``pub_datetime``, ``description`` and ``link``. ``pub_datetime`` is a
        naive datetime: the UTC offset parsed from ``pubDate`` is discarded,
        not converted.
        NOTE(review): the bounds are therefore compared against the feed's
        wall-clock time - confirm the caller passes datetimes in that zone.

    Raises:
        requests.exceptions.RequestException: propagated from
            `get_article_info` when the feed cannot be downloaded.
    """
    if from_dt is None:
        from_dt = datetime.now()
    xml_data = get_article_info('https://www.coindesk.com/arc/outboundfeeds/rss/')
    if xml_data == '':
        with open('pars_log.txt', 'a') as f:
            f.write(f"""Error getting RSS.
                        URL: https://www.coindesk.com/arc/outboundfeeds/rss/
                        Reason: Empty result
                        Time:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n""")
        return []
    soup = BeautifulSoup(xml_data, 'xml')
    dl = []
    for item in soup.find_all('item'):
        try:
            published = datetime.strptime(item.find('pubDate').text, "%a, %d %b %Y %H:%M:%S %z")
            row = {
                'category': item.find('category').text,
                'guid': item.find('guid').text,
                'title': item.find('title').text,
                # Drop tzinfo so the value can be compared with naive bounds.
                'pub_datetime': published.replace(tzinfo=None),
                'description': item.find('description').text,
                'link': item.find('link').text
            }
            newer_than_lower_bound = to_dt is None or to_dt <= row['pub_datetime']
            if newer_than_lower_bound and row['pub_datetime'] <= from_dt:
                dl.append(row)
        except Exception as err:
            # One broken item must not discard the rest of the feed.
            print(err)
            with open('pars_log.txt', 'a') as f:
                f.write(f"""coindesk.com RSS parsing error.
                        Item: {item.__dict__}
                        Error: {err}
                        Time:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n""")
            continue

    return dl

## Get all news in time window

In [19]:
def get_all_articles_coindeskcom(from_dt: datetime, to_dt: datetime) -> List[dict]:
    """Collect the short-news entries of all Web3 tags inside a time window.

    For every tag the listing pages are requested one after another
    (1, 2, 3, ...) through `parse_coindeskcom_news_page`.

    Args:
        from_dt: Upper (newest) bound of the window, inclusive. A falsy value
            means "now".
        to_dt: Lower (oldest) bound of the window, inclusive. A falsy value
            means 1970-01-01, i.e. no effective lower bound.

    Returns:
        The entries whose ``pub_datetime`` lies in ``[to_dt, from_dt]``, tag by
        tag in page order. An article that carries several of the tags appears
        once per tag.
    """
    web3_tags = [
        'yuga-labs',
        'nfts',
        'metaverse',
        'dao',
        'gaming'
    ]
    articles_list = []
    if not from_dt:
        from_dt = datetime.now()
    if not to_dt:
        to_dt = datetime(1970, 1, 1, 0, 0, 0)
    for w3_tag in web3_tags:
        print(f'Get {w3_tag} news')
        page_num = 0
        while True:
            page_num += 1
            news_page_articles = parse_coindeskcom_news_page(w3_tag, page_num)
            # None or an empty list: download failed or no more pages for this tag.
            if not news_page_articles:
                break
            articles_list.extend(
                article for article in news_page_articles
                if to_dt <= article['pub_datetime'] <= from_dt
            )
            # Stop only once a page lies entirely before the window. A page that
            # is entirely *after* it must not end the crawl, because older pages
            # may still reach into the window.
            # Assumes listing pages go from newest to oldest - TODO confirm.
            if all(article['pub_datetime'] < to_dt for article in news_page_articles):
                break

    return articles_list

## Parse one article and return ArticleInfo object

In [11]:
def parse_article_coindeskcom(href: str) -> ArticleInfo:
    """Download one coindesk.com article and extract its fields.

    Args:
        href: URL of the article page.

    Returns:
        An `ArticleInfo` with ``header``, ``content``, ``publication_dt``,
        ``parsing_dt``, ``language``, ``html`` and ``href`` set, or ``None``
        when the page cannot be downloaded, is empty, or does not have the
        expected markup. Parsing failures are appended to ``pars_log.txt``.
        ``meta_keywords`` is not populated.
    """
    try:
        html_info = get_article_info(href)
    except requests.exceptions.RequestException as err:
        # A single unreachable article must not abort the caller's loop.
        print(err)
        return None
    if html_info == '':
        with open('pars_log.txt', 'a') as f:
            f.write(f"""coindesk.com article parsing error.
                    URL: {href}
                    Reason: Empty result
                    Time:{datetime.now().strftime('%Y-%m-%d %H:%M:%S %Z')}\n""")
        return None
    soup = BeautifulSoup(html_info, "html.parser")
    ainfo = ArticleInfo()
    # CSS class strings of the blocks that make up the article body.
    content_classes = [
        'common-textstyles__StyledWrapper-sc-18pd49k-0 eSbCkN',
        'headingstyles__StyledWrapper-l955mv-0 fMEozb',
        'liststyles__StyledWrapper-sc-13iatdm-0 eksenZ'
    ]

    def publication_dt_check(tag):
        """Match either of the two elements that may hold the publication date."""
        return (tag.name == 'div' and
                tag.has_attr('class') and
                'at-created' in tag.attrs['class']) or \
               (tag.name == 'span' and
                tag.has_attr('class') and
                'typography__StyledTypography-owin6q-0' in tag.attrs['class'] and
                'fUOSEs' in tag.attrs['class'])
    try:
        ainfo.header = soup.find('div', class_='at-headline').text
        ainfo.content = soup.find('div', class_='at-subheadline').text
        ainfo.content += '\n'.join(i.text for i in soup.find('div', class_='at-content-wrapper').find_all('div', class_=content_classes))
        # Opinion pages are searched with the broader matcher above; all other
        # pages use the 'at-created' div directly.
        if soup.find('div', class_='at-category').text == 'Opinion':
            date_tag = soup.find(publication_dt_check)
        else:
            date_tag = soup.find('div', class_='at-created')
        # Dots are dropped before parsing (e.g. "p.m." -> "pm") so %p can match.
        ainfo.publication_dt = datetime.strptime(date_tag.text.replace('.', ''),
                                                 "%b %d, %Y at %I:%M %p %Z")
        ainfo.parsing_dt = datetime.now()
        ainfo.language = soup.find('div', class_='footer-selectstyles__StyledRootContainer-sxto8j-0 lkWIzk').text
        ainfo.html = html_info
        ainfo.href = href
    except Exception as err:
        with open('pars_log.txt', 'a') as f:
            # The entry names coindesk.com, the site this parser handles.
            f.write(f"""coindesk.com article parsing error.
                    URL: {href}
                    Error: {err}
                    Time:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n""")
        return None

    return ainfo

## Save archive with html and json to disk

In [21]:
def save_to_disk(file_name: str = 'article.xz', article: "ArticleInfo" = None) -> None:
    """Write an article to a ZIP archive holding its HTML and a JSON dump.

    The archive has two members, ``article.html`` (the raw page) and
    ``article.json`` (all attributes set on ``article``, with both datetimes
    formatted as ``YYYY-MM-DD HH:MM:SS``). Each member is LZMA-compressed
    before it is stored, which is what `decompress_archive` expects.

    Args:
        file_name: Path of the archive to create (overwritten if it exists).
        article: Object to save. If falsy, the problem is appended to
            ``pars_log.txt`` and nothing is written.

    The annotation is quoted so that defining this function does not depend on
    the `ArticleInfo` cell having been executed first.
    """
    print(f'Try to save {file_name} file')
    if not article:
        with open('pars_log.txt', 'a') as f:
            f.write(f"""Article save error.
                    file_name: {file_name}
                    Reason: Empty article
                    Time:{datetime.now().strftime('%Y-%m-%d %H:%M:%S %Z')}\n""")
        return
    html_blob = lzma.compress(article.html.encode('utf-8'))
    # Work on a copy: formatting the datetimes in article.__dict__ itself would
    # turn the article's own attributes into strings and break a second save.
    json_obj = dict(vars(article))
    for dt_key in ('publication_dt', 'parsing_dt'):
        json_obj[dt_key] = json_obj[dt_key].strftime('%Y-%m-%d %H:%M:%S')
    json_blob = lzma.compress(json.dumps(json_obj, indent=4).encode('utf-8'))
    # Both payloads are complete before the archive is opened, so a
    # serialisation error cannot leave a half-written file behind.
    with zipfile.ZipFile(file_name, "w") as zpf:
        zpf.writestr('article.html', html_blob)
        zpf.writestr('article.json', json_blob)


## Get last parsing datetime from file

In [13]:
def get_last_pars_dt() -> datetime:
    """Return the timestamp stored in ``last_parsing.txt``.

    The file is expected to hold one ``YYYY-MM-DD HH:MM:SS`` value; surrounding
    whitespace (such as a trailing newline) is ignored.

    Returns:
        The stored datetime, or 1970-01-01 00:00:00 when the file is missing
        or its content cannot be parsed. In both fallback cases the reason is
        appended to ``pars_log.txt``.
    """
    try:
        with open('last_parsing.txt', 'r') as dtf:
            return datetime.strptime(dtf.read().strip(), '%Y-%m-%d %H:%M:%S')
    except FileNotFoundError:
        reason = 'No datetime file'
    except ValueError:
        # Empty or hand-edited file: fall back like a missing file instead of crashing.
        reason = 'Malformed datetime file'
    with open('pars_log.txt', 'a') as f:
        f.write(f"""Time get error.
                    Reason: {reason}
                    Time:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n""")
    return datetime(1970, 1, 1, 0, 0, 0)


## Decompress archive

In [22]:
def decompress_archive(file_name, out_dir='.'):
    """Unpack an archive written by `save_to_disk`.

    Reads the members ``article.html`` and ``article.json`` from the ZIP file,
    LZMA-decompresses each and writes the results under the same names.

    Args:
        file_name: Path of the archive to read.
        out_dir: Directory that receives the two files (created if needed).
            Defaults to the current working directory.

    Raises:
        KeyError: if a member is missing from the archive.
        lzma.LZMAError: if a member is not valid LZMA data.
    """
    # Decode both members before touching the output files, so a broken archive
    # cannot truncate files left over from an earlier extraction.
    with zipfile.ZipFile(file_name, "r") as fp:
        html_bytes = lzma.decompress(fp.read('article.html'))
        json_bytes = lzma.decompress(fp.read('article.json'))
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, 'article.html'), 'wb') as html_file:
        html_file.write(html_bytes)
    with open(os.path.join(out_dir, 'article.json'), 'wb') as json_file:
        json_file.write(json_bytes)

## Main

In [23]:
if __name__ == '__main__':
    # Crawl the tag pages for a fixed window, download every article found and
    # store each one as its own archive in the working directory.

    # Taken before the crawl starts: if this timestamp is later used as the
    # lower bound of the next run, articles published while this run was still
    # busy are not skipped.
    run_started_at = datetime.now()

    news_list = get_all_articles_coindeskcom(datetime(2023, 5, 31, 23, 59), datetime(2023, 5, 24, 0, 0))
    # Alternative, incremental mode based on the RSS feed:
#     last_pars_time = get_last_pars_dt()
#     news_list = get_coindeskcom_articles_from_rss(from_dt=datetime.now(), to_dt=last_pars_time)
    for item in news_list:
        try:
            tmp_article = parse_article_coindeskcom(item['link'])
        except requests.exceptions.RequestException as err:
            # One unreachable article must not end the whole run.
            print(err)
            continue
        if not tmp_article:
            continue
        try:
            # File name = category + title reduced to ASCII letters and digits.
            # The range is A-Z: "A-z" would also let [ \ ] ^ _ ` through.
            archive_name = re.sub(r'[^a-zA-Z0-9]', '', f'{item["category"]}{item["title"]}') + '.xz'
            save_to_disk(archive_name, tmp_article)
        except Exception as err:
            with open('pars_log.txt', 'a') as f:
                f.write(f"""Article save error.
                        Item: {item}
                        Error: {err}
                        Time:{datetime.now().strftime('%Y-%m-%d %H:%M:%S %Z')}\n""")

    with open('last_parsing.txt', 'w') as f:
        f.write(run_started_at.strftime('%Y-%m-%d %H:%M:%S'))
    print('Done!')


Get yuga-labs news
Get nfts news
Get metaverse news
Get dao news
Get gaming news
Try to save Web3PixelPenguinsanNFTCharityScamShowsDangersOfNFTInfluencerCulture.xz file
Try to save Web3NFTArtistFewociousRevealsUpcomingCollectionFewos.xz file
Try to save Web3SothebysSecond3ACNFTAuctionIncludesLandmarkDmitriCherniakWork.xz file
Try to save Web3FindSatoshiLabsRollsOutAIToolThatTurnsSelfiesIntoNFTs.xz file
Try to save Web3BRC721ETokenStandardConvertsEthereumNFTstoBitcoinNFTs.xz file
Try to save Web3MercedesBenzWeb3ArmToReleaseNFTCollectionWithDigitalArtCommunityFingerprintsDAO.xz file
Try to save Web3NikeTripsUpSWOOSHLaunchWhileBitcoinNFTsSoar.xz file
Try to save Web3NikeOF1NFTSaleSurpasses1MDespiteDelaysTechIssues.xz file
Try to save Web3PudgyPenguinsNFTProjectOnceEndangeredProvesWeb3TurnaroundIsPossible.xz file
Try to save Web3BinanceLaunchingNFTLoanFeature.xz file
Try to save Web3BitcoinBasedSpacePepesLedWeeklyTradingVolumesAmongNFTCollections.xz file
Try to save Web3Web3FriendlyBrowser