In [1]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install xml-python

Note: you may need to restart the kernel to use updated packages.


## Imports

In [5]:
import json
import lzma
import os
import tempfile
import zipfile
from datetime import datetime
from email.utils import parsedate_to_datetime
from typing import IO, List

import requests
from bs4 import BeautifulSoup

## Make GET request and return text from response

In [6]:
def get_text_from_url(href: str, timeout: float = 30.0) -> str:
    """Fetch ``href`` with a GET request and return the response body as text.

    The request is sent with a desktop-browser ``User-Agent`` header. The URL,
    the status code and the reason phrase are printed as a simple progress log.
    The status code is not checked: the body is returned even for error
    responses.

    Args:
        href: URL to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive
            host, which would stall a whole scraping run.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: on connection errors or when the
            timeout elapses.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
    }
    response = requests.get(href, headers=headers, timeout=timeout)
    print(href)
    print(response.status_code)
    print(response.reason)
    return response.text

## Parse RSS from coindesk.com

Takes a string of XML data,
parses it,
and returns a list of dicts with basic information about each news item.

In [7]:
def parse_coindeskcom_rss(xml_data: str) -> List:
    """Parse an RSS document and return one dict per ``<item>``.

    Args:
        xml_data: The RSS feed as an XML string.

    Returns:
        A list of dicts with the keys ``guid``, ``title``, ``pubDate``,
        ``description`` and ``link``. ``pubDate`` is a ``datetime``; the other
        values are the text of the corresponding child elements.

    Raises:
        AttributeError: if an ``<item>`` lacks one of the expected children.
        ValueError: if ``pubDate`` cannot be parsed as a date (older Python
            versions may raise ``TypeError`` instead).
    """
    soup = BeautifulSoup(xml_data, 'xml')
    return [
        {
            'guid': item.find('guid').text,
            'title': item.find('title').text,
            # RSS dates follow RFC 822 and use abbreviated month names
            # ("Jun", "Jul", ...). strptime's %B only accepts full names, so
            # it rejects them; parsedate_to_datetime accepts both forms.
            'pubDate': parsedate_to_datetime(item.find('pubDate').text),
            'description': item.find('description').text,
            'link': item.find('link').text,
        }
        for item in soup.find_all('item')
    ]

## Parse RSS from cointelegraph.com

Takes a string of XML data,
parses it,
and returns a list of dicts with basic information about each news item.

In [8]:
def parse_cointelegraphcom_rss(xml_data: str) -> List:
    """Parse an RSS document and return one dict per news ``<item>``.

    Only items whose link starts with ``https://cointelegraph.com/news`` are
    kept; every other item is skipped.

    Args:
        xml_data: The RSS feed as an XML string.

    Returns:
        A list of dicts with the keys ``guid``, ``title``, ``pubDate``,
        ``description`` and ``link``. ``guid`` is the part of the item's guid
        after the last ``/`` (the whole guid when it contains no slash) and
        ``pubDate`` is a ``datetime``.

    Raises:
        AttributeError: if an ``<item>`` lacks one of the expected children.
        ValueError: if ``pubDate`` cannot be parsed as a date (older Python
            versions may raise ``TypeError`` instead).
    """
    soup = BeautifulSoup(xml_data, 'xml')
    rows = []
    for item in soup.find_all('item'):
        link = item.find('link').text
        # Explicit prefix test; equivalent to the truthiness of str.find(),
        # which is 0 (falsy) only when the link starts with the prefix.
        if not link.startswith('https://cointelegraph.com/news'):
            continue

        guid = item.find('guid').text
        rows.append({
            'guid': guid.rsplit('/', 1)[-1],
            'title': item.find('title').text,
            # RSS dates follow RFC 822 and use abbreviated month names
            # ("Jun", "Jul", ...). strptime's %B only accepts full names, so
            # it rejects them; parsedate_to_datetime accepts both forms.
            'pubDate': parsedate_to_datetime(item.find('pubDate').text),
            'description': item.find('description').text,
            'link': link,
        })

    return rows

## class ArticleInfo

In [9]:
class ArticleInfo:
    """Plain attribute container describing one scraped article.

    The class only declares annotations: a fresh instance has no attributes
    until a parser assigns them, so ``instance.__dict__`` contains exactly the
    fields that were set.
    """

    # Where the article came from.
    href: str
    language: str

    # Extracted text.
    header: str
    content: str
    meta_keywords: List[str]

    # Raw page source the text was extracted from.
    html: str

    # Timestamps: when the article was published and when it was parsed.
    publication_dt: datetime
    parsing_dt: datetime

## Parse one article from cointelegraph.com

In [10]:
def parse_article_cointelegraphcom(href: str) -> ArticleInfo:
    """Download one cointelegraph.com article and extract its fields.

    Args:
        href: URL of the article page.

    Returns:
        An ``ArticleInfo`` with ``language``, ``header``, ``content``,
        ``publication_dt``, ``parsing_dt``, ``html`` and ``href`` set.
        ``content`` is the lead paragraph followed by every ``<p>`` of the
        content wrapper, one per line. ``language`` is hard-coded to
        ``'English'`` rather than read from the page.

    Raises:
        AttributeError: if the page does not contain the expected elements
            (``find`` returns ``None`` for a missing element).
        ValueError: if the ``datetime`` attribute of the ``<time>`` element is
            not in ``YYYY-MM-DD`` form.
    """
    html_info = get_text_from_url(href)
    page = BeautifulSoup(html_info, "html.parser")
    # Every lookup below is scoped to the article container.
    post = page.find('div', class_='post-page__item')

    ainfo = ArticleInfo()
    ainfo.language = 'English'
    ainfo.header = post.find('h1', class_='post__title').text

    lead = post.find('p', class_='post__lead').text
    body = post.find('div', class_='post__content-wrapper').find_all('p')
    # Join the lead with the same separator as the paragraphs so the lead is
    # not fused onto the first word of the body.
    ainfo.content = '\n'.join([lead, *(p.text for p in body)])

    ainfo.publication_dt = datetime.strptime(post.find('time').attrs['datetime'], "%Y-%m-%d")
    ainfo.parsing_dt = datetime.now()
    ainfo.html = html_info
    ainfo.href = href

    return ainfo

## Parse one article from coindesk.com

In [11]:
def _parse_coindesk_timestamp(raw: str) -> datetime:
    """Parse a timestamp such as ``"May 30, 2023 at 5:00 p.m. UTC"``."""
    # Dots are stripped before parsing, presumably so that "a.m."/"p.m."
    # become "am"/"pm" and match %p.
    cleaned = raw.replace('.', '')
    try:
        return datetime.strptime(cleaned, "%B %d, %Y at %I:%M %p %Z")
    except ValueError:
        # %B only matches full month names; also accept abbreviated ones
        # ("Jun", "Jul", ...).
        return datetime.strptime(cleaned, "%b %d, %Y at %I:%M %p %Z")


def parse_article_coindeskcom(href: str) -> ArticleInfo:
    """Download one coindesk.com article and extract its fields.

    Args:
        href: URL of the article page.

    Returns:
        An ``ArticleInfo`` with ``header``, ``content``, ``publication_dt``,
        ``parsing_dt``, ``language``, ``html`` and ``href`` set. ``content``
        is the sub-headline followed by the text of every matching content
        ``<div>``, one per line.

    Raises:
        AttributeError: if the page does not contain the expected elements
            (``find`` returns ``None`` for a missing element).
        ValueError: if the publication timestamp does not match the expected
            ``"<Month> <day>, <year> at <h>:<mm> <am/pm> <TZ>"`` layout.
    """
    html_info = get_text_from_url(href)
    soup = BeautifulSoup(html_info, "html.parser")
    ainfo = ArticleInfo()
    # Class attribute values of the <div>s that make up the article body.
    content_classes = [
        'common-textstyles__StyledWrapper-sc-18pd49k-0 eSbCkN',
        'headingstyles__StyledWrapper-l955mv-0 fMEozb',
        'liststyles__StyledWrapper-sc-13iatdm-0 eksenZ'
    ]

    def publication_dt_check(tag):
        """Match either a div.at-created or the styled timestamp <span>."""
        if not tag.has_attr('class'):
            return False
        classes = tag.attrs['class']
        if tag.name == 'div':
            return 'at-created' in classes
        if tag.name == 'span':
            return ('typography__StyledTypography-owin6q-0' in classes and
                    'fUOSEs' in classes)
        return False

    ainfo.header = soup.find('div', class_='at-headline').text

    subheadline = soup.find('div', class_='at-subheadline').text
    body = soup.find('div', class_='at-content-wrapper').find_all('div', class_=content_classes)
    # Join the sub-headline with the same separator as the body blocks so it
    # is not fused onto the first word of the body.
    ainfo.content = '\n'.join([subheadline, *(block.text for block in body)])

    # Opinion pieces may carry the timestamp in a <span> instead of the usual
    # div.at-created, so they use the broader matcher.
    if soup.find('div', class_='at-category').text == 'Opinion':
        created = soup.find(publication_dt_check)
    else:
        created = soup.find('div', class_='at-created')
    ainfo.publication_dt = _parse_coindesk_timestamp(created.text)

    ainfo.parsing_dt = datetime.now()
    ainfo.language = soup.find('div', class_='footer-selectstyles__StyledRootContainer-sxto8j-0 lkWIzk').text
    ainfo.html = html_info
    ainfo.href = href

    return ainfo

## save_to_disk function

Takes a file name and an `ArticleInfo` object,
writes `article.html` and `article.json` to a temporary directory,
compresses them,
stores them in an archive named `file_name`,
and removes the temporary files from disk.

In [12]:
def save_to_disk(file_name: str = 'article.xz', article: 'ArticleInfo' = None) -> None:
    """Write an article's HTML and metadata into a single archive file.

    Two members are stored in a ZIP archive at ``file_name``:

    * ``article.html`` - the LZMA-compressed page source (``article.html``).
    * ``article.json`` - the LZMA-compressed JSON dump of all attributes set
      on ``article``, with ``publication_dt`` and ``parsing_dt`` formatted as
      ``YYYY-MM-DD HH:MM:SS`` strings.

    Note that the container is a ZIP file even though the default name ends
    in ``.xz``, and each member must be LZMA-decompressed after extraction.

    Args:
        file_name: Path of the archive to create (overwritten if it exists).
        article: Object whose attributes are saved. When falsy, a message is
            printed and nothing is written.
    """
    if not article:
        print('Nothing to save')
        return

    # Copy the attribute dict: formatting the datetimes directly in
    # ``article.__dict__`` would turn the caller's datetime attributes into
    # strings and make a second save of the same object fail.
    json_obj = dict(article.__dict__)
    json_obj['publication_dt'] = json_obj['publication_dt'].strftime('%Y-%m-%d %H:%M:%S')
    json_obj['parsing_dt'] = json_obj['parsing_dt'].strftime('%Y-%m-%d %H:%M:%S')

    html_blob = lzma.compress(article.html.encode('utf-8'))
    json_blob = lzma.compress(json.dumps(json_obj, indent=4).encode('utf-8'))

    # The members are staged in a temporary directory that is removed
    # automatically once the archive has been written.
    with tempfile.TemporaryDirectory() as temp_dir:
        html_path = os.path.join(temp_dir, 'article.html')
        json_path = os.path.join(temp_dir, 'article.json')
        with open(html_path, 'wb') as f:
            f.write(html_blob)
        with open(json_path, 'wb') as f:
            f.write(json_blob)
        with zipfile.ZipFile(file_name, "w") as zpf:
            zpf.write(html_path, 'article.html')
            zpf.write(json_path, 'article.json')

## Main program

In [16]:
if __name__ == '__main__':
    # Both scraping runs below are commented out, so executing this cell does
    # nothing (only `pass` runs). Uncomment one block to fetch a feed, parse
    # every listed article and save each one to '<guid>.xz'.

    # --- coindesk.com ---
    # xml_info = get_text_from_url('https://www.coindesk.com/arc/outboundfeeds/rss/')
    # news_list = parse_coindeskcom_rss(xml_info)
    # for item in news_list:
    #     tmp_article = parse_article_coindeskcom(item['link'])
    #     save_to_disk(f'{item["guid"]}.xz', tmp_article)

    # --- cointelegraph.com ---
    # xml_info = get_text_from_url('https://cointelegraph.com/rss')
    # news_list = parse_cointelegraphcom_rss(xml_info)
    # for item in news_list:
    #     tmp_article = parse_article_cointelegraphcom(item['link'])
    #     save_to_disk(f'{item["guid"]}.xz', tmp_article)
    pass
