In [1]:
import pandas as pd
import re
import os
import bs4
import bz2
import datetime
import dateparser
import locale
import time
from poptimizer.data.adapters.html.parser import _get_html

In [2]:
async def get_html(url_source: str, html_path: str):
    downloaded = False
    if not os.path.exists(html_path + '.bz2'):
        print('download:', url_source)
        html = await _get_html(url_source)
        soup = bs4.BeautifulSoup(html, "lxml")
        downloaded = True
        # cache
        with bz2.open(html_path + '.bz2', 'wb') as f:
            f.write(soup.prettify(encoding='utf-8'))
            
    if not downloaded:
        with bz2.open(html_path + '.bz2', 'rb') as f:
            soup = bs4.BeautifulSoup(f.read().decode(encoding='utf-8'), "lxml")
    return soup

In [3]:
base_path = os.path.join('bcs','htmls')
os.makedirs(base_path, exist_ok=True)

In [4]:
url_source = 'https://bcs-express.ru/dividednyj-kalendar?year=2020'
html_path = os.path.join(base_path, '_divs.html')
soup = await get_html(url_source, html_path)

In [5]:
async def get_ticker(ticker: str):
    url_source = f'https://bcs-express.ru/kotirovki-i-grafiki/{ticker.lower()}'
    html_path = os.path.join(base_path, ticker.lower() + datetime.datetime.today().strftime('_%Y-%m-%d.html'))
    return await get_html(url_source, html_path)

In [6]:
rows = soup.find_all("select", {'class': 'js-filter-select-control', 'name':'emitent'})
data = set()
for r in rows:
    for o in r.find_all('option'):
        val = o.get('value') # might be None
        if val:
            val = [s.upper() for s in val.strip().split(',')]
            tick = val[0]
            operating_mode = val[1]
            name = o.contents[0].strip()
            data.add((tick, operating_mode, name))
ticks_df = pd.DataFrame.from_records(data=list(data), columns=['ticker', 'operating_mode', 'company_name'])
ticks_df.set_index('ticker', inplace=True)
ticks_df

Unnamed: 0_level_0,operating_mode,company_name
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
NKSH,TQBR,Нижкамшина
DSKY,TQBR,Детский Мир
MRKP,TQBR,МРСК ЦП
RSTI,TQBR,Россети
YNDX,TQBR,Яндекс
...,...,...
KRSB,TQBR,КрасэЭсб
MRKV,TQBR,МРСК Волги
SIBN,TQBR,Газпром нефть
FLOT,TQBR,Совкомфлот


In [7]:
def parse_head(used_body):
    cols = ['actual_time', 'actual_price', 
            'pct_change_tod', 'open', 'close_prev_day', 'volume', 
            'max_price_ytd', 'min_price_ytd', 'pct_change_ytd',
            'pct_change_month', 'pct_change_year']
    data = []
    
    default_loc = locale.getlocale()
    
    head = used_body.find("div", {'class': 'quote-head js-quote-head'})
    actual_time = head.find("div", {'class': 'quote-head__date js-time'}).contents[0].strip()
    
    locale.setlocale( locale.LC_ALL, ('Russian_Russia', '1251') )
    
    actual_time = dateparser.parse(actual_time, date_formats=['%H:%M  %d %B %Y'], languages=['ru'])
    data.append(actual_time)
    actual_price = head.find("div", {'class': 'quote-head__price-value js-quote-head-price js-price-close'}).contents[0].strip()
    
    locale.setlocale( locale.LC_ALL, 'ru_RU.UTF-8' )
    
    actual_price = locale.atof(actual_price)
    data.append(actual_price)
    tables = head.find_all("div", {'class': 'quote-head__table'})

    for t in tables:
        rows = t.find_all('div', {'class': re.compile('quote-head__table-row')})
        for r in rows:
            cells = r.find_all('div', {'class': re.compile('quote-head__table-cell.*')})
            for i in range(len(cells)):
                span = cells[i].find('span', {'class': re.compile('js-profit-percent')})
                if span:
                    cells[i] = span
            try:
                row_data = locale.atof(cells[1].contents[0].strip().split('\n')[0].replace('%', ''))
            except (IndexError, ValueError):
                print('Error:', cells)
                row_data = None
            data.append(row_data)
    df = pd.DataFrame(data=[data], columns=cols)
    
    locale.setlocale( locale.LC_ALL, default_loc )
    return df

In [8]:
fmt_re = re.compile(r'<strong>|</strong>|<ol>|</ol><li>|</li>|<o>|</o>|<br>|</br>|<br/>|<li>|</li>|<ul>|</ul>|<p>|</p>|\n')

def clear_format(contents):
    cont = []
    for c in contents:
        cont.extend(list(filter(None, [row.strip() for row in re.split(fmt_re, str(c))])))
    return ' | '.join(cont)

def parse_info(used_body):
    em_info = used_body.find("div", {'class': 'quote-emitent__info'})
    em_info_items = em_info.find_all("div", {'class': 'quote-emitent__data-item'})
    cols = []
    values = []
    for it in em_info_items:
        cols.append(it.find("div", {'class': 'quote-emitent__data-title'}).contents[0].strip())
        v = it.find("div", {'class': 'quote-emitent__data-value'}).contents[0].strip()
        try:
            v = locale.atof(v)
        except ValueError:
            pass
        values.append(v)

    cols.append(em_info.find("h2", {'class': 'quote-emitent__description-title'}).contents[0].strip())
    text = em_info.find("div", {'class': 'quote-emitent__description-text'})
    if text.find('div'):
        text = ' '.join([clear_format(d.contents) for d in text.find_all('div')])
    else:
        text = clear_format(text.contents)
    values.append(text)
    df = pd.DataFrame(data=[values], columns=cols)
    return df

In [9]:
def parse_divs(used_body):
    interesting = [
               re.compile('dividends-table__cell _title.*'),
               re.compile('dividends-table__cell _last-day.*'),
               re.compile('dividends-table__cell _close-date.*'),
               re.compile('dividends-table__cell _value.*'),
               re.compile('dividends-table__cell _price.*'),
               re.compile('dividends-table__cell _profit.*')
              ]
    divs = used_body.find("div", {'class': 'dividends-table js-div-table'})
    rows = divs.find_all("div", {'class': 'dividends-table__row _item'})
    data = set()
    for i, r in enumerate(rows):
        row_data = []
        for c in interesting:
            d = r.find("div", {'class': c})
            spans = d.find_all('span')
            if spans:
                d = spans[0]
            v = d.contents[0].strip('—%\n ')
            try:
                v = locale.atof(v)
            except ValueError:
                try:
                    v = datetime.datetime.strptime(v, '%d.%m.%Y')
                except ValueError:
                    pass
            row_data.append(v)
        data.add(tuple(row_data))
    divs_df = pd.DataFrame.from_records(data=list(data), columns=['company_name_period', 
                                                                  'last_day_to_buy', 'close_date',
                                                                  'value', 'price', 'profit_pct'])
    return divs_df

In [10]:
def save_to_excel(filename, dfs):
    # Given a dict of dataframes, for example:
    # dfs = {'gadgets': df_gadgets, 'widgets': df_widgets}
    writer = pd.ExcelWriter(filename, engine='xlsxwriter')
    for sheetname, df in dfs.items():  # loop through `dict` of dataframes
        df.to_excel(writer, sheet_name=sheetname)  # send df to writer
        worksheet = writer.sheets[sheetname]  # pull worksheet object
        for idx, col in enumerate(df.columns):  # loop through all columns
            series = df[col]
            max_len = series.astype(str).map(len).max() + 1
            worksheet.set_column(idx + 1, idx + 1, max_len)  # set column width
    writer.save()

In [11]:
sleep = 1

heads = []
infos = []
divs = []
for t in ticks_df.index:
    print(t)
    
    downloaded = False
    while not downloaded:
        try:
            soup = await get_ticker(t)
            downloaded = True
        except ClientPayloadError:
            print(f'Sleep for {sleep} sec')
            time.sleep(sleep)
    
    used_body = soup.find("div", {'class': 'page-grid__container-left'})
    
    head = parse_head(used_body)
    head['TICKER'] = t
    heads.append(head)
    
    
    info = parse_info(used_body)
    info['TICKER'] = t
    infos.append(info)
    
    try:
        div = parse_divs(used_body)
        div['TICKER'] = t
        divs.append(div)
    except AttributeError:
        print('Dividends not found')
        
infos = pd.concat(infos).set_index('TICKER').sort_index()
heads = pd.concat(heads).set_index('TICKER').sort_index()
divs = pd.concat(divs).sort_values(['TICKER', 'close_date', 'company_name_period']).set_index('TICKER')

NKSH
Dividends not found
DSKY
MRKP
RSTI
YNDX
Dividends not found
MOBB
Dividends not found
KBSB
KUNF
Error: [<div class="quote-head__table-cell">
               закрытие предыдущего дня
              </div>, <div class="quote-head__table-cell">
               -
              </div>]
Error: [<div class="quote-head__table-cell">
                 макс. цена
                </div>, <div class="quote-head__table-cell">
</div>]
Error: [<div class="quote-head__table-cell">
                 мин. цена
                </div>, <div class="quote-head__table-cell">
</div>]
Error: [<div class="quote-head__table-cell">
                 изменение
                </div>, <div class="quote-head__table-cell">
                 -
                </div>]
Error: [<div class="quote-head__table-cell">
                 изменение
                </div>, <div class="quote-head__table-cell">
                 -
                </div>]
Dividends not found
PMSBP
PLSM
Dividends not found
KTSB
Dividends not found
USBN
M

Dividends not found
KUBE
SLEN
Dividends not found
PIKK
SAGOP
SBERP
ENPL
NNSB
LSRG
MRKZ
KRSB
MRKV
SIBN
FLOT
Error: [<div class="quote-head__table-cell">
                 изменение
                </div>, <div class="quote-head__table-cell">
                 -
                </div>]
Error: [<div class="quote-head__table-cell">
                 изменение
                </div>, <div class="quote-head__table-cell">
                 -
                </div>]
Dividends not found
ISKJ
Dividends not found


In [12]:
save_to_excel('bcs/bcs.xlsx', {'info': infos, 'state': heads, 'dividends': divs})

In [13]:
infos

Unnamed: 0_level_0,Акция,Номинал,free-float,Полное название,Капитализация,Тип,Количество,ISIN-код,Гос. регномер,Отрасль,Описание
TICKER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ABRD,Абрау-Дюрсо,1,-,АбрауДюрсо ОАО,"19,45 млрд.",Обыкновенная,9.80002e+07,RU000A0JS5T7,-,Финансы,Завод шампанских вин «Абрау-Дюрсо» — крупнейши...
ACKO,АСКО,-,-,АСКО,-,Обыкновенная,-,RU000A0JXS91,-,Финансы,Страховая компания «ЮЖУРАЛ-АСКО» основана 15 м...
AFKS,Система,0.09,33,"АФК ""Система"" ОАО ао","226,88 млрд.",Обыкновенная,9.65e+09,RU000A0DQZE3,-,Потребительский сектор,АФК «Система» является одной из самых крупных ...
AFLT,Аэрофлот,1,41,ПАО «Аэрофлот»,"73,98 млрд.",Обыкновенная,1.22767e+09,RU0009062285,-,Транспорт,ПАО «Аэрофлот» — крупнейший авиаперевозчик Рос...
AGRO,AGRO-гдр,0,-,AGRO-гдр,.,Обыкновенная,0,US7496552057,-,Прочее,Группа Компаний «РУСАГРО» - это крупнейший вер...
...,...,...,...,...,...,...,...,...,...,...,...
YRSBP,ТНСэнЯр ап,1,-,Ярославская сбыт.комп.ОАО ап,"407,91 млн.",Обыкновенная,4.50732e+06,RU000A0D88B3,-,Электроэнергетика,ПАО «ТНС энерго Ярославль» (до 30 июня 2015 го...
ZILL,ЗИЛ,"1 000,00",-,Завод им. И.А.Лихачева ОАО ао,"6,66 млрд.",Обыкновенная,2.65996e+06,RU0009086193,-,Машиностроение,ООО ЗИЛ (Завод имени Лихачёва) — одна из старе...
ZMZN,ЗМЗ,1,-,Заволжский мотор.з-д(ОАО) ао,"1,91 млрд.",Обыкновенная,1.12734e+08,RU0009101539,-,Машиностроение,ПАО «ЗМЗ» — один из крупнейших российских прои...
ZMZNP,ЗМЗ ап,1,-,Заволжский мотор.завод (ОАО ап,481 млн.,Обыкновенная,3.75781e+07,RU0006752854,-,Машиностроение,ПАО «ЗМЗ» — один из крупнейших российских прои...


In [14]:
heads

Unnamed: 0_level_0,actual_time,actual_price,pct_change_tod,open,close_prev_day,volume,max_price_ytd,min_price_ytd,pct_change_ytd,pct_change_month,pct_change_year
TICKER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ABRD,2020-10-09 18:45:00,198.500,0.00,195.00,198.5,2.081150e+06,202.5,195,1.28,-2.93,48.13
ACKO,2020-10-09 18:37:00,4.960,0.00,4.96,4.96,5.127600e+04,5.04,4.96,-0.40,-0.8,7.36
AFKS,2020-10-09 23:49:00,23.511,-1.00,23.75,23.748,7.852769e+08,24.117,23.517,1.03,13.7,85.74
AFLT,2020-10-09 23:49:00,60.260,-3.92,60.66,62.72,6.650002e+09,64.92,61.8,-4.77,-26.49,-39.75
AGRO,2020-10-09 18:45:00,739.000,-0.67,745.00,744,4.620441e+07,761.6,730,1.67,1.23,7.91
...,...,...,...,...,...,...,...,...,...,...,...
YRSBP,2020-10-08 18:01:00,90.500,0.56,90.00,90,2.322000e+04,93,86.5,4.65,3.43,5.85
ZILL,2020-10-09 18:37:00,2505.000,-0.40,2475.00,2515,9.960000e+03,2530,2480,0.40,1.01,51.82
ZMZN,2020-10-12 00:00:00,16.900,0.00,17.20,16.9,2.402000e+03,17.3,16.9,-3.98,0,0
ZMZNP,2020-10-12 00:00:00,12.800,-7.25,12.80,13.8,1.280000e+03,13.8,13,7.81,0,0


In [15]:
divs

Unnamed: 0_level_0,company_name_period,last_day_to_buy,close_date,value,price,profit_pct
TICKER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ABRD,Абрау-Дюрсо ао нераспр. прибыль,2016-11-16 00:00:00,2016-11-19 00:00:00,0.41,88,0.47
ABRD,Абрау – Дюрсо ао 2016,2017-06-28 00:00:00,2017-07-02 00:00:00,1.29,200,0.65
ABRD,Абрау – Дюрсо ао 2017,2018-06-22 00:00:00,2018-06-26 00:00:00,2.86,147,1.95
ABRD,Абрау – Дюрсо ао 2018,2019-07-08 00:00:00,2019-07-10 00:00:00,2.86,142,2.01
ABRD,Абрау-Дюрсо ао 2019,2020-10-15 00:00:00,2020-10-19 00:00:00,1.03,198.5,0.52
...,...,...,...,...,...,...
YAKG,ЯТЭК ао 2016,NaT,NaT,0,,
YKEN,Якутскэнерго ао 2010,,2011-04-14 00:00:00,0,,0
YKEN,Якутскэнерго ао 2011,,2012-05-12 00:00:00,0,,0
YKENP,Якутскэнерго ап 2010,2011-04-14 00:00:00,2011-04-14 00:00:00,0.00939045,0.485,1.94


In [None]:
# clean cache
for fname in os.listdir(base_path):
    rm_path = os.path.join(base_path, fname)
    print('remove:', rm_path)
    os.remove(rm_path)