In [1]:
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm

from telethon.sync import TelegramClient

import requests
from bs4 import BeautifulSoup

In [2]:
async def scrap_telegram(name_channel, n_documents,
                         api_id=1582972,
                         api_hash='041a2f13e654abbc45a55945668dbcc9'):
    """Collect text messages from a Telegram channel into a DataFrame.

    Opens a ``TelegramClient`` with the session name ``'session'``, resolves
    ``t.me/<name_channel>`` and iterates over at most ``n_documents``
    messages. Messages whose ``message`` attribute is falsy (``None`` or an
    empty string) are dropped.

    Parameters
    ----------
    name_channel : str
        Channel name, i.e. the part that follows ``t.me/``.
    n_documents : int
        Passed to ``iter_messages`` as ``limit``.
    api_id, api_hash
        Credentials forwarded to ``TelegramClient``.

    Returns
    -------
    pandas.DataFrame
        Columns ``channel``, ``id``, ``date``, ``document``; one row per
        kept message.
    """
    # NOTE(review): the default api_id / api_hash are real-looking credentials
    # committed in the source. They are kept only so existing calls keep
    # working; consider rotating them and passing credentials explicitly.
    rows = []

    async with TelegramClient(session='session', api_id=api_id, api_hash=api_hash) as client:
        channel = await client.get_input_entity(f't.me/{name_channel}')

        async for post in client.iter_messages(channel, limit=n_documents):
            text = post.message
            if not text:
                # Nothing to store for messages without text.
                continue
            rows.append((name_channel, post.id, post.date, text))

    return pd.DataFrame(rows, columns=['channel', 'id', 'date', 'document'])


def scrap_medicine(url, page_ids, timeout=30):
    """Scrape news pages that share one URL template into a DataFrame.

    For every id in ``page_ids`` the page at ``url.format(page_id)`` is
    downloaded and parsed. A page is kept only if it contains exactly one
    ``div.news__details__date`` and exactly one ``div.news__details__content``
    and the content text is non-empty. Pages that fail to download or parse
    are skipped (best effort), so the result may have fewer rows than
    ``page_ids``.

    Parameters
    ----------
    url : str
        Format string with one ``{}`` placeholder for the page id.
    page_ids : iterable
        Ids substituted into ``url``.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so that one stalled connection cannot hang
        the whole run. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        Columns ``url``, ``date``, ``document``.
    """
    documents = []

    for page_id in tqdm(page_ids):
        page_url = url.format(page_id)
        try:
            page = requests.get(page_url, timeout=timeout)
            soup = BeautifulSoup(page.content, 'html.parser')

            date = soup.find_all('div', class_='news__details__date')
            content = soup.find_all('div', class_='news__details__content')

            # Explicit check instead of `assert`, which disappears under
            # `python -O` and would let ambiguous pages through.
            if len(date) != 1 or len(content) != 1:
                continue

            date = date[0].get_text()
            content = content[0].get_text()
        except Exception:
            # Best-effort scraping: skip pages that cannot be fetched or
            # parsed. Catching `Exception` (not a bare `except`) keeps
            # KeyboardInterrupt / SystemExit working during long runs.
            continue

        if content:
            documents.append((page_url, date, content))

    return pd.DataFrame(documents, columns=['url', 'date', 'document'])


def scrap_tutby(url, page_ids, timeout=30):
    """Scrape articles that share one URL template into a DataFrame.

    For every id in ``page_ids`` the page at ``url.format(page_id)`` is
    downloaded and parsed. A page is kept only if each of the following is
    found exactly once and the article text is non-empty:

    * ``a[itemprop=articleSection]``  -> ``label`` (text)
    * ``h1[itemprop=headline]``       -> ``header`` (text)
    * ``time[itemprop=datePublished]``-> ``date`` (its ``datetime`` attribute)
    * ``div#article_body``            -> ``document`` (text)
    * ``li.tag-taxonomy-topic``       -> ``tags`` (texts of its direct ``a``
      children)

    Pages that fail to download or parse are skipped (best effort), so the
    result may have fewer rows than ``page_ids``.

    Parameters
    ----------
    url : str
        Format string with one ``{}`` placeholder for the page id.
    page_ids : iterable
        Ids substituted into ``url``.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so that one stalled connection cannot hang
        the whole run. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        Columns ``url``, ``label``, ``header``, ``date``, ``document``,
        ``tags`` (``tags`` holds a list of strings per row).
    """
    documents = []

    for page_id in tqdm(page_ids):
        page_url = url.format(page_id)
        try:
            page = requests.get(page_url, timeout=timeout)
            soup = BeautifulSoup(page.content, 'html.parser')

            label = soup.find_all('a', attrs={'itemprop': 'articleSection'})
            header = soup.find_all('h1', attrs={'itemprop': 'headline'})
            date = soup.find_all('time', attrs={'itemprop': 'datePublished'})
            content = soup.find_all('div', id='article_body')
            tags = soup.find_all('li', class_='tag-taxonomy-topic')

            # Explicit check instead of `assert`, which disappears under
            # `python -O` and would let ambiguous pages through.
            if any(len(found) != 1 for found in (label, header, date, content, tags)):
                continue

            label = label[0].get_text()
            header = header[0].get_text()
            date = date[0]['datetime']
            content = content[0].get_text()

            # Only direct <a> children of the tag container are tag links.
            tags = [tag.get_text() for tag in tags[0].find_all('a', recursive=False)]
        except Exception:
            # Best-effort scraping: skip pages that cannot be fetched or
            # parsed. Catching `Exception` (not a bare `except`) keeps
            # KeyboardInterrupt / SystemExit working during long runs.
            continue

        if content:
            documents.append((page_url, label, header, date, content, tags))

    return pd.DataFrame(documents, columns=['url', 'label', 'header', 'date', 'document', 'tags'])

In [5]:
# Root directory for all scraped corpora (relative to the notebook).
path_corpora = Path('../data/corpora')

# One CSV file per web source.
path_corpus_medicine = path_corpora / 'medicine.csv'
path_corpus_tut = path_corpora / 'tutby.csv'

# Telegram corpora live under their own sub-path.
path_corpus_telegram = path_corpora / 'telegram'
name_channels = [
    'minzdravbelarus',
    'cpbelarus',
    'vandroukiby',
    'sputnikby',
    'pressmvd',
    'itbeard',
    'skgovby',
    'naviny_by',
]

In [6]:
%%time

# URL template and the range of numeric article ids substituted into it
# (start inclusive, end exclusive).
url = 'https://news.tut.by/{}.html'
page_ids = range(700306, 702020)

# Create the output directory up front: `to_csv` raises if it is missing,
# and failing only after the whole scrape has finished would waste the run.
path_corpus_tut.parent.mkdir(parents=True, exist_ok=True)

corpus = scrap_tutby(url, page_ids)

corpus.to_csv(path_corpus_tut, index=False)


print(f'corpus.shape: {corpus.shape}')
display(corpus.head(3))

corpus.shape: (1431, 6)


Unnamed: 0,url,label,header,date,document,tags
0,https://news.tut.by/700306.html,Общество,"Хороводы на перекрестке, водомет и задержания....",2020-09-13T21:33:00+03:00,\n13 сентября в Бресте несколько тысяч человек...,"[акции протеста, силовые ведомства]"
1,https://news.tut.by/700307.html,Новость дня,«Покраснело ухо и было разбито колено». В Жоди...,2020-09-13T20:39:00+03:00,"Читатели прислали в редакцию несколько видео, ...","[акции протеста, милиция, силовые ведомства, ГАИ]"
2,https://news.tut.by/700308.html,Общество,"Многометровый бчб-флаг, «космонавты» у исполко...",2020-09-13T23:21:00+03:00,\nСегодня воскресенье — и гомельчане снова выш...,"[Видео TUT.BY, акции протеста, местная власть,..."
