In [1]:
from bs4 import BeautifulSoup
import os
import pandas

In [2]:
# URL templates for anekdot.ru listing pages under /an/top/.
# The {year}, {start} and {end} placeholders are filled in by prepare_url().
# The two templates differ only by the extra comma before ".html":
# url_rating is fetched by get_rating(), url_content by get_contents().
url_rating = 'https://www.anekdot.ru/an/top/jfx{year}01-{year}12,{start},{end},.html'
url_content = 'https://www.anekdot.ru/an/top/jfx{year}01-{year}12,{start},{end}.html'

In [3]:
def get_year(i):
    """Return year number ``i`` as a string zero-padded to at least two characters.

    Args:
        i: Year number (e.g. ``3`` or ``12``).

    Returns:
        ``'03'`` for ``3``, ``'09'`` for ``9``, ``'12'`` for ``12``.
    """
    # zfill pads every single-digit value, including 9, which a hand-written
    # "i < 9" comparison leaves unpadded.
    return str(i).zfill(2)

In [4]:
def prepare_url(url, year, batch, batch_size = 100):
    """Fill the ``{year}``, ``{start}`` and ``{end}`` placeholders of a URL template.

    Args:
        url: Template string containing the placeholders.
        year: Year number; formatted through ``get_year``.
        batch: Zero-based batch index.
        batch_size: Number of items per batch.

    Returns:
        The template with every placeholder occurrence substituted.
    """
    substitutions = (
        ('{year}', get_year(year)),
        # Offset of the first item of the requested batch.
        ('{start}', str(batch_size * batch)),
        # Note: {end} receives the batch size itself, not an end offset.
        # Presumably the site reads it as a page length -- TODO confirm.
        ('{end}', str(batch_size)),
    )
    for placeholder, value in substitutions:
        url = url.replace(placeholder, value)
    return url

In [5]:
def get_rating(year, batch, batch_size = 100):
    """Download one batch of the rating page and parse its rating column.

    The page is fetched with ``curl`` into ``rate.txt`` in the current
    working directory and then parsed with BeautifulSoup.

    Args:
        year: Year number passed on to ``prepare_url``.
        batch: Zero-based batch index.
        batch_size: Number of items per batch.

    Returns:
        A list of floats, one per table row (first and last rows excluded).
        An empty list if the downloaded file cannot be decoded as UTF-8.
    """
    page_url = prepare_url(url_rating, year, batch, batch_size)
    os.system('curl ' + page_url + ' > rate.txt')
    try:
        with open('rate.txt', encoding='utf-8') as dump:
            html = dump.read()
    except UnicodeDecodeError:
        return []

    table_rows = BeautifulSoup(html, 'lxml').find_all('tr')
    ratings = []
    # The first and last <tr> are skipped -- presumably header/footer rows
    # (TODO confirm against the live page).
    for row in table_rows[1:-1]:
        cells = row.find_all('td')
        # The value is read from the sixth <td> of each row.
        ratings.append(float(cells[5].string))
    return ratings

In [7]:
def get_contents(year, batch, batch_size = 100):
    """Download one batch of the content page and extract the item texts.

    The page is fetched with ``curl`` into ``content.txt`` in the current
    working directory and then parsed with BeautifulSoup.

    Args:
        year: Year number passed on to ``prepare_url``.
        batch: Zero-based batch index.
        batch_size: Number of items per batch.

    Returns:
        A list of strings, one per ``div.topicbox`` (the first one excluded)
        that contains at least one ``div.text``. An empty list if the
        downloaded file cannot be decoded as UTF-8.
    """
    page_url = prepare_url(url_content, year, batch, batch_size)
    os.system('curl ' + page_url + ' > content.txt')
    try:
        with open('content.txt', encoding='utf-8') as dump:
            html = dump.read()
    except UnicodeDecodeError:
        return []

    boxes = BeautifulSoup(html, 'lxml').find_all('div', class_ = 'topicbox')
    texts = []
    # The first topicbox is skipped -- presumably it is not an item
    # (TODO confirm against the live page).
    for box in boxes[1:]:
        text_divs = box.find_all('div', class_ = 'text')
        # Boxes without a text div are ignored; otherwise take the first one.
        if text_divs:
            texts.append(text_divs[0].text)
    return texts

In [9]:
# Scrape every batch for year numbers 3..20 and store each batch as its own CSV.
# Make sure the output directory exists first; otherwise the first to_csv call
# fails on a fresh checkout.
os.makedirs('data', exist_ok=True)
for year in range(3, 21):
    batch = 0
    print(f'Start year {get_year(year)}')
    while True:
        contents = get_contents(year, batch)
        # An empty result is treated as "no more batches for this year".
        # Note that get_contents also returns [] when the page cannot be
        # decoded, so such a failure ends the year early as well.
        if not contents:
            print(f'Finished year {get_year(year)} on batch {batch}')
            print('------------------------------------------------')
            break
        data = pandas.DataFrame(contents, columns=['content'])
        # The file name uses the raw (unpadded) year number.
        data.to_csv(f'data/anecdots-{year}-{batch}.csv')
        print(f'Batch {batch} of year {get_year(year)} processed')
        batch += 1

Start year 03
Batch 0 of year 03 processed
Batch 1 of year 03 processed
Batch 2 of year 03 processed
Batch 3 of year 03 processed
Batch 4 of year 03 processed
Batch 5 of year 03 processed
Batch 6 of year 03 processed
Batch 7 of year 03 processed
Batch 8 of year 03 processed
Batch 9 of year 03 processed
Batch 10 of year 03 processed
Batch 11 of year 03 processed
Batch 12 of year 03 processed
Batch 13 of year 03 processed
Batch 14 of year 03 processed
Batch 15 of year 03 processed
Batch 16 of year 03 processed
Batch 17 of year 03 processed
Batch 18 of year 03 processed
Batch 19 of year 03 processed
Batch 20 of year 03 processed
Batch 21 of year 03 processed
Batch 22 of year 03 processed
Batch 23 of year 03 processed
Batch 24 of year 03 processed
Batch 25 of year 03 processed
Batch 26 of year 03 processed
Batch 27 of year 03 processed
Batch 28 of year 03 processed
Batch 29 of year 03 processed
Batch 30 of year 03 processed
Batch 31 of year 03 processed
Batch 32 of year 03 processed
Batch 

Batch 26 of year 06 processed
Batch 27 of year 06 processed
Batch 28 of year 06 processed
Batch 29 of year 06 processed
Batch 30 of year 06 processed
Batch 31 of year 06 processed
Batch 32 of year 06 processed
Batch 33 of year 06 processed
Batch 34 of year 06 processed
Batch 35 of year 06 processed
Batch 36 of year 06 processed
Batch 37 of year 06 processed
Batch 38 of year 06 processed
Batch 39 of year 06 processed
Batch 40 of year 06 processed
Batch 41 of year 06 processed
Finished year 06 on batch 42
------------------------------------------------
Start year 07
Batch 0 of year 07 processed
Batch 1 of year 07 processed
Batch 2 of year 07 processed
Batch 3 of year 07 processed
Batch 4 of year 07 processed
Batch 5 of year 07 processed
Batch 6 of year 07 processed
Batch 7 of year 07 processed
Batch 8 of year 07 processed
Batch 9 of year 07 processed
Batch 10 of year 07 processed
Batch 11 of year 07 processed
Batch 12 of year 07 processed
Batch 13 of year 07 processed
Batch 14 of year 0

Batch 108 of year 10 processed
Batch 109 of year 10 processed
Batch 110 of year 10 processed
Batch 111 of year 10 processed
Batch 112 of year 10 processed
Batch 113 of year 10 processed
Batch 114 of year 10 processed
Batch 115 of year 10 processed
Batch 116 of year 10 processed
Batch 117 of year 10 processed
Batch 118 of year 10 processed
Batch 119 of year 10 processed
Batch 120 of year 10 processed
Batch 121 of year 10 processed
Batch 122 of year 10 processed
Batch 123 of year 10 processed
Batch 124 of year 10 processed
Batch 125 of year 10 processed
Batch 126 of year 10 processed
Batch 127 of year 10 processed
Batch 128 of year 10 processed
Batch 129 of year 10 processed
Batch 130 of year 10 processed
Batch 131 of year 10 processed
Batch 132 of year 10 processed
Batch 133 of year 10 processed
Batch 134 of year 10 processed
Batch 135 of year 10 processed
Batch 136 of year 10 processed
Batch 137 of year 10 processed
Batch 138 of year 10 processed
Batch 139 of year 10 processed
Batch 14

Batch 144 of year 13 processed
Batch 145 of year 13 processed
Batch 146 of year 13 processed
Batch 147 of year 13 processed
Batch 148 of year 13 processed
Batch 149 of year 13 processed
Batch 150 of year 13 processed
Batch 151 of year 13 processed
Batch 152 of year 13 processed
Batch 153 of year 13 processed
Batch 154 of year 13 processed
Batch 155 of year 13 processed
Batch 156 of year 13 processed
Batch 157 of year 13 processed
Batch 158 of year 13 processed
Batch 159 of year 13 processed
Batch 160 of year 13 processed
Batch 161 of year 13 processed
Batch 162 of year 13 processed
Batch 163 of year 13 processed
Batch 164 of year 13 processed
Batch 165 of year 13 processed
Batch 166 of year 13 processed
Batch 167 of year 13 processed
Batch 168 of year 13 processed
Batch 169 of year 13 processed
Batch 170 of year 13 processed
Batch 171 of year 13 processed
Batch 172 of year 13 processed
Batch 173 of year 13 processed
Batch 174 of year 13 processed
Batch 175 of year 13 processed
Batch 17

Batch 3 of year 18 processed
Batch 4 of year 18 processed
Batch 5 of year 18 processed
Batch 6 of year 18 processed
Batch 7 of year 18 processed
Batch 8 of year 18 processed
Batch 9 of year 18 processed
Batch 10 of year 18 processed
Batch 11 of year 18 processed
Batch 12 of year 18 processed
Batch 13 of year 18 processed
Batch 14 of year 18 processed
Batch 15 of year 18 processed
Batch 16 of year 18 processed
Batch 17 of year 18 processed
Batch 18 of year 18 processed
Batch 19 of year 18 processed
Batch 20 of year 18 processed
Batch 21 of year 18 processed
Batch 22 of year 18 processed
Batch 23 of year 18 processed
Batch 24 of year 18 processed
Batch 25 of year 18 processed
Batch 26 of year 18 processed
Batch 27 of year 18 processed
Batch 28 of year 18 processed
Batch 29 of year 18 processed
Batch 30 of year 18 processed
Batch 31 of year 18 processed
Batch 32 of year 18 processed
Batch 33 of year 18 processed
Batch 34 of year 18 processed
Batch 35 of year 18 processed
Batch 36 of year 

In [8]:
# Fetch batch 0 of year 12 from both the rating view and the content view.
rates = get_rating(12, 0)
contents = get_contents(12, 0)

In [12]:
# Number of ratings parsed by get_rating.
# NOTE(review): the recorded output is 0, i.e. no ratings were parsed for this
# batch -- verify url_rating and the table layout get_rating expects.
len(rates)

0