In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Turn on matplotlib's interactive mode for the rest of the session.
plt.ion()

In [34]:
import os
import requests
import contextlib
import functools

In [73]:
@contextlib.contextmanager
def _cleanup_on_failed_enter(manager, cleanup):
    """Enter ``manager``; if entering raises, call ``cleanup()`` and re-raise.

    Exceptions raised later, inside the caller's ``with`` body, are passed
    to ``manager`` as usual and do NOT trigger ``cleanup``.
    """
    with contextlib.ExitStack() as stack:
        try:
            resource = stack.enter_context(manager)
        except BaseException:
            cleanup()
            raise
        yield resource


def file_or_nothing(download_fn):
    """Decorator: leave either a complete cache file in ``data/`` or none.

    The wrapped callable must take the cache file name as its first
    positional argument.  If ``data/<name>`` did not exist before the call
    and the call fails, any file left at that path is removed, and the
    original exception is re-raised.

    ``download_fn`` may return a context manager (for example when it is
    decorated with ``contextlib.contextmanager``).  Such a function does its
    work on ``__enter__`` rather than when it is called, so the enter step
    is guarded as well.  Any other return value is passed through unchanged.

    A file that existed before the call is never deleted.
    """
    @functools.wraps(download_fn)
    def wrapped(name, *args, **kwargs):
        cache_path = os.path.join('data', name)
        cached = os.path.exists(cache_path)

        def cleanup():
            # Only remove what this call created; keep pre-existing caches.
            if not cached and os.path.exists(cache_path):
                os.remove(cache_path)

        try:
            result = download_fn(name, *args, **kwargs)
        except BaseException:
            # BaseException so an interrupted download (Ctrl-C / kernel
            # interrupt) is cleaned up too.
            cleanup()
            raise

        if hasattr(result, '__enter__') and hasattr(result, '__exit__'):
            return _cleanup_on_failed_enter(result, cleanup)
        return result
    return wrapped


@file_or_nothing
@contextlib.contextmanager
def remote_binary(name, url=None):
    """Yield a binary file object for ``data/<name>``, downloading it first if needed.

    Parameters:
        name: file name of the cached copy inside the ``data`` directory.
        url:  where to fetch the file from when no cached copy exists.
              May be omitted only if the file is already cached.

    Yields:
        The cached file opened with ``mode='rb'``; it is closed when the
        ``with`` block exits.

    Raises:
        AssertionError: no cached copy exists and ``url`` is ``None``.
        requests.HTTPError: the server answered with an error status.
    """
    os.makedirs('data', exist_ok=True)
    cache_path = os.path.join('data', name)
    cached = os.path.exists(cache_path)
    assert cached or url is not None, 'no cached copy and no url given'
    if not cached:
        # Download into a side file and promote it only once complete, so a
        # failed or interrupted transfer is never mistaken for a valid cache.
        partial_path = cache_path + '.part'
        try:
            with requests.get(url, stream=True) as r:
                # Do not cache an HTTP error page as if it were the data.
                r.raise_for_status()
                with open(partial_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=4096):
                        f.write(chunk)
            os.replace(partial_path, cache_path)
        except BaseException:
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
    with open(cache_path, mode='rb') as f:
        yield f
        
        
@file_or_nothing
@contextlib.contextmanager
def remote_text(name, enc='utf8', url=None):
    """Yield a text file object for ``data/<name>``, downloading it first if needed.

    The remote payload is decoded with ``enc`` and re-encoded as UTF-8 on
    disk, so the cached copy does not depend on the platform's default
    encoding.  Every downloaded line is written with a trailing ``'\\n'``.

    Parameters:
        name: file name of the cached copy inside the ``data`` directory.
        enc:  encoding of the remote resource (not of the cache file).
        url:  where to fetch the file from when no cached copy exists.
              May be omitted only if the file is already cached.

    Yields:
        The cached file opened for reading as UTF-8 text; it is closed when
        the ``with`` block exits.

    Raises:
        AssertionError: no cached copy exists and ``url`` is ``None``.
        requests.HTTPError: the server answered with an error status.

    NOTE(review): cache files written earlier under a non-UTF-8 default
    encoding will not decode correctly; delete them to re-download.
    """
    os.makedirs('data', exist_ok=True)
    cache_path = os.path.join('data', name)
    cached = os.path.exists(cache_path)
    assert cached or url is not None, 'no cached copy and no url given'
    if not cached:
        # Download into a side file and promote it only once complete, so a
        # failed or interrupted transfer is never mistaken for a valid cache.
        partial_path = cache_path + '.part'
        try:
            with requests.get(url, stream=True) as r:
                # Do not cache an HTTP error page as if it were the data.
                r.raise_for_status()
                r.encoding = enc
                with open(partial_path, 'w', encoding='utf-8') as f:
                    for chunk in r.iter_lines(chunk_size=10000, decode_unicode=True):
                        f.write(chunk)
                        f.write('\n')
            os.replace(partial_path, cache_path)
        except BaseException:
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
    with open(cache_path, mode='r', encoding='utf-8') as f:
        yield f

In [74]:
# Dataset landing page: http://obrnadzor.gov.ru/ru/opendata/7701537808-PODVED/
# Both exports are decoded as cp1251. The first is parsed with the pandas
# default separator (','), the second with ';'.
with remote_text(
    'podved_1',
    url='http://obrnadzor.gov.ru/common/upload/opendata/7701537808-PODVED/data-20150325-structure-20150325.csv',
    enc='cp1251',
) as f:
    podved_1 = pd.read_csv(f)

with remote_text(
    'podved_2',
    url='http://obrnadzor.gov.ru/common/upload/opendata/7701537808-PODVED/data-20141208-structure-20141208.csv',
    enc='cp1251',
) as f:
    podved_2 = pd.read_csv(f, sep=';')
    

In [89]:
# Show only the organisation name and website columns.
podved_1.loc[:, ['name', 'site']]

Unnamed: 0,name,site
0,ФГБНУ «Федеральный институт педагогических изм...,new.fipi.ru
1,ФГБУ «Федеральный центр тестирования»,http://www.rustest.ru/
2,ФГБНУ «Главный государственный экспертный цент...,http://www.nic.gov.ru/
3,ФГБУ «Национальное аккредитационное агентство ...,www.nica.ru
4,ФГБУ «Информационно-методический центр анализа»,www.imtsa.ru


In [None]:
# Fetch the open-data archive (presumably the accreditation registry, going by
# the URL) into open_data.zip. -c asks wget to resume a partial download.
! wget -c http://isga.obrnadzor.gov.ru/accredreestr/opendata/ -O open_data.zip

--2019-06-07 07:42:51--  http://isga.obrnadzor.gov.ru/accredreestr/opendata/
Resolving isga.obrnadzor.gov.ru (isga.obrnadzor.gov.ru)... 176.99.141.18
Connecting to isga.obrnadzor.gov.ru (isga.obrnadzor.gov.ru)|176.99.141.18|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 72541151 (69M) [application/zip]
Saving to: ‘open_data.zip’

open_data.zip        27%[====>               ]  18.82M  63.3KB/s    eta 8m 3s  