In [1]:
import json
import requests
import time
import string
import pandas as pd
from json_parser import *
from data_collector import *

In [2]:
# Build the collector object used by the cells below. The recorded `type(idescat)`
# output is `data_collector.idescat_API`, i.e. the class that came in through
# `from data_collector import *`.
# NOTE(review): batch_size presumably caps how many entries are fetched (the
# recorded log counts x/100) — confirm against data_collector.
idescat = idescat_API(batch_size=100)

In [5]:
# Show which class `idescat` is an instance of.
type(idescat)

data_collector.idescat_API

In [3]:
# Start the collector's download/parse run.
# NOTE(review): judging by the recorded log this fetches one entry per request;
# the recorded run was stopped by hand (KeyboardInterrupt), so nothing after this
# cell can rely on its result.
idescat.collect_and_parse_data()

The estimated time to download the amount of data is 0.38333333333333336 minutes
(0/100). Data from 250019 was downloaded in 0.2902500629425049 seconds
(1/100). Data from 080018 was downloaded in 0.20442795753479004 seconds
(2/100). Data from 250024 was downloaded in 0.26927804946899414 seconds
(3/100). Data from 250030 was downloaded in 0.3261547088623047 seconds
(4/100). Data from 080023 was downloaded in 0.26628875732421875 seconds
(5/100). Data from 170010 was downloaded in 0.30294370651245117 seconds
(6/100). Data from 080142 was downloaded in 0.2094414234161377 seconds
(7/100). Data from 430017 was downloaded in 0.19957876205444336 seconds
(8/100). Data from 170025 was downloaded in 0.2953155040740967 seconds
(9/100). Data from 250387 was downloaded in 0.19547510147094727 seconds
(10/100). Data from 250045 was downloaded in 0.26928043365478516 seconds
(11/100). Data from 250058 was downloaded in 0.28324317932128906 seconds
(12/100). Data from 250061 was downloaded in 0.2710270881

KeyboardInterrupt: 

In [18]:

class idescat_API:
    """Small client for the Idescat EMEX JSON endpoint (`dades.json`).

    All three helpers are static methods, so they can be called either on the
    class (``idescat_API.request_mun("080018")``) or on an instance. Creating an
    instance performs one network request to load the id -> name table.

    Attributes:
        meta: dict mapping each entry id returned by the API to its name.
        requested: list reserved for ids that have already been requested
            (initialised empty; nothing in this class appends to it yet).
    """

    BASE_URL = "https://api.idescat.cat/emex/v1/dades.json"
    # Rough per-request latency in seconds; only used for the up-front estimate.
    AVG_REQUEST_SECONDS = 0.23
    # Seconds to wait for the server before giving up on a single request.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Load the id -> name table and start with an empty request log."""
        self.meta = self.create_meta_municipalities()
        self.requested = []

    @staticmethod
    def request_mun(mun_id):
        """Fetch the JSON record of a single entry.

        Args:
            mun_id: identifier of the entry, as found in the keys of the dict
                returned by ``create_meta_municipalities``.

        Returns:
            The decoded JSON body of the response.

        Raises:
            requests.RequestException: on connection problems, timeouts or a
                non-2xx HTTP status.
            ValueError: if the body is not valid JSON.
        """
        # Let requests build the query string. The hand-written URL used a second
        # "?" between `tipus` and `id`, so `id` was never sent as its own parameter.
        mun_res = requests.get(
            idescat_API.BASE_URL,
            params={"tipus": "mun", "id": mun_id, "lang": "en"},
            timeout=idescat_API.REQUEST_TIMEOUT,
        )
        # Do not let an HTTP error payload be stored as if it were data.
        mun_res.raise_for_status()
        return mun_res.json()

    @staticmethod
    def create_meta_municipalities():
        """Retrieve the id -> name table of all entries listed by the API.

        Structure of the returned dict:
            key: id of the entry
            value: name of the entry

        Raises:
            requests.RequestException: on connection problems, timeouts or a
                non-2xx HTTP status.
        """
        mun_res = requests.get(
            idescat_API.BASE_URL,
            params={"lang": "en"},
            timeout=idescat_API.REQUEST_TIMEOUT,
        )
        mun_res.raise_for_status()
        muns = mun_res.json()['fitxes']['cols']['col']
        return {mun['id']: mun['content'] for mun in muns}

    @staticmethod
    def collect_all_muns(meta, n):
        """Download the record of every id in ``meta``, pausing ``n`` seconds after each request.

        A failed download does not abort the run: the id is skipped, reported in
        the progress log, and listed again in the final summary.

        Args:
            meta: dict whose keys are the ids to download.
            n: number of seconds to sleep after every request.

        Returns:
            dict mapping each successfully downloaded id to its decoded JSON.
        """
        start_time = time.time()
        municipality_dict = {}
        failed_list = []
        total = len(meta)
        estimated_time = total * (n + idescat_API.AVG_REQUEST_SECONDS)
        print(f'The estimated time to download the amount of data is {estimated_time/60} minutes')

        last_time = start_time
        for i, identifier in enumerate(meta):
            # Only network/decoding errors are tolerated; anything else
            # (including KeyboardInterrupt) must propagate.
            try:
                municipality_dict[identifier] = idescat_API.request_mun(identifier)
                succeeded = True
            except (requests.RequestException, ValueError):
                failed_list.append(identifier)
                succeeded = False
            time.sleep(n)
            now = time.time()
            if succeeded:
                print(f'({i}/{total}). Data from {identifier} was downloaded in {now - last_time} seconds')
            else:
                print(f'({i}/{total}). Data from {identifier} could not be downloaded')
            last_time = now

        # Report the number of records actually stored, not the last loop index.
        print("Retrieved data of {} municipalities in {} seconds".format(
            len(municipality_dict), int(time.time() - start_time)))
        if failed_list:
            print("Failed to retrieve {} municipalities: {}".format(
                len(failed_list), ", ".join(str(f) for f in failed_list)))
        return municipality_dict


def add_muns_to_parser(parser, muns):
    """Register every downloaded record with the parser.

    For each key of ``muns`` the parser's ``add_json`` is called once, with the
    record passed as ``d`` and the key passed as ``name``.
    """
    for mun_id in muns:
        parser.add_json(d=muns[mun_id], name=mun_id)
        

def change_dicts_to_panda_df(dicts):
    """Turn each inner dict into a one-row DataFrame.

    The outer key is kept as the key of the returned dict and is also used as
    the single index label of the corresponding DataFrame.
    """
    return {
        label: pd.DataFrame(record, index=[label])
        for label, record in dicts.items()
    }


def concatenate_dict_of_dfs(dfs):
    """Stack all DataFrames of a dict row-wise, in the dict's iteration order.

    Args:
        dfs: dict whose values are DataFrames; the keys are not used.

    Returns:
        A single DataFrame containing the rows of every value. An empty input
        yields an empty DataFrame instead of failing on an unbound variable.
    """
    if not dfs:
        return pd.DataFrame()
    # One concat over the whole list avoids re-copying the accumulated frame
    # on every iteration.
    return pd.concat(list(dfs.values()))


def pipeline(n=0, output_path='data.csv'):
    """Download every record, parse it and export the result as one CSV file.

    Args:
        n: seconds to pause after each request (passed on to
            ``collect_all_muns``). The default of 0 sends requests back to back.
        output_path: destination of the CSV file.

    NOTE(review): ``create_meta_municipalities``, ``collect_all_muns``,
    ``JSONParser``, ``value_splitter`` and ``homogenize_key`` are resolved at
    module level; they are not defined in this notebook, so they presumably come
    from the star imports of ``data_collector`` / ``json_parser`` — confirm.
    """
    meta = create_meta_municipalities()
    json_data = collect_all_muns(meta, n=n)

    # Configure the parser before feeding it the raw records.
    parser = JSONParser(json_value='v', json_column_name=['c', 'calt'])
    parser.add_value_map(value_splitter)
    parser.add_key_map(homogenize_key)
    add_muns_to_parser(parser, json_data)
    parser.parse_data()

    # One single-row frame per record, stacked into the final table.
    pd_dict = change_dicts_to_panda_df(parser.containers())
    df = concatenate_dict_of_dfs(pd_dict)
    df.to_csv(output_path)

In [19]:
# Run the full download -> parse -> CSV export with the defaults.
# NOTE: the recorded run below ended with a ConnectionError around entry 911 of
# 989, i.e. before `data.csv` was written.
pipeline()

The estimated time to download the amount of data is 3.791166666666667 minutes
(0/989). Data from 250019 was downloaded in 0.21543598175048828 seconds
(1/989). Data from 080018 was downloaded in 0.31439685821533203 seconds
(2/989). Data from 250024 was downloaded in 0.23918628692626953 seconds
(3/989). Data from 250030 was downloaded in 0.36741018295288086 seconds
(4/989). Data from 080023 was downloaded in 0.33695220947265625 seconds
(5/989). Data from 170010 was downloaded in 0.3125014305114746 seconds
(6/989). Data from 080142 was downloaded in 0.280073881149292 seconds
(7/989). Data from 430017 was downloaded in 0.26648926734924316 seconds
(8/989). Data from 170025 was downloaded in 0.16276311874389648 seconds
(9/989). Data from 250387 was downloaded in 0.1767277717590332 seconds
(10/989). Data from 250045 was downloaded in 0.17409181594848633 seconds
(11/989). Data from 250058 was downloaded in 0.20891261100769043 seconds
(12/989). Data from 250061 was downloaded in 0.175409317016

(112/989). Data from 430265 was downloaded in 0.20306849479675293 seconds
(113/989). Data from 080229 was downloaded in 0.2973313331604004 seconds
(114/989). Data from 170195 was downloaded in 0.19401049613952637 seconds
(115/989). Data from 170209 was downloaded in 0.3214378356933594 seconds
(116/989). Data from 170216 was downloaded in 0.18487858772277832 seconds
(117/989). Data from 080235 was downloaded in 0.20999932289123535 seconds
(118/989). Data from 250557 was downloaded in 0.15411019325256348 seconds
(119/989). Data from 170221 was downloaded in 0.26456308364868164 seconds
(120/989). Data from 430271 was downloaded in 0.1543872356414795 seconds
(121/989). Data from 430287 was downloaded in 0.2021493911743164 seconds
(122/989). Data from 172348 was downloaded in 0.20662784576416016 seconds
(123/989). Data from 430290 was downloaded in 0.26853251457214355 seconds
(124/989). Data from 170237 was downloaded in 0.3069946765899658 seconds
(125/989). Data from 170293 was downloaded 

(225/989). Data from 082687 was downloaded in 0.21755456924438477 seconds
(226/989). Data from 082665 was downloaded in 0.15276718139648438 seconds
(227/989). Data from 080689 was downloaded in 0.1818535327911377 seconds
(228/989). Data from 250729 was downloaded in 0.1783127784729004 seconds
(229/989). Data from 250735 was downloaded in 0.2187809944152832 seconds
(230/989). Data from 170502 was downloaded in 0.20760560035705566 seconds
(231/989). Data from 170519 was downloaded in 0.23291873931884766 seconds
(232/989). Data from 250740 was downloaded in 0.23792529106140137 seconds
(233/989). Data from 250753 was downloaded in 0.19530606269836426 seconds
(234/989). Data from 250766 was downloaded in 0.14169597625732422 seconds
(235/989). Data from 170545 was downloaded in 0.16028118133544922 seconds
(236/989). Data from 250772 was downloaded in 0.18695592880249023 seconds
(237/989). Data from 080692 was downloaded in 0.29842352867126465 seconds
(238/989). Data from 430458 was downloade

(337/989). Data from 251018 was downloaded in 0.22275137901306152 seconds
(338/989). Data from 080958 was downloaded in 0.19190168380737305 seconds
(339/989). Data from 251023 was downloaded in 0.13843965530395508 seconds
(340/989). Data from 080961 was downloaded in 0.15390896797180176 seconds
(341/989). Data from 251039 was downloaded in 0.1844785213470459 seconds
(342/989). Data from 251057 was downloaded in 0.16067099571228027 seconds
(343/989). Data from 251044 was downloaded in 0.17682170867919922 seconds
(344/989). Data from 430691 was downloaded in 0.1631777286529541 seconds
(345/989). Data from 080977 was downloaded in 0.1437540054321289 seconds
(346/989). Data from 170813 was downloaded in 0.20382094383239746 seconds
(347/989). Data from 080996 was downloaded in 0.1562962532043457 seconds
(348/989). Data from 430705 was downloaded in 0.1289224624633789 seconds
(349/989). Data from 170828 was downloaded in 0.221785306930542 seconds
(350/989). Data from 251095 was downloaded in

(449/989). Data from 171100 was downloaded in 0.28186869621276855 seconds
(450/989). Data from 430923 was downloaded in 0.22235822677612305 seconds
(451/989). Data from 171096 was downloaded in 0.27245211601257324 seconds
(452/989). Data from 430862 was downloaded in 0.3592534065246582 seconds
(453/989). Data from 430884 was downloaded in 0.23855924606323242 seconds
(454/989). Data from 081252 was downloaded in 0.17731213569641113 seconds
(455/989). Data from 081304 was downloaded in 0.17947816848754883 seconds
(456/989). Data from 251399 was downloaded in 0.13595914840698242 seconds
(457/989). Data from 081311 was downloaded in 0.1427161693572998 seconds
(458/989). Data from 251403 was downloaded in 0.17068910598754883 seconds
(459/989). Data from 430897 was downloaded in 0.19976449012756348 seconds
(460/989). Data from 251386 was downloaded in 0.17129063606262207 seconds
(461/989). Data from 081265 was downloaded in 0.15084052085876465 seconds
(462/989). Data from 081326 was download

(561/989). Data from 431129 was downloaded in 0.23886895179748535 seconds
(562/989). Data from 081672 was downloaded in 0.16201138496398926 seconds
(563/989). Data from 431135 was downloaded in 0.21398544311523438 seconds
(564/989). Data from 250306 was downloaded in 0.24269723892211914 seconds
(565/989). Data from 171352 was downloaded in 0.22825050354003906 seconds
(566/989). Data from 251734 was downloaded in 0.2281491756439209 seconds
(567/989). Data from 081825 was downloaded in 0.2858116626739502 seconds
(568/989). Data from 431418 was downloaded in 0.20798230171203613 seconds
(569/989). Data from 081688 was downloaded in 0.2648966312408447 seconds
(570/989). Data from 171365 was downloaded in 0.24457931518554688 seconds
(571/989). Data from 251728 was downloaded in 0.32660603523254395 seconds
(572/989). Data from 171371 was downloaded in 0.22430920600891113 seconds
(573/989). Data from 431140 was downloaded in 0.3509821891784668 seconds
(574/989). Data from 171404 was downloaded

(674/989). Data from 171609 was downloaded in 0.2666919231414795 seconds
(675/989). Data from 082114 was downloaded in 0.3466377258300781 seconds
(676/989). Data from 171616 was downloaded in 0.2932705879211426 seconds
(677/989). Data from 082129 was downloaded in 0.2177135944366455 seconds
(678/989). Data from 171621 was downloaded in 0.2397480010986328 seconds
(679/989). Data from 082093 was downloaded in 0.1352231502532959 seconds
(680/989). Data from 082135 was downloaded in 0.16452383995056152 seconds
(681/989). Data from 171637 was downloaded in 0.5364203453063965 seconds
(682/989). Data from 251924 was downloaded in 0.23817706108093262 seconds
(683/989). Data from 251977 was downloaded in 0.5910320281982422 seconds
(684/989). Data from 171642 was downloaded in 0.34336423873901367 seconds
(685/989). Data from 082153 was downloaded in 0.20323538780212402 seconds
(686/989). Data from 081939 was downloaded in 0.22215867042541504 seconds
(687/989). Data from 439023 was downloaded in 

(786/989). Data from 170524 was downloaded in 0.21979618072509766 seconds
(787/989). Data from 082711 was downloaded in 0.18813753128051758 seconds
(788/989). Data from 252069 was downloaded in 0.21646785736083984 seconds
(789/989). Data from 431476 was downloaded in 0.14209485054016113 seconds
(790/989). Data from 252075 was downloaded in 0.23050141334533691 seconds
(791/989). Data from 082726 was downloaded in 0.37480807304382324 seconds
(792/989). Data from 252081 was downloaded in 0.24377918243408203 seconds
(793/989). Data from 252094 was downloaded in 0.18225383758544922 seconds
(794/989). Data from 252108 was downloaded in 0.27976369857788086 seconds
(795/989). Data from 082732 was downloaded in 0.2817718982696533 seconds
(796/989). Data from 252115 was downloaded in 0.3063662052154541 seconds
(797/989). Data from 252120 was downloaded in 0.31471967697143555 seconds
(798/989). Data from 082747 was downloaded in 0.281466007232666 seconds
(799/989). Data from 171946 was downloaded

(898/989). Data from 082994 was downloaded in 0.25847387313842773 seconds
(899/989). Data from 172174 was downloaded in 0.2670443058013916 seconds
(900/989). Data from 172168 was downloaded in 0.2977936267852783 seconds
(901/989). Data from 083015 was downloaded in 0.19511985778808594 seconds
(902/989). Data from 083008 was downloaded in 0.1852266788482666 seconds
(903/989). Data from 172180 was downloaded in 0.26370739936828613 seconds
(904/989). Data from 172207 was downloaded in 0.16376757621765137 seconds
(905/989). Data from 172214 was downloaded in 0.31276988983154297 seconds
(906/989). Data from 083054 was downloaded in 0.440096378326416 seconds
(907/989). Data from 252445 was downloaded in 0.2681765556335449 seconds
(908/989). Data from 172235 was downloaded in 0.40719127655029297 seconds
(909/989). Data from 431750 was downloaded in 0.2848324775695801 seconds
(910/989). Data from 083067 was downloaded in 0.2467975616455078 seconds
(911/989). Data from 252458 was downloaded in 

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [56]:
# Scratch experiment: despite its name, `lst` is a dict mapping 0..3 to 'a'.
lst = {i:'a' for i in range(0,4)}

In [57]:
# Second scratch dict (string keys) for the merge attempt in the next cell.
lst1 = {'a':1, 'b': 3}

In [58]:
# Dicts do not implement `+`, so this raises the TypeError recorded below.
# To merge two dicts use `{**lst, **lst1}` instead.
lst + lst1

TypeError: unsupported operand type(s) for +: 'dict' and 'dict'