In [4]:
import json
import requests
import time
import string
import pandas as pd
from json_parser import *

In [9]:
def request_mun(mun_id, timeout=30):
    """
    Download the Idescat EMEX data sheet of a single municipality.

    Parameters
    ----------
    mun_id : str
        Identifier sent as the ``id`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on a stalled connection.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    mun_res = requests.get(
        "https://api.idescat.cat/emex/v1/dades.json",
        # Let requests build (and URL-encode) the query string.
        params={"id": mun_id, "lang": "en"},
        timeout=timeout,
    )
    # Fail loudly instead of storing an error payload as municipality data.
    mun_res.raise_for_status()
    return mun_res.json()
   

def create_meta_municipalities(timeout=30):
    """
    Retrieve a dictionary of all municipalities listed by the Idescat EMEX API.

    Structure of the returned dict:
        key: id of the municipality (the ``id`` field of each entry)
        value: name of the municipality (the ``content`` field of each entry)

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    mun_res = requests.get(
        "https://api.idescat.cat/emex/v1/dades.json?lang=en",
        timeout=timeout,
    )
    # Surface HTTP errors here rather than as a confusing KeyError below.
    mun_res.raise_for_status()
    muns = mun_res.json()['fitxes']['cols']['col']
    return {mun['id']: mun['content'] for mun in muns}


def collect_all_muns(meta, n, limit=21):
    """
    Download the data of each municipality in ``meta``, pausing ``n`` seconds
    after every request.

    Parameters
    ----------
    meta : dict
        Mapping whose keys are the municipality identifiers to request.
    n : float
        Seconds to sleep after each request.
    limit : int or None, optional
        Maximum number of municipalities to download. The default of 21
        reproduces the cap that used to be hard-coded in the loop
        (NOTE(review): presumably a debugging leftover). Pass ``None`` to
        download every municipality in ``meta``.

    Returns
    -------
    dict
        Maps each requested identifier to the value returned by
        ``request_mun`` for it.
    """
    start_time = time.time()
    municipality_dict = {}
    total = len(meta.keys())

    identifiers = list(meta.keys())
    if limit is not None:
        identifiers = identifiers[:max(limit, 0)]

    # Seeding last_time with start_time makes the first iteration report the
    # time since the start, with no special case needed.
    last_time = start_time
    for i, identifier in enumerate(identifiers):
        municipality_dict[identifier] = request_mun(identifier)
        time.sleep(n)
        now = time.time()
        # The reported duration includes the sleep above.
        print(f'({i}/{total}). Data from {identifier} was downloaded in {now - last_time} seconds')
        last_time = now

    # Report the number of items actually retrieved; using the last loop index
    # was off by one and failed outright when meta was empty.
    print("Retrieved data of {} municipalities in {} seconds".format(
        len(municipality_dict), int(time.time() - start_time)))
    return municipality_dict


def add_muns_to_parser(parser, muns):
    """
    Register every entry of ``muns`` with ``parser``.

    Each key/value pair is handed over through
    ``parser.add_json(d=<value>, name=<key>)``, in the iteration order of
    ``muns``. Nothing is returned.
    """
    for label in muns:
        parser.add_json(d=muns[label], name=label)
        

def change_dicts_to_panda_df(dicts):
    """
    Turn a mapping of dicts into a mapping of pandas DataFrames.

    For every ``name -> dct`` pair a DataFrame is built from ``dct`` with
    ``name`` as its only index label. The result keeps the same keys as
    ``dicts``.
    """
    return {
        label: pd.DataFrame(record, index=[label])
        for label, record in dicts.items()
    }


def concatenate_dict_of_dfs(dfs):
    """
    Stack all DataFrames stored in ``dfs`` into a single DataFrame.

    Parameters
    ----------
    dfs : dict
        Mapping whose values are pandas DataFrames; the keys are not used.

    Returns
    -------
    pandas.DataFrame
        The values of ``dfs`` concatenated row-wise in iteration order, or an
        empty DataFrame when ``dfs`` is empty.
    """
    frames = list(dfs.values())
    if not frames:
        # pd.concat rejects an empty list, so handle this case explicitly.
        return pd.DataFrame()
    # One concat call copies the data once; concatenating inside a loop
    # re-copies the accumulated frame on every iteration.
    return pd.concat(frames)


def pipeline():
    """
    Run the whole workflow end to end and write the result to ``data.csv``.

    Steps, in order: fetch the municipality listing, download each
    municipality (pausing 1 second after every request), feed the downloads
    to a ``JSONParser``, convert the parser's containers to DataFrames,
    concatenate them and export the result as CSV.

    NOTE(review): ``JSONParser``, ``value_splitter`` and ``homogenize_key``
    are presumably provided by the ``json_parser`` star import - confirm.
    """
    municipalities = create_meta_municipalities()
    raw_by_id = collect_all_muns(municipalities, n=1)

    # Configure the parser before any data is handed to it.
    parser = JSONParser(json_value='v', json_column_name=['c', 'calt'])
    parser.add_value_map(value_splitter)
    parser.add_key_map(homogenize_key)

    add_muns_to_parser(parser, raw_by_id)
    parser.parse_data()

    frames = change_dicts_to_panda_df(parser.containers())
    concatenate_dict_of_dfs(frames).to_csv('data.csv')

In [10]:
# Run the end-to-end workflow: performs live HTTP requests and writes data.csv.
pipeline()

(0/989). Data from 250019 was downloaded in 1.2685823440551758 seconds
(1/989). Data from 080018 was downloaded in 1.2758002281188965 seconds
(2/989). Data from 250024 was downloaded in 1.271655797958374 seconds
(3/989). Data from 250030 was downloaded in 1.3054680824279785 seconds
(4/989). Data from 080023 was downloaded in 1.2600934505462646 seconds
(5/989). Data from 170010 was downloaded in 1.289602518081665 seconds
(6/989). Data from 080142 was downloaded in 1.270857334136963 seconds
(7/989). Data from 430017 was downloaded in 1.3258671760559082 seconds
(8/989). Data from 170025 was downloaded in 1.2899339199066162 seconds
(9/989). Data from 250387 was downloaded in 1.2774360179901123 seconds
(10/989). Data from 250045 was downloaded in 1.276102066040039 seconds
(11/989). Data from 250058 was downloaded in 1.832357406616211 seconds
(12/989). Data from 250061 was downloaded in 1.2750751972198486 seconds
(13/989). Data from 170031 was downloaded in 1.3033974170684814 seconds
(14/989