In [1]:
import json
import requests
import time
import string
import pandas as pd

In [2]:
response = requests.get("https://api.idescat.cat/emex/v1/dades.json?lang=en")

In [3]:
meta = response.json()

In [4]:
meta['fitxes'].keys()

dict_keys(['p', 'indicadors', 'lang', 'version', 'cols', 'n', 'o'])

In [5]:
meta['fitxes']['cols']['col']

[{'scheme': 'mun', 'id': '250019', 'content': 'Abella de la Conca'},
 {'scheme': 'mun', 'id': '080018', 'content': 'Abrera'},
 {'scheme': 'mun', 'id': '250024', 'content': 'Àger'},
 {'scheme': 'mun', 'id': '250030', 'content': 'Agramunt'},
 {'scheme': 'mun', 'id': '080023', 'content': 'Aguilar de Segarra'},
 {'scheme': 'mun', 'id': '170010', 'content': 'Agullana'},
 {'scheme': 'mun', 'id': '080142', 'content': 'Aiguafreda'},
 {'scheme': 'mun', 'id': '430017', 'content': 'Aiguamúrcia'},
 {'scheme': 'mun', 'id': '170025', 'content': 'Aiguaviva'},
 {'scheme': 'mun', 'id': '250387', 'content': 'Aitona'},
 {'scheme': 'mun', 'id': '250045', 'content': 'Alamús, els'},
 {'scheme': 'mun', 'id': '250058', 'content': 'Alàs i Cerc'},
 {'scheme': 'mun', 'id': '250061', 'content': "Albagés, l'"},
 {'scheme': 'mun', 'id': '170031', 'content': 'Albanyà'},
 {'scheme': 'mun', 'id': '250077', 'content': 'Albatàrrec'},
 {'scheme': 'mun', 'id': '250083', 'content': 'Albesa'},
 {'scheme': 'mun', 'id': '2500

In [6]:
mun_response = requests.get("https://api.idescat.cat/emex/v1/dades.json?id=080018&lang=en")

In [7]:
mun_response.json()

{'fitxes': {'p': 'id=080018',
  'gg': {'g': [{'tt': {'t': {'ff': {'f': [{'r': '2021',
          'c': 'Surface area',
          'u': 'km²',
          'v': '19.94,485.99,32108.00',
          'calt': 'Surface area',
          'id': 'f271',
          'updated': '2021-12-23T11:00:00+00:00'},
         {'r': '2013',
          'c': 'Altitude',
          'u': 'm',
          'v': '105,_,_',
          'calt': 'Altitude',
          'id': 'f258',
          'updated': '2014-02-17T11:00:00+00:00'},
         {'r': '2013',
          'c': 'Longitude',
          'u': 'º',
          'v': '1.903100,_,_',
          'calt': 'Longitude',
          'id': 'f328',
          'updated': '2014-02-17T11:00:00+00:00'},
         {'r': '2013',
          'c': 'Latitude',
          'u': 'º',
          'v': '41.518531,_,_',
          'calt': 'Latitude',
          'id': 'f329',
          'updated': '2014-02-17T11:00:00+00:00'},
         {'r': '2013',
          'c': 'UTM coordinates x',
          'u': 'm',
          'v': '4

In [8]:
def request_mun(mun_id):
    mun_res = requests.get("https://api.idescat.cat/emex/v1/dades.json?id={}&lang=en".format(mun_id))
    return mun_res.json()
   

def create_meta_municipalities():
    """
    Retrieves dictionairy containing all municipalities from the idescat. Structure of the dict:
        key: id of the mun
        value: name of the mun
    """
    meta = {}
    mun_res = requests.get("https://api.idescat.cat/emex/v1/dades.json?lang=en")
    mun_meta = mun_res.json()
    muns = mun_meta['fitxes']['cols']['col']
    for mun in muns:
        meta[mun['id']] = mun['content']
        
    return meta


def collect_all_muns(meta, n):
    """
    Collects all the data of each municipality, making a request every n seconds
    """
    
    start_time = time.time()
    municipality_dict = {}
    i = 0
    for identifier in meta.keys():
        
        municipality_dict[identifier] = request_mun(identifier)
        time.sleep(n)
        i += 1
        if i == 10:
            break
    
    print("Retrieved data of {} municipalities in {} seconds".format(i ,int(time.time()-start_time)))
    return municipality_dict

In [9]:
testmeta = create_meta_municipalities()

In [10]:
test = collect_all_muns(testmeta, 0.2)

Retrieved data of 10 municipalities in 4 seconds


In [7]:
with open("test.json", 'w') as file:
    json.dump(test['250019'], file)

In [10]:
pd.read_json('test.json')

Unnamed: 0,250019,80018,250024,250030,80023,170010,80142,430017,170025,250387
fitxes,"{'p': 'id=250019', 'gg': {'g': [{'tt': {'t': {...","{'p': 'id=080018', 'gg': {'g': [{'tt': {'t': {...","{'p': 'id=250024', 'gg': {'g': [{'tt': {'t': {...","{'p': 'id=250030', 'gg': {'g': [{'tt': {'t': {...","{'p': 'id=080023', 'gg': {'g': [{'tt': {'t': {...","{'p': 'id=170010', 'gg': {'g': [{'tt': {'t': {...","{'p': 'id=080142', 'gg': {'g': [{'tt': {'t': {...","{'p': 'id=430017', 'gg': {'g': [{'tt': {'t': {...","{'p': 'id=170025', 'gg': {'g': [{'tt': {'t': {...","{'p': 'id=250387', 'gg': {'g': [{'tt': {'t': {..."


In [24]:
pd.json_normalize(test['250024']['fitxes']['gg']['g'], max_level=10)

Unnamed: 0,c,id,tt.t.ff.f,tt.t.c,tt.t.id,tt.t.l,tt.t,tt.t.r,tt.t.s,tt.t.u,tt.t.calt,tt.t.updated
0,Territory,g173,"[{'r': '2021', 'c': 'Surface area', 'u': 'km²'...",Geographic indicators,t176,https://www.idescat.cat/pub/?id=inddt&n=396&ge...,,,,,,
1,Population,g168,,,,,"[{'ff': {'f': [{'c': 'Surface area', 'u': 'km²...",,,,,
2,Culture · Language,g172,,,,,"[{'ff': {'f': [{'c': 'Understand', 'v': '508,3...",,,,,
3,Education,g214,"[{'c': 'Primary education or lower', 'v': '23....",Population aged 15 and over. By level of educa...,t215,https://www.idescat.cat/pub/?id=eep&n=14540&ge...,,2019 (p),Idescat. Educational Attainment Statistics.,%,Level of education attained,2021-10-25T10:00:00+00:00
4,Elections,g174,,,,,"[{'ff': {'f': [{'c': 'Electors', 'v': '505,280...",,,,,
5,Labour,g169,,,,,"[{'ff': {'f': [{'c': 'Men', 'v': '139,9307,177...",,,,,
6,Quality of life,g171,"[{'c': 'Main', 'v': '259,15288,2944944', 'calt...",Family dwellings. By type,t116,https://www.idescat.cat/pub/?id=censph&n=30&ge...,,2011,"Idescat, based on the INE's Population and Hou...",,Type of dwellings,2013-12-19T11:00:00+00:00
7,Main aggregates · Public sector finance,g163,,,,,"[{'ff': {'f': [{'c': 'Number of receipts', 'v'...",,,,,
8,Economic sectors,g170,,,,,"[{'ff': {'f': [{'c': 'Cultivated land', 'v': '...",,,,,
9,Environment,g205,,,,,"[{'ff': {'f': [{'c': 'Generation per capita', ...",,,,,


In [7]:
print("amount of lists in the first layer of the complicated json: " + str(len(test['250019']['fitxes']['gg']['g'])))
total = 0
for i in range(0,10):
    print("lenght of the list in the second layer of the complicated json: "+ str(len(test['250019']['fitxes']['gg']['g'][i]['tt']['t'])))
    total += len(test['250019']['fitxes']['gg']['g'][i]['tt']['t'])
    
print(total)

amount of lists in the first layer of the complicated json: 10
lenght of the list in the second layer of the complicated json: 4
lenght of the list in the second layer of the complicated json: 18
lenght of the list in the second layer of the complicated json: 3
lenght of the list in the second layer of the complicated json: 9
lenght of the list in the second layer of the complicated json: 3
lenght of the list in the second layer of the complicated json: 10
lenght of the list in the second layer of the complicated json: 8
lenght of the list in the second layer of the complicated json: 2
lenght of the list in the second layer of the complicated json: 8
lenght of the list in the second layer of the complicated json: 2
67


In [46]:
test['250019']['fitxes']['gg']['g'][1]['tt']['t'][1]['ff']['f'][1]['v']

'82,6460,3943531'

In [140]:
test['250019']['fitxes']['gg']['g'][1]['tt']['t'][1]['c']

'Population. By sex'

In [79]:
test['250019']['fitxes']['gg']['g']

[{'tt': {'t': {'ff': {'f': [{'r': '2021',
       'c': 'Surface area',
       'u': 'km²',
       'v': '78.27,1343.09,32108.00',
       'calt': 'Surface area',
       'id': 'f271',
       'updated': '2021-12-23T11:00:00+00:00'},
      {'r': '2013',
       'c': 'Altitude',
       'u': 'm',
       'v': '956,_,_',
       'calt': 'Altitude',
       'id': 'f258',
       'updated': '2014-02-17T11:00:00+00:00'},
      {'r': '2013',
       'c': 'Longitude',
       'u': 'º',
       'v': '1.092892,_,_',
       'calt': 'Longitude',
       'id': 'f328',
       'updated': '2014-02-17T11:00:00+00:00'},
      {'r': '2013',
       'c': 'Latitude',
       'u': 'º',
       'v': '42.162392,_,_',
       'calt': 'Latitude',
       'id': 'f329',
       'updated': '2014-02-17T11:00:00+00:00'},
      {'r': '2013',
       'c': 'UTM coordinates x',
       'u': 'm',
       'v': '342450,_,_',
       'calt': 'UTM coordinates x',
       'id': 'f259',
       'updated': '2014-02-17T11:00:00+00:00'},
      {'r': '2013',

In [35]:
len(test['250019']['fitxes']['gg']['g'])

10

In [55]:
test['250019']['fitxes']['cols']['col']

[{'scheme': 'mun', 'id': '250019', 'content': 'Abella de la Conca'},
 {'scheme': 'com', 'id': '25', 'content': 'Pallars Jussà'},
 {'scheme': 'ca', 'id': '09', 'content': 'Catalunya'}]

In [149]:

# def parse_mun_to_dict_of_list(data):
#     """FUnction build before the knowledge of data not being complete."""
#     columns = {}
#     for key in data.keys():
#         print(data[key]['fitxes']['cols']['col'][0]['id'])
#         print(key)
#         assert data[key]['fitxes']['cols']['col'][0]['id'] == key
        
        
#         for g in data[key]['fitxes']['gg']['g']:
            
#             if type(g['tt']['t']) == type(dict):
                
#                 if type(g['tt']['t']['ff']['f']) == type(dict):
#                     value = value_splitter(g['tt']['t']['ff']['f']['v'])
#                     if g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt'] in columns.keys():
#                         columns[g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt']].append(value)
#                     else:
#                         columns[g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt']] = [value]
#                 else:
#                     for f in g['tt']['t']['ff']['f']:
#                         value = value_splitter(f['v'])
#                         if g['tt']['t']['c'] + "_" + f['calt'] in columns.keys():
#                             columns[g['tt']['t']['c'] + "_" + f['calt']].append(value)
#                         else:
#                             columns[g['tt']['t']['c'] + "_" + f['calt']] = [value]
#             else:
#                 for t in g['tt']['t']:
#                     if type(t['ff']['f']) == type(dict):
#                         value = value_splitter(t['ff']['f']['v'])
#                         if t['c'] + "_" + t['ff']['f']['calt'] in columns.keys():
#                             columns[t['c'] + "_" + t['ff']['f']['calt']].append(value)
#                         else:
#                             columns[t['c'] + "_" + t['ff']['f']['calt']] = [value]
#                     else:
#                         for f in t['ff']['f']:
#                             value = value_splitter(f['v'])
#                             if t['c'] + "_" + f['calt'] in columns.keys():
#                                 columns[t['c'] + "_" + f['calt']].append(value)
#                             else:
#                                 columns[t['c'] + "_" + f['calt']] = [value]
                    
#     return columns        

In [179]:
def value_splitter(value):
    assert type(value) == str
    mun, com, ca = value.split(',')
    return mun, com, ca


def is_not_in_column_names(column_names, name):
    if name in column_names:
        return False
    else:
        return True

        
def get_column_names_from_muns(data):
    """Returns all the names of the columns which are in the data set
    It looks super cumbersome because the people from idescat do not know how to keep consistency in their data
    """
    column_names = []
    # For each municipality in the data, go over the json.
    for key in data.keys():
        # Check if the municipality is really that municipality.
        assert data[key]['fitxes']['cols']['col'][0]['id'] == key
        
        # go over all 'groups' in the dictoinary structure
        for g in data[key]['fitxes']['gg']['g']:
            
            # Check if the group contains one or more tables. The group is a dictoinary of tables when there is
            # is multiple tables in the group, and a list when there is only one. When this if statement returns true,
            # the next element is treated as a dictoinary, and when it returns false, it is treated as a list.
            if type(g['tt']['t']) == dict:

                # Check if the table contains one or more rows. The table is a list of rows when there is
                # is multiple rows in the table, and a dictionary when there is only one. When this if statement returns true,
                # the next element is treated as a dictoinary, and when it returns false, it is treated as a list.
                if type(g['tt']['t']['ff']['f']) == dict:
                    
                    # Check if the name is already in the column names list, and otherwise append it to the list.
                    if is_not_in_column_names(column_names, g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt']):
                        column_names.append(g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt'])
                else:
                    for f in g['tt']['t']['ff']['f']:
                        # Check if the name is already in the column names list, and otherwise append it to the list.
                        if is_not_in_column_names(column_names, g['tt']['t']['c'] + "_" + f['calt']):
                            column_names.append(g['tt']['t']['c'] + "_" + f['calt'])
                        #print(f['calt'])
            else:
                for t in g['tt']['t']:
                    
                    # Check if the table contains one or more rows. The table is a dictoinary of rows when there is
                    # is multiple rows in the table, and a list when there is only one. When this if statement returns true,
                    # the next element is treated as a dictoinary, and when it returns false, it is treated as a list.
                    if type(t['ff']['f']) == dict:
                        # Check if the name is already in the column names list, and otherwise append it to the list.
                        if is_not_in_column_names(column_names, t['c'] + "_" + t['ff']['f']['calt']):
                            column_names.append(t['c'] + "_" + t['ff']['f']['calt'])

                    else:
                        for f in t['ff']['f']:
                            # Check if the name is already in the column names list, and otherwise append it to the list.
                            if is_not_in_column_names(column_names, t['c'] + "_" + f['calt']):
                                column_names.append(t['c'] + "_" + f['calt'])
    return column_names
        
    
def make_col_dict(column_names):
    col_dict = {}
    for col in column_names:
        col_dict[col] = []
    return col_dict


def retrieve_mun_from_json(data, columns):
    column_names = list(columns.keys())
    for g in data['fitxes']['gg']['g']:
            
        # Check if the group contains one or more tables. The group is a dictoinary of tables when there is
        # is multiple tables in the group, and a list when there is only one. When this if statement returns true,
        # the next element is treated as a dictoinary, and when it returns false, it is treated as a list.
        if type(g['tt']['t']) == dict:

            # Check if the table contains one or more rows. The table is a dictoinary of rows when there is
            # is multiple rows in the table, and a list when there is only one. When this if statement returns true,
            # the next element is treated as a dictoinary, and when it returns false, it is treated as a list.
            if type(g['tt']['t']['ff']['f']) == dict:
                v, _, _ = value_splitter(g['tt']['t']['ff']['f']['v'])
                columns[g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt']].append(v)
                print(g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt'])
                print(g['tt']['t']['ff']['f']['r'])
                column_names.remove(g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt'])
            else:
                for f in g['tt']['t']['ff']['f']:
                    v, _, _ = value_splitter(f['v'])
                    columns[g['tt']['t']['c'] + "_" + f['calt']].append(v)
                    print(g['tt']['t']['c'] + "_" + f['calt'])
                    print('f')
                    column_names.remove(g['tt']['t']['c'] + "_" + f['calt'])
        else:
            for t in g['tt']['t']:
                    
                # Check if the table contains one or more rows. The table is a dictoinary of rows when there is
                # is multiple rows in the table, and a list when there is only one. When this if statement returns true,
                # the next element is treated as a dictoinary, and when it returns false, it is treated as a list.
                if type(t['ff']['f']) == dict:
                    v, _, _ = value_splitter(t['ff']['f']['v'])
                    columns[t['c'] + "_" + t['ff']['f']['calt']].append(v)
                    print(t['c'] + "_" + t['ff']['f']['calt'])
                    print(t['ff']['f']['r'])
                    column_names.remove(t['c'] + "_" + t['ff']['f']['calt'])
                else:
                    for f in t['ff']['f']:                     
                        v, _, _ = value_splitter(f['v'])
                        columns[t['c'] + "_" + f['calt']].append(v)
                        print(t['c'] + "_" + f['calt'])
                        print(f['r'])
                        column_names.remove(t['c'] + "_" + f['calt'])
    
    for c in column_names:
        columns[c].append('nan')
    return columns
    

In [180]:
x = get_column_names_from_muns(test)

In [181]:
s = make_col_dict(x)

In [182]:
v, _, _ = value_splitter('78.27,1343.09,32108.00')

In [183]:
test['080018']['fitxes']['cols']['col'][0]

{'scheme': 'mun', 'id': '080018', 'content': 'Abrera'}

In [184]:
check = retrieve_mun_from_json(test['080018'], s)

Geographic indicators_Surface area
f
Geographic indicators_Altitude
f
Geographic indicators_Longitude
f
Geographic indicators_Latitude
f
Geographic indicators_UTM coordinates x
f
Geographic indicators_UTM coordinates y
f
Density of population_Surface area


KeyError: 'r'

In [149]:
list(s.keys()).remove('Geographic indicators_Surface area')

In [161]:
check

{'Geographic indicators_Surface area': ['19.94', '78.27', '19.94'],
 'Geographic indicators_Altitude': ['105', '956', '105'],
 'Geographic indicators_Longitude': ['1.903100', '1.092892', '1.903100'],
 'Geographic indicators_Latitude': ['41.518531', '42.162392', '41.518531'],
 'Geographic indicators_UTM coordinates x': ['408471', '342450', '408471'],
 'Geographic indicators_UTM coordinates y': ['4596983', '4669650', '4596983'],
 'Density of population_Surface area': ['19.94', '78.27', '19.94'],
 'Density of population_Density': ['632.9', '2.2', '632.9'],
 'Population. By sex_Men': ['6373', '89', '6373'],
 'Population. By sex_Women': ['6247', '82', '6247'],
 'Population. By sex_Population': ['12620', '171', '12620'],
 'Population. By age groups_Population from 0 to 14 years': ['2138',
  '16',
  '2138'],
 'Population. By age groups_Population from 15 to 64 years': ['8503',
  '110',
  '8503'],
 'Population. By age groups_Population from 65 to 84 years': ['1749',
  '33',
  '1749'],
 'Popula

In [128]:
lst = ['s', 'a']
lst.remove('a')
lst.remove('a')

ValueError: list.remove(x): x not in list

In [29]:
test.get('080018')

dict_keys(['fitxes'])

In [42]:
lst1 = ['a', 'b', 'c']
lst2 = (True, False, True)
for i, ls in enumerate(lst1):
    print(i)

0
1
2


In [43]:
nl = [l1 for i, l1 in enumerate(lst1) if lst2[i]==True]

In [44]:
nl

['a', 'c']

In [46]:
dict(zip(lst1, lst2))

{'a': True, 'b': False, 'c': True}

In [49]:
lst1.index('p')

ValueError: 'p' is not in list

In [73]:
t = {'a':[False,1], 'b':[True, 2], 'c':[False, 3]}

In [76]:
for i, (k, v) in enumerate(t.items()):
    print(i, k, v[0], v[1])

0 a False 1
1 b True 2
2 c False 3


In [82]:
a = [(k, v[1]) for i, (k, v) in enumerate(t.items()) if v[0] == False]

In [95]:
a, b = (2, None)

In [96]:
(a, b) == (2, 1)

False

In [97]:
a is not None and b is not None

False

In [11]:
from json_parser import *

In [12]:
parser = JSONParser(json_value='v', json_column='calt')

In [13]:
parser.add_json(d=test['080018'], name='080018')

In [42]:
len(parser._containers['080018']._dict['Women'][4][4][4][4][4][4])


5

In [14]:
parser.parse_data()

ValueError: arrays must all be same length