In [1]:
import json
import requests
import time
import string
import pandas as pd

In [2]:
def request_mun(mun_id, timeout=30):
    """
    Fetches the idescat EMEX record of a single municipality.

    Parameters:
        mun_id: id of the municipality, sent as the `id` query parameter
        timeout: seconds to wait for the server before giving up (default 30)

    Returns the decoded JSON body of the response.

    Raises requests.HTTPError when the server answers with an error status,
    and a requests timeout exception when no answer arrives in time.
    """
    mun_res = requests.get(
        "https://api.idescat.cat/emex/v1/dades.json",
        # Let requests build and escape the query string instead of formatting it by hand.
        params={"id": mun_id, "lang": "en"},
        # Without a timeout a stalled connection would block the notebook forever.
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of treating an error body as municipality data.
    mun_res.raise_for_status()
    return mun_res.json()
   

def create_meta_municipalities(timeout=30):
    """
    Retrieves a dictionary containing all municipalities from the idescat. Structure of the dict:
        key: id of the mun
        value: name of the mun (the 'content' field of each entry)

    Parameters:
        timeout: seconds to wait for the server before giving up (default 30)

    Raises requests.HTTPError when the server answers with an error status,
    and a requests timeout exception when no answer arrives in time.
    """
    mun_res = requests.get(
        "https://api.idescat.cat/emex/v1/dades.json",
        params={"lang": "en"},
        # Without a timeout a stalled connection would block the notebook forever.
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of indexing into an error body below.
    mun_res.raise_for_status()

    # The municipality entries live under fitxes -> cols -> col in the response.
    muns = mun_res.json()['fitxes']['cols']['col']
    return {mun['id']: mun['content'] for mun in muns}


def collect_all_muns(meta, n, limit=10):
    """
    Collects the data of the municipalities in `meta`, pausing n seconds between requests.

    Parameters:
        meta: dict whose keys are municipality ids (values are not used here)
        n: seconds to sleep between two consecutive requests
        limit: maximum number of municipalities to fetch; defaults to 10 (the
            previously hard-coded cap). Pass None to fetch every key in `meta`.

    Returns a dict mapping each fetched id to the result of request_mun(id).
    Prints a one-line summary with the number of municipalities and elapsed seconds.
    """
    start_time = time.time()
    municipality_dict = {}
    for identifier in meta:
        if limit is not None and len(municipality_dict) >= limit:
            break
        # Throttle only between requests: no pause before the first or after the last one.
        if municipality_dict:
            time.sleep(n)
        municipality_dict[identifier] = request_mun(identifier)

    print("Retrieved data of {} municipalities in {} seconds".format(
        len(municipality_dict), int(time.time() - start_time)))
    return municipality_dict

In [3]:
# Build the id -> name lookup of the municipalities (one request to the idescat API).
testmeta = create_meta_municipalities()

In [4]:
# Fetch the per-municipality payloads with a 0.2 s pause per request.
# collect_all_muns stops after 10 municipalities, so this is a small sample, not the full set.
test = collect_all_muns(testmeta, 0.2)

Retrieved data of 10 municipalities in 4 seconds


In [59]:
# Display the raw responses, keyed by municipality id (large nested output).
test

{'250019': {'fitxes': {'p': 'id=250019',
   'gg': {'g': [{'tt': {'t': {'ff': {'f': [{'r': '2021',
           'c': 'Surface area',
           'u': 'km²',
           'v': '78.27,1343.09,32108.00',
           'calt': 'Surface area',
           'id': 'f271',
           'updated': '2021-12-23T11:00:00+00:00'},
          {'r': '2013',
           'c': 'Altitude',
           'u': 'm',
           'v': '956,_,_',
           'calt': 'Altitude',
           'id': 'f258',
           'updated': '2014-02-17T11:00:00+00:00'},
          {'r': '2013',
           'c': 'Longitude',
           'u': 'º',
           'v': '1.092892,_,_',
           'calt': 'Longitude',
           'id': 'f328',
           'updated': '2014-02-17T11:00:00+00:00'},
          {'r': '2013',
           'c': 'Latitude',
           'u': 'º',
           'v': '42.162392,_,_',
           'calt': 'Latitude',
           'id': 'f329',
           'updated': '2014-02-17T11:00:00+00:00'},
          {'r': '2013',
           'c': 'UTM coordinates

In [5]:
# Write the raw response of municipality 250019 to test.json in the working directory.
with open("test.json", 'w') as file:
    json.dump(test['250019'], file)

In [149]:

# def parse_mun_to_dict_of_list(data):
#     """Function built before we knew that the data is not complete."""
#     columns = {}
#     for key in data.keys():
#         print(data[key]['fitxes']['cols']['col'][0]['id'])
#         print(key)
#         assert data[key]['fitxes']['cols']['col'][0]['id'] == key
        
        
#         for g in data[key]['fitxes']['gg']['g']:
            
#             if type(g['tt']['t']) == type(dict):
                
#                 if type(g['tt']['t']['ff']['f']) == type(dict):
#                     value = value_splitter(g['tt']['t']['ff']['f']['v'])
#                     if g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt'] in columns.keys():
#                         columns[g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt']].append(value)
#                     else:
#                         columns[g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt']] = [value]
#                 else:
#                     for f in g['tt']['t']['ff']['f']:
#                         value = value_splitter(f['v'])
#                         if g['tt']['t']['c'] + "_" + f['calt'] in columns.keys():
#                             columns[g['tt']['t']['c'] + "_" + f['calt']].append(value)
#                         else:
#                             columns[g['tt']['t']['c'] + "_" + f['calt']] = [value]
#             else:
#                 for t in g['tt']['t']:
#                     if type(t['ff']['f']) == type(dict):
#                         value = value_splitter(t['ff']['f']['v'])
#                         if t['c'] + "_" + t['ff']['f']['calt'] in columns.keys():
#                             columns[t['c'] + "_" + t['ff']['f']['calt']].append(value)
#                         else:
#                             columns[t['c'] + "_" + t['ff']['f']['calt']] = [value]
#                     else:
#                         for f in t['ff']['f']:
#                             value = value_splitter(f['v'])
#                             if t['c'] + "_" + f['calt'] in columns.keys():
#                                 columns[t['c'] + "_" + f['calt']].append(value)
#                             else:
#                                 columns[t['c'] + "_" + f['calt']] = [value]
                    
#     return columns        

In [179]:
# def value_splitter(value):
#     assert type(value) == str
#     mun, com, ca = value.split(',')
#     return mun, com, ca


# def is_not_in_column_names(column_names, name):
#     if name in column_names:
#         return False
#     else:
#         return True

        
# def get_column_names_from_muns(data):
#     """Returns all the names of the columns which are in the data set
#     It looks super cumbersome because the people from idescat do not know how to keep consistency in their data
#     """
#     column_names = []
#     # For each municipality in the data, go over the json.
#     for key in data.keys():
#         # Check if the municipality is really that municipality.
#         assert data[key]['fitxes']['cols']['col'][0]['id'] == key
        
#         # go over all 'groups' in the dictoinary structure
#         for g in data[key]['fitxes']['gg']['g']:
            
#             # Check if the group contains one or more tables. The group holds a list of tables when there
#             # are multiple tables in the group, and a single dictionary when there is only one. When this if
#             # statement is true, the next element is treated as a dictionary; when false, as a list.
#             if type(g['tt']['t']) == dict:

#                 # Check if the table contains one or more rows. The table holds a list of rows when there
#                 # are multiple rows in the table, and a single dictionary when there is only one. When this if
#                 # statement is true, the next element is treated as a dictionary; when false, as a list.
#                 if type(g['tt']['t']['ff']['f']) == dict:
                    
#                     # Check if the name is already in the column names list, and otherwise append it to the list.
#                     if is_not_in_column_names(column_names, g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt']):
#                         column_names.append(g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt'])
#                 else:
#                     for f in g['tt']['t']['ff']['f']:
#                         # Check if the name is already in the column names list, and otherwise append it to the list.
#                         if is_not_in_column_names(column_names, g['tt']['t']['c'] + "_" + f['calt']):
#                             column_names.append(g['tt']['t']['c'] + "_" + f['calt'])
#                         #print(f['calt'])
#             else:
#                 for t in g['tt']['t']:
                    
#                     # Check if the table contains one or more rows. The table holds a list of rows when there
#                     # are multiple rows in the table, and a single dictionary when there is only one. When this if
#                     # statement is true, the next element is treated as a dictionary; when false, as a list.
#                     if type(t['ff']['f']) == dict:
#                         # Check if the name is already in the column names list, and otherwise append it to the list.
#                         if is_not_in_column_names(column_names, t['c'] + "_" + t['ff']['f']['calt']):
#                             column_names.append(t['c'] + "_" + t['ff']['f']['calt'])

#                     else:
#                         for f in t['ff']['f']:
#                             # Check if the name is already in the column names list, and otherwise append it to the list.
#                             if is_not_in_column_names(column_names, t['c'] + "_" + f['calt']):
#                                 column_names.append(t['c'] + "_" + f['calt'])
#     return column_names
        
    
# def make_col_dict(column_names):
#     col_dict = {}
#     for col in column_names:
#         col_dict[col] = []
#     return col_dict


# def retrieve_mun_from_json(data, columns):
#     column_names = list(columns.keys())
#     for g in data['fitxes']['gg']['g']:
            
#         # Check if the group contains one or more tables. The group holds a list of tables when there
#         # are multiple tables in the group, and a single dictionary when there is only one. When this if
#         # statement is true, the next element is treated as a dictionary; when false, as a list.
#         if type(g['tt']['t']) == dict:

#             # Check if the table contains one or more rows. The table holds a list of rows when there
#             # are multiple rows in the table, and a single dictionary when there is only one. When this if
#             # statement is true, the next element is treated as a dictionary; when false, as a list.
#             if type(g['tt']['t']['ff']['f']) == dict:
#                 v, _, _ = value_splitter(g['tt']['t']['ff']['f']['v'])
#                 columns[g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt']].append(v)
#                 print(g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt'])
#                 print(g['tt']['t']['ff']['f']['r'])
#                 column_names.remove(g['tt']['t']['c'] + "_" + g['tt']['t']['ff']['f']['calt'])
#             else:
#                 for f in g['tt']['t']['ff']['f']:
#                     v, _, _ = value_splitter(f['v'])
#                     columns[g['tt']['t']['c'] + "_" + f['calt']].append(v)
#                     print(g['tt']['t']['c'] + "_" + f['calt'])
#                     print('f')
#                     column_names.remove(g['tt']['t']['c'] + "_" + f['calt'])
#         else:
#             for t in g['tt']['t']:
                    
#                 # Check if the table contains one or more rows. The table holds a list of rows when there
#                 # are multiple rows in the table, and a single dictionary when there is only one. When this if
#                 # statement is true, the next element is treated as a dictionary; when false, as a list.
#                 if type(t['ff']['f']) == dict:
#                     v, _, _ = value_splitter(t['ff']['f']['v'])
#                     columns[t['c'] + "_" + t['ff']['f']['calt']].append(v)
#                     print(t['c'] + "_" + t['ff']['f']['calt'])
#                     print(t['ff']['f']['r'])
#                     column_names.remove(t['c'] + "_" + t['ff']['f']['calt'])
#                 else:
#                     for f in t['ff']['f']:                     
#                         v, _, _ = value_splitter(f['v'])
#                         columns[t['c'] + "_" + f['calt']].append(v)
#                         print(t['c'] + "_" + f['calt'])
#                         print(f['r'])
#                         column_names.remove(t['c'] + "_" + f['calt'])
    
#     for c in column_names:
#         columns[c].append('nan')
#     return columns
    

In [60]:
# List the municipality ids that were collected.
test.keys()

dict_keys(['250019', '080018', '250024', '250030', '080023', '170010', '080142', '430017', '170025', '250387'])

In [61]:
def add_muns_to_parser(parser, muns):
    """
    Registers every municipality payload in `muns` with `parser`.

    For each (key, value) pair of `muns`, calls parser.add_json(d=value, name=key)
    in the dict's iteration order. Returns None.
    """
    for mun_id, payload in muns.items():
        parser.add_json(name=mun_id, d=payload)

In [62]:
def change_dicts_to_panda_df(dicts):
    """
    Converts a dict of dicts into a dict of DataFrames.

    Each inner dict becomes a DataFrame whose index is the single label
    [key], stored under that same key in the returned dict.
    """
    return {
        label: pd.DataFrame(record, index=[label])
        for label, record in dicts.items()
    }

In [63]:
def concatenate_dict_of_dfs(dfs):
    """
    Stacks the DataFrames stored in `dfs` row-wise into a single DataFrame.

    Parameters:
        dfs: dict whose values are DataFrames; the keys are not used

    Returns the concatenation of all values in the dict's iteration order.
    An empty dict yields an empty DataFrame.
    """
    frames = list(dfs.values())
    if not frames:
        # pd.concat raises on an empty sequence; return a neutral empty frame instead.
        return pd.DataFrame()
    # A single concat over the whole list avoids re-copying the growing frame per item.
    return pd.concat(frames)

In [54]:
# Combine the per-municipality frames into one table (one row per municipality).
# NOTE(review): `ptest` is only assigned in a cell further down this notebook, so this
# cell fails on a fresh top-to-bottom run; move it below that assignment.
concatenate_dict_of_dfs(ptest)

Unnamed: 0,Territory_Geographic_indicators_Surface_area_Surface_area,Territory_Geographic_indicators_Altitude_Altitude,Territory_Geographic_indicators_Longitude_Longitude,Territory_Geographic_indicators_Latitude_Latitude,Territory_Geographic_indicators_UTM_coordinates_x_UTM_coordinates_x,Territory_Geographic_indicators_UTM_coordinates_y_UTM_coordinates_y,Population_Geographic_indicators_UTM_coordinates_y_UTM_coordinates_y_Density_Density_of_population_Surface_area_Surface_area,Population_Geographic_indicators_UTM_coordinates_y_UTM_coordinates_y_Density_Density_of_population_Density_Density,Population_Geographic_indicators_UTM_coordinates_y_UTM_coordinates_y_Sex_Population._By_sex_Men_Men,Population_Geographic_indicators_UTM_coordinates_y_UTM_coordinates_y_Sex_Population._By_sex_Women_Women,...,Economic_sectors_Type_of_dwellings_Family_dwellings._By_type_Family_dwellings_Total_Fleet_of_vehicles_Fleet_of_vehicles_Cars_Estate_cars,Economic_sectors_Type_of_dwellings_Family_dwellings._By_type_Family_dwellings_Total_Fleet_of_vehicles_Fleet_of_vehicles_Motorcycles_Motorcycles,Economic_sectors_Type_of_dwellings_Family_dwellings._By_type_Family_dwellings_Total_Fleet_of_vehicles_Fleet_of_vehicles_Industrial_vehicles_Industrial_vehicles,Economic_sectors_Type_of_dwellings_Family_dwellings._By_type_Family_dwellings_Total_Fleet_of_vehicles_Fleet_of_vehicles_Other__vehicles_Others,Economic_sectors_Type_of_dwellings_Family_dwellings._By_type_Family_dwellings_Total_Fleet_of_vehicles_Fleet_of_vehicles_Fleet_of_vehicles_Total,Environment_Type_of_dwellings_Family_dwellings._By_type_Family_dwellings_Total_Municipal_waste_Municipal_waste_Municipal_waste_per_capita_Generation_per_capita,Environment_Type_of_dwellings_Family_dwellings._By_type_Family_dwellings_Total_Municipal_waste_Municipal_waste_Selective_collection_of_municipal_waste_Selective_collection,Environment_Type_of_dwellings_Family_dwellings._By_type_Family_dwellings_Total_Industrial_waste_Industrial_waste_Establishments_with_ye
arly_ind._waste_declarations_Establishments_with_yearly_waste_declarations,Environment_Type_of_dwellings_Family_dwellings._By_type_Family_dwellings_Total_Industrial_waste_Industrial_waste_Industrial_waste_Industrial_waste,Quality_of_life_Level_of_education_attained_Population_aged_15_and_over._By_level_of_education_attained_Contributory_accounts_Total_Type_of_tenancy_of_dwellings_Main_family_dwellings._By_type_of_tenancy_Owned_main_family_homes_Owned
80018,19.94,105,1.9031,41.518531,408471,4596983,19.94,632.9,6373,6247,...,,,,,,,,,,
250019,78.27,956,1.092892,42.162392,342450,4669650,78.27,2.2,89,82,...,175.0,13.0,189.0,135.0,512.0,1.24,40.7,0.0,0.0,
250024,160.57,642,0.762617,42.002881,314700,4652600,160.57,3.7,322,266,...,367.0,69.0,191.0,75.0,702.0,1.6,43.6,0.0,0.0,
250030,79.65,337,1.099819,41.788086,342100,4628075,79.65,68.6,2763,2701,...,,,,,,,,,,1534.0
80023,43.32,480,1.632131,41.740044,386250,4621900,43.32,6.6,154,132,...,8645.0,168.0,6314.0,2749.0,17876.0,1.37,44.0,1.0,2.24,
170010,27.73,166,2.847522,42.395189,487450,4693750,27.73,31.9,445,440,...,559.0,82.0,187.0,28.0,856.0,1.54,48.9,2.0,45.66,


In [46]:
# Register every collected municipality with the parser.
# NOTE(review): `parser` is created in a cell further down (`parser = JSONParser(...)`),
# so this cell fails on a fresh top-to-bottom run.
add_muns_to_parser(parser, test)

In [47]:
# Turn every parsed container into a DataFrame, keyed by municipality id.
# NOTE(review): `containers` is assigned in a cell further down, and the saved output
# (a list of DataFrame class names) no longer matches change_dicts_to_panda_df, which
# prints nothing — re-run to refresh.
ptest = change_dicts_to_panda_df(containers)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [48]:
# Confirm that the entry for municipality 080018 is a DataFrame.
print(type(ptest['080018']))

<class 'pandas.core.frame.DataFrame'>


In [17]:
# Inspect which names the parser currently holds (reads the private `_json_data` attribute).
parser._json_data.keys()

dict_keys(['080018', '250019', '250024', '250030', '080023', '170010', '080142', '430017', '170025', '250387'])

In [5]:
from json_parser import *

In [6]:
# Create the parser. In the raw responses 'v' holds the values and 'c' / 'calt' hold the names;
# presumably JSONParser reads values from 'v' and builds column names from 'c' and 'calt' —
# JSONParser comes from json_parser and is not visible here, TODO confirm.
parser = JSONParser(json_value='v', json_column_name=['c', 'calt'])

In [7]:
# Register a fixed selection of the collected municipalities, in this order.
for mun_id in ('080018', '250019', '250024', '250030', '080023', '170010'):
    parser.add_json(d=test[mun_id], name=mun_id)

# Attach the key and value transformation hooks (both imported from json_parser).
parser.add_key_map(homogenize_key)
parser.add_value_map(value_splitter)

In [8]:
# Presumably parses every document registered via add_json — implementation not visible here.
# NOTE(review): the saved output is a long run of `None` lines, which looks like a leftover
# debug print inside the parser; confirm and silence it.
parser.parse_data()

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


In [9]:
# Peek at the parser's private `_holder._keys` state (empty dict in the saved output;
# its meaning is not visible from this notebook).
parser._holder._keys

{}

In [10]:
# Fetch the parsed containers and print how many entries each one holds.
# Differing lengths in the output mean the municipalities do not all have the same fields.
containers = parser.containers()
for name, con in containers.items():
    print(len(con))

211
168
168
212
161
169


In [74]:
# Scratch check: membership test against dict.keys() (equivalent to `1 in dct`).
dct={1:'a'}
1 in dct.keys()

True

In [11]:
# Print every parsed value of municipality 250019 next to its key (value first, then key).
for key, value in containers['250019'].items():
    print(f'{value} \t {key}')

78.27 	 Territory_Geographic_indicators_Surface_area_Surface_area
956 	 Territory_Geographic_indicators_Altitude_Altitude
1.092892 	 Territory_Geographic_indicators_Longitude_Longitude
42.162392 	 Territory_Geographic_indicators_Latitude_Latitude
342450 	 Territory_Geographic_indicators_UTM_coordinates_x_UTM_coordinates_x
4669650 	 Territory_Geographic_indicators_UTM_coordinates_y_UTM_coordinates_y
78.27 	 Population_Density_Density_of_population_Surface_area_Surface_area
2.2 	 Population_Density_Density_of_population_Density_Density
89 	 Population_Sex_Population._By_sex_Men_Men
82 	 Population_Sex_Population._By_sex_Women_Women
171 	 Population_Sex_Population._By_sex_Population_Total
16 	 Population_Age_groups_Population._By_age_groups_Population_from_0_to_14_years_From_0_to_14_years
110 	 Population_Age_groups_Population._By_age_groups_Population_from_15_to_64_years_From_15_to_64_years
33 	 Population_Age_groups_Population._By_age_groups_Population_from_65_to_84_years_From_65_to_84_

In [63]:
# Build one DataFrame per municipality from its parsed container.
# NOTE(review): pd1 is built without an explicit index while pd2 uses index=[0]; the output
# of the print loop below shows two identical rows (0 and 1) for pd1 — confirm whether that
# duplication is intended.
pd1 = pd.DataFrame(containers['080018'])
pd2 = pd.DataFrame(containers['250019'], index=[0])

In [64]:
# Stack the two frames row-wise; columns missing from one frame show up as NaN in the output.
pd3 = pd.concat([pd1,pd2])

In [65]:
# Print each column of the combined frame as a Series to eyeball values and dtypes.
for col in pd3.columns:
    print(pd3[col])

0    19.94
1    19.94
0    78.27
Name: Territory_Geographic_indicators_Surface_area, dtype: object
0    105
1    105
0    956
Name: Territory_Geographic_indicators_Altitude, dtype: object
0    1.903100
1    1.903100
0    1.092892
Name: Territory_Geographic_indicators_Longitude, dtype: object
0    41.518531
1    41.518531
0    42.162392
Name: Territory_Geographic_indicators_Latitude, dtype: object
0    408471
1    408471
0    342450
Name: Territory_Geographic_indicators_UTM_coordinates_x, dtype: object
0    4596983
1    4596983
0    4669650
Name: Territory_Geographic_indicators_UTM_coordinates_y, dtype: object
0    19.94
1    19.94
0    78.27
Name: Population_Geographic_indicators_UTM_coordinates_y_Density_of_population_Surface_area, dtype: object
0    632.9
1    632.9
0      2.2
Name: Population_Geographic_indicators_UTM_coordinates_y_Density_of_population_Density, dtype: object
0    6373
1    6373
0      89
Name: Population_Geographic_indicators_UTM_coordinates_y_Population._By_sex_Me

Name: Quality_of_life_Population_aged_15_and_over._By_level_of_education_attained_Total_Family_dwellings._By_type_Main, dtype: object
0    503
1    503
0    NaN
Name: Quality_of_life_Population_aged_15_and_over._By_level_of_education_attained_Total_Family_dwellings._By_type_Secondary, dtype: object
0    269
1    269
0    NaN
Name: Quality_of_life_Population_aged_15_and_over._By_level_of_education_attained_Total_Family_dwellings._By_type_Vacant, dtype: object
0    5222
1    5222
0     NaN
Name: Quality_of_life_Population_aged_15_and_over._By_level_of_education_attained_Total_Family_dwellings._By_type_Total, dtype: object
0    335
1    335
0    NaN
Name: Quality_of_life_Population_aged_15_and_over._By_level_of_education_attained_Total_Main_family_dwellings._By_type_of_tenancy_Rented, dtype: object
0    4450
1    4450
0     NaN
Name: Quality_of_life_Population_aged_15_and_over._By_level_of_education_attained_Total_Main_family_dwellings._By_type_of_tenancy_Total, dtype: object
0    1975
1 

0    NaN
1    NaN
0    512
Name: Economic_sectors_Family_dwellings._By_type_Total_Fleet_of_vehicles_Total, dtype: object
0     NaN
1     NaN
0    1.24
Name: Environment_Family_dwellings._By_type_Total_Municipal_waste_Generation_per_capita, dtype: object
0     NaN
1     NaN
0    40.7
Name: Environment_Family_dwellings._By_type_Total_Municipal_waste_Selective_collection, dtype: object
0    NaN
1    NaN
0      0
Name: Environment_Family_dwellings._By_type_Total_Industrial_waste_Establishments_with_yearly_waste_declarations, dtype: object
0     NaN
1     NaN
0    0.00
Name: Environment_Family_dwellings._By_type_Total_Industrial_waste_Industrial_waste, dtype: object
