In [1]:
from pyrosm.data import sources
import pyrosm

In [2]:
#print(sources.cities.available)
if "Amsterdam" in sources.cities.available:
    print(True)

True


In [3]:
#Parse tags
from collections import Counter, defaultdict
import json
import pandas as pd
import ast

def parse_tags(val):
    if pd.isna(val) or val in ["None", "nan", None]:
        return {}
    try:
        return ast.literal_eval(val) if isinstance(val, str) else val
    
    except json.JSONDecodeError:
        return {}

#pois["tags"] = pois["tags"].apply(parse_tags) #only need to run it once



In [4]:
def create_tag_lists(pois, n):
    tag_freq = defaultdict(int)
    #print(tag_freq)



    for idx, row in pois.iterrows(): 
        tags = row.get("tags", {})
        #print(f"Row {idx}, Tags: {tags}, Type: {type(tags)}")  # Check type of tags

        if isinstance(tags, dict) and tags:
            for tag_key in tags:
                tag_freq[tag_key] += 1


    # print(tag_freq)
    all_tags = list(tag_freq.keys())
    all_good_tags = [tag for tag in tag_freq if tag_freq[tag] > n]
    tag2idx = {tag: i for i, tag in enumerate(all_good_tags)}
    idx2tag = {i: tag for tag, i in tag2idx.items()}

    tag_freq = dict(sorted(tag_freq.items(), key = lambda x: x[1], reverse = True))
    print(f"All tags sorted by frequency: {tag_freq}")
    print(f" All allowed tags: {all_good_tags}")
    # print(tag2idx)
    # print(idx2tag)
    print(f"Len all tags: {len(all_tags)}, Len good tags: {len(all_good_tags)}")
    print("\n Returning (all_good_tags, tag2idx)")


    return (all_good_tags, tag2idx)

In [5]:
def remove_bad_tags(good_tags, pois):
    for i in pois["tags"]:
        remove_list = []
        for j in i:
            if j not in good_tags:
                remove_list.append(j)
        for removable in remove_list:
            #print(f"removing {removable} from {i}")
            i.pop(removable)
            #print(f"after remnoval: {i}")           
   

In [6]:
import numpy as np
def tags_to_vec(tag_dict, tag2idx):
    vector = np.zeros(len(tag2idx), dtype= np.float32)
    if isinstance(tag_dict, dict):
        for tag in tag_dict:
            if tag in tag2idx:
                vector[tag2idx[tag]] = 1
    return vector

In [7]:
def vector_pois(pois, tag2idx, n):
    X = []
    y = []
    N_shuffle = 4

    for tag in pois["tags"]:
        if not isinstance(tag, dict) or len(tag) < n: # set the len to 1 will include empty {}
            continue

        tag_keys = list(tag.keys())
        for _ in range(N_shuffle):

            np.random.shuffle(tag_keys)
            
            mid_idx = len(tag_keys) // 2
            #print(mid_idx)
            input_tags = {k: tag[k] for k in tag_keys[:mid_idx]}
            output_tags = {k: tag[k] for k in tag_keys[mid_idx:]}

            # print("input_tags", input_tags)
            # print("   ")
            # print("output_tags", output_tags)
            # print("--------------------------------")
            vec = tags_to_vec(input_tags, tag2idx)
            #print("vector",vec) #tag to vector testing


            X.append(tags_to_vec(input_tags, tag2idx))
            y.append(tags_to_vec(output_tags, tag2idx))

    X = np.stack(X)
    #print(X)
    y = np.stack(y)
    print("X shape:", X.shape)

    print("y_shape", y.shape)

    return X,y

In [8]:
fp = pyrosm.get_data("Amsterdam")
osm = pyrosm.OSM(fp)
pois = osm.get_pois()
pois["tags"] = pois["tags"].apply(parse_tags) #only need to run it once


In [9]:
n_good_tags = 30 # Frequency of tags for them to be allowed
n_per_instance = 2 # Amount of tags an instance needs to have to be part of the data

good_tags, tag2idx = create_tag_lists(pois, n_good_tags)
remove_bad_tags(good_tags, pois)
X, Y = vector_pois(pois, tag2idx, n_per_instance)

All tags sorted by frequency: {'capacity': 23189, 'surface': 23145, 'parking_space': 15738, 'access': 14730, 'smoothness': 14013, 'source:date': 13136, 'fee': 10279, 'orientation': 9062, 'wheelchair': 6794, 'backrest': 6263, 'brand': 4630, 'recycling_type': 4610, 'brand:wikidata': 4455, 'check_date': 4434, 'maxstay:conditional': 4007, 'material': 3736, 'lit': 3240, 'cuisine': 3057, 'operator:wikidata': 2857, 'covered': 2682, 'brand:wikipedia': 2173, 'waste': 1988, 'ref:bag': 1546, 'colour': 1495, 'note': 1308, 'artwork_type': 1276, 'outdoor_seating': 1236, 'seats': 1222, 'recycling:waste': 1209, 'wikidata': 1054, 'brand:website': 1040, 'recycling:paper': 1021, 'operator:wikipedia': 954, 'takeaway': 911, 'layer': 862, 'description': 827, 'recycling:glass_bottles': 822, 'artist_name': 820, 'level': 777, 'healthcare': 685, 'wikimedia_commons': 673, 'indoor_seating': 651, 'vending': 622, 'contact:facebook': 591, 'branch': 573, 'denomination': 567, 'payment:cash': 556, 'shelter_type': 548, 

In [10]:
# !pip install tensorflow

In [11]:
import tensorflow as tf
import keras
#from tensorflow.python import keras
from keras import layers

In [12]:
print(X[1], Y[0])
print(pois["tags"][0])

[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [13]:
from sklearn.model_selection import train_test_split

In [33]:
X_temp, X_test, y_temp, y_test = train_test_split(
    X, Y, test_size = 0.2, random_state = 42)

X_train, X_val, y_train, y_val = train_test_split(
X_temp, y_temp, test_size = 0.2, random_state = 42 )

In [37]:
import numpy as np
X_train = np.array(X_train)
X_test = np.array(X_test)
X_val = np.array(X_val)
y_train = np.array(y_train)
y_test = np.array(y_test)
y_val =np.array(y_val)



In [38]:
#X_train.shape
y_val.shape


(35490, 263)

In [18]:
def class_weight(y_train):
    epislon = 1e-7
    tag_counts = np.sum(y_train, axis = 0)
    total_samples = y_train.shape[0]

    tag_counts = np.where(tag_counts ==0, 1, tag_counts)
    class_weights = total_samples / tag_counts
    class_weights = class_weights / np.sum(class_weights)* len(class_weights)
    return class_weights

In [None]:
def weighted_binary_crossentropy(class_weights):
    class_weights = tf.constant(class_weights, dtype = tf.float32)
    def loss_function(y_true,y_pred):
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1 - 1e-7)
        loss = -(
            class_weights * y_true * tf.math.log(y_pred)+ (
                1 - y_true) * tf.math.log(1-y_pred)
        )
        return tf.reduce_mean(loss)
    return loss_function

weights = class_weight(y_train)

In [29]:
model= keras.Sequential([
keras.layers.Input(shape = X_train[1].shape),
layers.Dense(256, activation = "relu", name = "Layer1"),
layers.Dropout(0.2),
layers.Dense(128, activation = "relu", name = "Layer2"),
# layers.Dense(64, activation = "relu",  name = "Layer3"),
layers.Dense(y_train.shape[1], activation = "sigmoid", name = "Layer4")



]
)
#model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss = tf.keras.losses.BinaryCrossentropy(from_logits= True), metrics = ["binary_accuracy"])
loss_func = weighted_binary_crossentropy(weights)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryFocalCrossentropy,
            metrics=["accuracy"])


In [30]:
model.summary()

In [42]:
model.fit(X_train, y_train, epochs = 10, batch_size =32, verbose = 1 , validation_data= (X_val, y_val))

Epoch 1/10
[1m4437/4437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 11ms/step - accuracy: 0.3906 - loss: 0.0032 - val_accuracy: 0.3631 - val_loss: 0.0034
Epoch 2/10
[1m4437/4437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 11ms/step - accuracy: 0.3951 - loss: 0.0032 - val_accuracy: 0.3687 - val_loss: 0.0034
Epoch 3/10
[1m4437/4437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 10ms/step - accuracy: 0.3951 - loss: 0.0032 - val_accuracy: 0.4061 - val_loss: 0.0034
Epoch 4/10
[1m4437/4437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 10ms/step - accuracy: 0.3990 - loss: 0.0032 - val_accuracy: 0.3625 - val_loss: 0.0034
Epoch 5/10
[1m4437/4437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 10ms/step - accuracy: 0.3939 - loss: 0.0032 - val_accuracy: 0.4247 - val_loss: 0.0035
Epoch 6/10
[1m4437/4437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 10ms/step - accuracy: 0.3913 - loss: 0.0032 - val_accuracy: 0.3712 - val_loss: 0.0034
Epoc

<keras.src.callbacks.history.History at 0x1790815bb80>

In [52]:
# customize threshold for each tags
from sklearn.metrics import f1_score, precision_score, recall_score, multilabel_confusion_matrix
#y_pred = model.predict(X_test)
y_val_prob = model.predict(X_val)
def find_best_threshold(y_true, y_probs, thresholds = np.arange(0.05, 0.95, 0.05)):
    best_threshold = []
    
    for tag_idx in range(y_true.shape[1]):
        best_thresh = 0.5
        best_f1 = 0
        for thresh in thresholds:
            preds = (y_probs[:,tag_idx] >= thresh).astype(int)
            f1 = f1_score(y_true[:, tag_idx], preds, zero_division= 0)
            if f1 > best_f1:
                best_f1 = f1
                best_thresh = thresh
        best_threshold.append(best_thresh)
        
    return best_threshold

best_thresholds = find_best_threshold(y_val, y_val_prob)

[1m1110/1110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


In [53]:
def apply_thresholds(y_probs, thresholds):
    return (y_probs >= thresholds).astype(int)
y_test_probs = model.predict(X_test)

y_test_pred = apply_thresholds(y_test_probs, best_thresholds)

[1m1387/1387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step


In [54]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.27      0.47      0.34      1646
           1       0.44      0.50      0.47      2600
           2       0.30      0.24      0.27       317
           3       0.63      0.65      0.64      4694
           4       0.79      0.60      0.68      1226
           5       0.49      0.90      0.64      3855
           6       0.65      0.49      0.56       487
           7       0.55      0.59      0.57       332
           8       0.51      0.63      0.56        63
           9       0.24      0.30      0.27        54
          10       0.93      0.81      0.87      2050
          11       0.86      0.34      0.49        73
          12       0.59      0.55      0.57       385
          13       0.81      0.70      0.75      5844
          14       0.01      0.86      0.01        14
          15       0.34      0.27      0.30       355
          16       0.61      0.41      0.49       369
          17       0.87    

In [55]:
idx2tag = {i: tag for tag, i in tag2idx.items()}

def vector_to_tags(binary_vec, idx2tag):
    return [idx2tag[i] for i, val in enumerate(binary_vec) if val == 1]


for i in range(20):  # check first 10 predictions
    predicted_tags = vector_to_tags(y_test_pred[i], idx2tag)
    actual_tags = vector_to_tags(y_test[i], idx2tag)
    
    print(f"POI {i}")
    print("Predicted:", predicted_tags)
    print("Actual   :", actual_tags)
    print("-----------")

POI 0
Predicted: ['studio']
Actual   : ['cuisine']
-----------
POI 1
Predicted: ['fee', 'access', 'surface']
Actual   : ['fee', 'access', 'surface']
-----------
POI 2
Predicted: ['operator:wikidata', 'operator:wikipedia']
Actual   : ['operator:wikidata', 'operator:wikipedia']
-----------
POI 3
Predicted: ['check_date', 'wheelchair', 'contact:instagram', 'ref:gers']
Actual   : ['contact:instagram', 'cuisine']
-----------
POI 4
Predicted: ['surface', 'lit', 'parking_space']
Actual   : ['surface', 'lit', 'parking_space']
-----------
POI 5
Predicted: ['source:date']
Actual   : ['wheelchair']
-----------
POI 6
Predicted: ['smoothness', 'parking_space']
Actual   : ['smoothness', 'parking_space']
-----------
POI 7
Predicted: ['operator:wikidata', 'recycling:glass_bottles', 'location', 'recycling:shoes']
Actual   : ['wheelchair', 'recycling:shoes']
-----------
POI 8
Predicted: ['smoothness', 'surface', 'parking_space']
Actual   : ['smoothness', 'surface', 'parking_space']
-----------
POI 9
Pre