In [3]:
#!pip install scikit-multilearn

In [4]:
from pyrosm.data import sources
import pyrosm
from collections import Counter, defaultdict
import json
import pandas as pd
import ast
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from skmultilearn.model_selection import iterative_train_test_split

In [5]:
#Parse tags

def parse_tags(val):
    if pd.isna(val) or val in ["None", "nan", None]:
        return {}
    try:
        return ast.literal_eval(val) if isinstance(val, str) else val
    
    except json.JSONDecodeError:
        return {}

#pois["tags"] = pois["tags"].apply(parse_tags) #only need to run it once

In [6]:
def create_tag_lists(pois, n):
    tag_freq = defaultdict(int)
    #print(tag_freq)



    for idx, row in pois.iterrows(): 
        tags = row.get("tags", {})
        #print(f"Row {idx}, Tags: {tags}, Type: {type(tags)}")  # Check type of tags

        if isinstance(tags, dict) and tags:
            for tag_key in tags:
                tag_freq[tag_key] += 1


    # print(tag_freq)
    all_tags = list(tag_freq.keys())
    all_good_tags = [tag for tag in tag_freq if tag_freq[tag] > n]
    tag2idx = {tag: i for i, tag in enumerate(all_good_tags)}
    idx2tag = {i: tag for tag, i in tag2idx.items()}

    tag_freq = dict(sorted(tag_freq.items(), key = lambda x: x[1], reverse = True))
    print(f"All tags sorted by frequency: {tag_freq}")
    print(f" All allowed tags: {all_good_tags}")
    # print(tag2idx)
    # print(idx2tag)
    print(f"Len all tags: {len(all_tags)}, Len good tags: {len(all_good_tags)}")
    #print("\n Returning (all_good_tags, tag2idx)")


    return (all_good_tags, tag2idx)

In [7]:
def remove_bad_tags(good_tags, pois):
    for i in pois["tags"]:
        remove_list = []
        for j in i:
            if j not in good_tags:
                remove_list.append(j)
        for removable in remove_list:
            #print(f"removing {removable} from {i}")
            i.pop(removable)
            #print(f"after remnoval: {i}")           

In [8]:
def tags_to_vec(tag_dict, tag2idx):
    vector = np.zeros(len(tag2idx), dtype= np.float32)
    if isinstance(tag_dict, dict):
        for tag in tag_dict:
            if tag in tag2idx:
                vector[tag2idx[tag]] = 1
    return vector

In [10]:
def vector_pois(pois, tag2idx, n):
    X = []
    y = []

    for tag in pois["tags"]:
        if not isinstance(tag, dict) or len(tag) < n: # set the len to 1 will include empty {}
            continue

        tag_keys = list(tag.keys())

        np.random.shuffle(tag_keys)
        
        mid_idx = len(tag_keys) // 2
        #print(mid_idx)
        input_tags = {k: tag[k] for k in tag_keys[:mid_idx]}
        output_tags = {k: tag[k] for k in tag_keys[mid_idx:]}

        # print("input_tags", input_tags)
        # print("   ")
        # print("output_tags", output_tags)
        # print("--------------------------------")
        #vec = tags_to_vec(input_tags, tag2idx)
        #print("vector",vec) #tag to vector testing


        X.append(tags_to_vec(input_tags, tag2idx))
        y.append(tags_to_vec(output_tags, tag2idx))

    X = np.stack(X)
    #print(X)
    y = np.stack(y)
    print("X shape:", X.shape)

    print("y_shape", y.shape)

    return X,y

def vector_pois_opt(pois, tag2idx, n):
    X = []
    y = []

    for tag in pois["tags"]:
        if not isinstance(tag, dict) or len(tag) < n: # set the len to 1 will include empty {}
            continue

        tag_keys = list(tag.keys())

        np.random.shuffle(tag_keys)
        
        for feature_idx in range(len(tag)//2):
            input_tags = {k: tag[k] for k in tag_keys[:feature_idx] + tag_keys[feature_idx + 1:]}
            output_tags = {tag_keys[feature_idx] : tag[tag_keys[feature_idx]]}


            X.append(tags_to_vec(input_tags, tag2idx))
            y.append(tags_to_vec(output_tags, tag2idx))
    
    X = np.stack(X)
    #print(X)
    y = np.stack(y)
    print("X shape:", X.shape)

    print("y_shape", y.shape)

    return X,y

In [11]:
fp = pyrosm.get_data("Amsterdam")
osm = pyrosm.OSM(fp)
pois = osm.get_pois()
pois["tags"] = pois["tags"].apply(parse_tags) #only need to run it once

In [31]:
n_good_tags = 10 # Frequency of tags for them to be allowed
n_per_instance = 5 # Amount of tags an instance needs to have to be part of the data
                   # If set to 1 it will include empty cells


good_tags, tag2idx = create_tag_lists(pois, n_good_tags)
remove_bad_tags(good_tags, pois)
#X, Y = vector_pois_opt(pois, tag2idx, n_per_instance)
X, Y = vector_pois(pois, tag2idx, n_per_instance)

All tags sorted by frequency: {'surface': 23184, 'capacity': 23044, 'parking_space': 15768, 'access': 14815, 'smoothness': 14031, 'source:date': 13195, 'fee': 10329, 'orientation': 9191, 'wheelchair': 6801, 'backrest': 6314, 'brand': 4639, 'recycling_type': 4633, 'check_date': 4527, 'brand:wikidata': 4461, 'maxstay:conditional': 4009, 'material': 3769, 'lit': 3253, 'cuisine': 3065, 'operator:wikidata': 2889, 'covered': 2711, 'brand:wikipedia': 2156, 'waste': 2040, 'ref:bag': 1526, 'colour': 1499, 'note': 1307, 'artwork_type': 1285, 'outdoor_seating': 1248, 'seats': 1226, 'recycling:waste': 1214, 'wikidata': 1055, 'brand:website': 1046, 'recycling:paper': 1025, 'operator:wikipedia': 954, 'takeaway': 912, 'layer': 861, 'description': 829, 'recycling:glass_bottles': 827, 'artist_name': 821, 'level': 784, 'wikimedia_commons': 711, 'healthcare': 689, 'indoor_seating': 664, 'vending': 629, 'contact:facebook': 596, 'branch': 595, 'payment:cash': 569, 'denomination': 568, 'shelter_type': 549, 

In [32]:
#X_train, X_test, y_train, y_test = train_test_split(X, Y)
#X_val, _, y_val, _ = train_test_split(X, Y)

X_train, y_train, X_test, y_test = iterative_train_test_split(X, Y, test_size=0.2)
X_val, y_val, _, _ = iterative_train_test_split(X, Y, test_size = 0.2)

# Just checking, ive had some issues
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_train shape: (11888, 392)
y_train shape: (11888, 392)
X_test shape: (3006, 392)
y_test shape: (3006, 392)
X_val shape: (11875, 392)
y_val shape: (11875, 392)


In [33]:
check_model = False
if check_model:
    model = RandomForestClassifier(random_state=42)

    # Define search space
    param_dist = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    # Wrap in RandomizedSearchCV
    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=20,  # Try 20 random combinations
        cv=3,       # 3-fold cross-validation
        scoring='f1_micro',  # Or accuracy, or a custom metric
        n_jobs=-1,  # Use all cores
        verbose=1
    )

    search.fit(X_train, y_train)

    # Best model after search
    best_model = search.best_estimator_
    print(best_model)

In [34]:
if not check_model:
    best_model = RandomForestClassifier(max_features='log2', n_estimators=300, random_state=42) # Thepreviousbestmodel

In [35]:
best_model.fit(X_train, y_train)
#best_model.score(X_test, y_test)

In [36]:
def find_best_thresholds(y_true, y_probs, thresholds=np.linspace(0.1, 0.9, 9)):
    best_thresholds = []
    for i in range(y_probs.shape[1]):
        best_f1 = 0
        best_thresh = 0.5
        for t in thresholds:
            preds = (y_probs[:, i] >= t).astype(int)
            score = f1_score(y_true[:, i], preds)
            if score > best_f1:
                best_f1 = score
                best_thresh = t
        best_thresholds.append(best_thresh)
    return best_thresholds

In [37]:
def apply_thresholds(y_probs, thresholds):
    thresholds = np.array(thresholds)
    return (y_probs >= thresholds).astype(int)

In [38]:
#print(Y_probs_val[0][0])

In [39]:
Y_probs_val = best_model.predict_proba(X_val)      
prob_matrix_val = np.array([
    probs[:, 1] if probs.shape[1] > 1 else np.zeros(probs.shape[0])
    for probs in Y_probs_val]).T

thresholds = find_best_thresholds(y_val, prob_matrix_val)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [40]:
print("Min and max probabilities in prob_matrix_test:", prob_matrix_test.min(), prob_matrix_test.max())
print("Thresholds range:", min(thresholds), max(thresholds))


Min and max probabilities in prob_matrix_test: 0.0 1.0
Thresholds range: 0.1 0.6


In [41]:
print("Number of samples in test set:", y_test.shape[0])
print("Number of tags:", y_test.shape[1])
print("Total positive labels in y_test:", y_test.sum())
print("Total positive predictions in Y_preds:", Y_preds.sum())


Number of samples in test set: 3006
Number of tags: 392
Total positive labels in y_test: 10103.0
Total positive predictions in Y_preds: 6121


In [42]:
print("Total labels in full dataset:", Y.sum())
print("Per-tag label count:", Y.sum(axis=0))


Total labels in full dataset: 49794.0
Per-tag label count: [7.320e+02 1.457e+03 2.050e+02 3.461e+03 7.330e+02 1.580e+03 3.490e+02
 2.770e+02 5.000e+01 4.300e+01 1.487e+03 5.800e+01 1.800e+02 3.689e+03
 1.000e+00 1.410e+02 1.370e+02 2.250e+02 3.510e+02 1.120e+02 9.600e+01
 7.900e+01 1.470e+02 1.390e+02 2.300e+01 1.502e+03 2.100e+01 4.600e+01
 4.100e+01 2.660e+02 1.400e+02 1.310e+02 1.100e+02 2.000e+00 5.340e+02
 8.000e+01 1.590e+02 8.690e+02 6.800e+01 1.160e+02 1.400e+01 1.900e+01
 9.500e+01 2.560e+02 1.450e+02 7.700e+01 1.300e+02 5.700e+01 9.400e+01
 1.030e+02 1.600e+01 5.800e+01 3.100e+01 1.010e+02 3.350e+02 3.500e+01
 1.230e+02 2.160e+02 2.400e+01 1.010e+02 2.050e+02 3.740e+02 1.620e+02
 4.300e+01 1.500e+01 4.400e+01 1.780e+02 4.320e+02 7.000e+01 4.000e+00
 2.100e+01 2.000e+01 4.500e+01 1.200e+01 1.000e+01 5.000e+00 1.300e+01
 1.300e+01 1.200e+01 1.300e+01 1.100e+01 1.200e+01 9.100e+01 2.600e+01
 1.330e+02 1.790e+02 3.400e+01 2.140e+02 1.700e+01 6.000e+00 4.000e+00
 3.539e+03 2.400e+

In [43]:
Y_probs_test = best_model.predict_proba(X_test)
prob_matrix_test = np.array([
    probs[:, 1] if probs.shape[1] > 1 else np.zeros(probs.shape[0])
    for probs in Y_probs_test]).T

Y_preds = apply_thresholds(prob_matrix_test, np.array(thresholds))

In [44]:
# Macro = average over tags, treating each equally
print(len(y_test), len(Y_preds), len(X_test))
f1 = f1_score(y_test, Y_preds, average='macro')
precision = precision_score(y_test, Y_preds, average='macro')
recall = recall_score(y_test, Y_preds, average='macro')

print("Macro F1 Score:", f1)
print("Macro Precision:", precision)
print("Macro Recall:", recall)

3006 3006 3006
Macro F1 Score: 0.2863803135094211
Macro Precision: 0.3368093223556371
Macro Recall: 0.27315139389616255


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [45]:
for i in range(len(y_test)):
    #print(f"Prediction = {Y_preds[i]} \t Actual = {y_test[i]}")
    #print(f"length pred = {len(Y_preds[i])} \t len actual = {len(y_test[i])}")
    if len(Y_preds[i]) != len(y_test[i]):
        print("oops")
        print(i)

#print(len(Y_preds))
#print(len(y_test))

In [46]:
print(classification_report(y_test, Y_preds, zero_division=0))

              precision    recall  f1-score   support

           0       0.41      0.29      0.34       148
           1       0.59      0.47      0.52       300
           2       0.00      0.00      0.00        43
           3       0.91      0.90      0.90       692
           4       0.80      0.47      0.59       161
           5       0.54      0.63      0.58       323
           6       0.79      0.51      0.62        74
           7       0.61      0.62      0.62        58
           8       0.33      0.18      0.24        11
           9       0.00      0.00      0.00         9
          10       0.94      0.84      0.88       289
          11       0.71      0.83      0.77        12
          12       0.76      0.50      0.60        38
          13       0.90      0.84      0.87       729
          14       0.00      0.00      0.00         0
          15       0.42      0.16      0.23        32
          16       0.65      0.74      0.69        23
          17       0.86    