In [82]:
from pyrosm.data import sources
import pyrosm
from collections import Counter, defaultdict
import json
import pandas as pd
import ast
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

In [83]:
#Parse tags

def parse_tags(val):
    if pd.isna(val) or val in ["None", "nan", None]:
        return {}
    try:
        return ast.literal_eval(val) if isinstance(val, str) else val
    
    except json.JSONDecodeError:
        return {}

#pois["tags"] = pois["tags"].apply(parse_tags) #only need to run it once

In [84]:
def create_tag_lists(pois, n):
    tag_freq = defaultdict(int)
    #print(tag_freq)



    for idx, row in pois.iterrows(): 
        tags = row.get("tags", {})
        #print(f"Row {idx}, Tags: {tags}, Type: {type(tags)}")  # Check type of tags

        if isinstance(tags, dict) and tags:
            for tag_key in tags:
                tag_freq[tag_key] += 1


    # print(tag_freq)
    all_tags = list(tag_freq.keys())
    all_good_tags = [tag for tag in tag_freq if tag_freq[tag] > n]
    tag2idx = {tag: i for i, tag in enumerate(all_good_tags)}
    idx2tag = {i: tag for tag, i in tag2idx.items()}

    tag_freq = dict(sorted(tag_freq.items(), key = lambda x: x[1], reverse = True))
    print(f"All tags sorted by frequency: {tag_freq}")
    print(f" All allowed tags: {all_good_tags}")
    # print(tag2idx)
    # print(idx2tag)
    print(f"Len all tags: {len(all_tags)}, Len good tags: {len(all_good_tags)}")
    #print("\n Returning (all_good_tags, tag2idx)")


    return (all_good_tags, tag2idx)

In [85]:
def remove_bad_tags(good_tags, pois):
    for i in pois["tags"]:
        remove_list = []
        for j in i:
            if j not in good_tags:
                remove_list.append(j)
        for removable in remove_list:
            #print(f"removing {removable} from {i}")
            i.pop(removable)
            #print(f"after remnoval: {i}")           

In [86]:
def tags_to_vec(tag_dict, tag2idx):
    vector = np.zeros(len(tag2idx), dtype= np.float32)
    if isinstance(tag_dict, dict):
        for tag in tag_dict:
            if tag in tag2idx:
                vector[tag2idx[tag]] = 1
    return vector

In [87]:
def vector_pois(pois, tag2idx, n):
    X = []
    y = []

    for tag in pois["tags"]:
        if not isinstance(tag, dict) or len(tag) < n: # set the len to 1 will include empty {}
            continue

        tag_keys = list(tag.keys())

        np.random.shuffle(tag_keys)
        
        mid_idx = len(tag_keys) // 2
        #print(mid_idx)
        input_tags = {k: tag[k] for k in tag_keys[:mid_idx]}
        output_tags = {k: tag[k] for k in tag_keys[mid_idx:]}

        # print("input_tags", input_tags)
        # print("   ")
        # print("output_tags", output_tags)
        # print("--------------------------------")
        vec = tags_to_vec(input_tags, tag2idx)
        #print("vector",vec) #tag to vector testing


        X.append(tags_to_vec(input_tags, tag2idx))
        y.append(tags_to_vec(output_tags, tag2idx))

    X = np.stack(X)
    #print(X)
    y = np.stack(y)
    print("X shape:", X.shape)

    print("y_shape", y.shape)

    return X,y

In [88]:
fp = pyrosm.get_data("Amsterdam")
osm = pyrosm.OSM(fp)
pois = osm.get_pois()
pois["tags"] = pois["tags"].apply(parse_tags) #only need to run it once

In [89]:
n_good_tags = 10 # Frequency of tags for them to be allowed
n_per_instance = 2 # Amount of tags an instance needs to have to be part of the data

good_tags, tag2idx = create_tag_lists(pois, n_good_tags)
remove_bad_tags(good_tags, pois)
X, Y = vector_pois(pois, tag2idx, n_per_instance)

All tags sorted by frequency: {'surface': 23180, 'capacity': 23016, 'parking_space': 15767, 'access': 14792, 'smoothness': 14028, 'source:date': 13182, 'fee': 10316, 'orientation': 9071, 'wheelchair': 6798, 'backrest': 6265, 'brand': 4633, 'recycling_type': 4624, 'check_date': 4482, 'brand:wikidata': 4457, 'maxstay:conditional': 4007, 'material': 3742, 'lit': 3240, 'cuisine': 3064, 'operator:wikidata': 2880, 'covered': 2694, 'brand:wikipedia': 2158, 'waste': 2031, 'ref:bag': 1530, 'colour': 1499, 'note': 1310, 'artwork_type': 1280, 'outdoor_seating': 1246, 'seats': 1225, 'recycling:waste': 1213, 'wikidata': 1053, 'brand:website': 1046, 'recycling:paper': 1024, 'operator:wikipedia': 954, 'takeaway': 912, 'layer': 861, 'description': 826, 'recycling:glass_bottles': 826, 'artist_name': 819, 'level': 776, 'wikimedia_commons': 711, 'healthcare': 687, 'indoor_seating': 664, 'vending': 623, 'contact:facebook': 596, 'branch': 590, 'denomination': 567, 'payment:cash': 559, 'shelter_type': 548, 

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, Y)
X_val, _, y_val, _ = train_test_split(X, Y)

In [91]:
if False:
    model = RandomForestClassifier(random_state=42)

    # Define search space
    param_dist = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    # Wrap in RandomizedSearchCV
    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=20,  # Try 20 random combinations
        cv=3,       # 3-fold cross-validation
        scoring='f1_micro',  # Or accuracy, or a custom metric
        n_jobs=-1,  # Use all cores
        verbose=1
    )

    search.fit(X_train, y_train)

    # Best model after search
    best_model = search.best_estimator_

In [92]:
#print(best_model)
best_model = RandomForestClassifier(min_samples_split=5, n_estimators=200, random_state=42)

In [93]:
best_model.fit(X_train, y_train)
#best_model.score(X_test, y_test)

In [94]:
def find_best_thresholds(y_true, y_probs, thresholds=np.linspace(0.1, 0.9, 9)):
    best_thresholds = []
    for i in range(y_probs.shape[1]):
        best_f1 = 0
        best_thresh = 0.5
        for t in thresholds:
            preds = (y_probs[:, i] >= t).astype(int)
            score = f1_score(y_true[:, i], preds)
            if score > best_f1:
                best_f1 = score
                best_thresh = t
        best_thresholds.append(best_thresh)
    return best_thresholds

In [95]:
def apply_thresholds(y_probs, thresholds):
    thresholds = np.array(thresholds)
    return (y_probs >= thresholds).astype(int)

In [96]:
Y_probs_val = best_model.predict_proba(X_val)      # Should do this with a validatio set, but will implement later
prob_matrix_val = np.array([probs[:, 1] for probs in Y_probs_val]).T

thresholds = find_best_thresholds(y_val, prob_matrix_val)      # Again, should be using val set

In [97]:
Y_probs_test = best_model.predict_proba(X_test)
prob_matrix_test = np.array([probs[:, 1] for probs in Y_probs_test]).T

Y_preds = apply_thresholds(prob_matrix_test, np.array(thresholds))

In [98]:
# Macro = average over tags, treating each equally
f1 = f1_score(y_test, Y_preds, average='macro')
precision = precision_score(y_test, Y_preds, average='macro')
recall = recall_score(y_test, Y_preds, average='macro')

print("Macro F1 Score:", f1)
print("Macro Precision:", precision)
print("Macro Recall:", recall)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Macro F1 Score: 0.2812614690745214
Macro Precision: 0.3615948096155521
Macro Recall: 0.26581285557011725


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [99]:
for i in range(len(y_test)):
    #print(f"Prediction = {Y_preds[i]} \t Actual = {y_test[i]}")
    #print(f"length pred = {len(Y_preds[i])} \t len actual = {len(y_test[i])}")
    if len(Y_preds[i]) != len(y_test[i]):
        print("oops")
        print(i)

#print(len(Y_preds))
#print(len(y_test))