In [95]:
#!pip install scikit-multilearn

In [96]:
from pyrosm.data import sources
import pyrosm
from collections import Counter, defaultdict
import json
import pandas as pd
import ast
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from skmultilearn.model_selection import iterative_train_test_split
import pickle

In [97]:
#Parse tags

def parse_tags(val):
    if pd.isna(val) or val in ["None", "nan", None]:
        return {}
    try:
        return ast.literal_eval(val) if isinstance(val, str) else val
    
    except json.JSONDecodeError:
        return {}

#pois["tags"] = pois["tags"].apply(parse_tags) #only need to run it once

In [98]:
def create_tag_lists(pois, n):
    tag_freq = defaultdict(int)
    #print(tag_freq)



    for idx, row in pois.iterrows(): 
        tags = row.get("tags", {})
        #print(f"Row {idx}, Tags: {tags}, Type: {type(tags)}")  # Check type of tags

        if isinstance(tags, dict) and tags:
            for tag_key in tags:
                tag_freq[tag_key] += 1


    # print(tag_freq)
    all_tags = list(tag_freq.keys())
    all_good_tags = [tag for tag in tag_freq if tag_freq[tag] > n]
    tag2idx = {tag: i for i, tag in enumerate(all_good_tags)}
    idx2tag = {i: tag for tag, i in tag2idx.items()}

    tag_freq = dict(sorted(tag_freq.items(), key = lambda x: x[1], reverse = True))
    print(f"All tags sorted by frequency: {tag_freq}")
    print(f" All allowed tags: {all_good_tags}")
    # print(tag2idx)
    # print(idx2tag)
    print(f"Len all tags: {len(all_tags)}, Len good tags: {len(all_good_tags)}")
    #print("\n Returning (all_good_tags, tag2idx)")


    return (all_good_tags, tag2idx)

In [99]:
def remove_bad_tags(good_tags, pois):
    for i in pois["tags"]:
        remove_list = []
        for j in i:
            if j not in good_tags:
                remove_list.append(j)
        for removable in remove_list:
            #print(f"removing {removable} from {i}")
            i.pop(removable)
            #print(f"after remnoval: {i}")     

def remove_bad_tags_test(good_tags, poisX, poisy):
    for i in poisX:
        remove_list = []
        for j in i:
            if j not in good_tags:
                remove_list.append(j)
        for removable in remove_list:
            #print(f"removing {removable} from {i}")
            print(i, removable)
            i.remove(removable)
            #print(f"after remnoval: {i}")
    for i in poisy:
        remove_list = []
        for j in i:
            if j not in good_tags:
                remove_list.append(j)
        for removable in remove_list:
            #print(f"removing {removable} from {i}")
            i.remove(removable)
            #print(f"after remnoval: {i}")      

In [100]:
def tags_to_vec(tag_dict, tag2idx):
    vector = np.zeros(len(tag2idx), dtype= np.float32)
    if isinstance(tag_dict, dict):
        for tag in tag_dict:
            if tag in tag2idx:
                vector[tag2idx[tag]] = 1
    return vector

def tags_to_vec_singular(solo_tag, tag2idx):
    vector = np.zeros(len(tag2idx), dtype= np.float32)
    #if isinstance(tag_dict, dict):
    if solo_tag in tag2idx:
        vector[tag2idx[solo_tag]] = 1
    return vector

In [101]:
def vector_pois(pois, tag2idx, n):
    X = []
    y = []

    for tag in pois["tags"]:
        if not isinstance(tag, dict) or len(tag) < n: # set the len to 1 will include empty {}
            continue

        tag_keys = list(tag.keys())

        np.random.shuffle(tag_keys)
        
        mid_idx = len(tag_keys) // 2
        #print(mid_idx)
        input_tags = {k: tag[k] for k in tag_keys[:mid_idx]}
        output_tags = {k: tag[k] for k in tag_keys[mid_idx:]}

        # print("input_tags", input_tags)
        # print("   ")
        # print("output_tags", output_tags)
        # print("--------------------------------")
        #vec = tags_to_vec(input_tags, tag2idx)
        #print("vector",vec) #tag to vector testing


        X.append(tags_to_vec(input_tags, tag2idx))
        y.append(tags_to_vec(output_tags, tag2idx))

    X = np.stack(X)
    #print(X)
    y = np.stack(y)
    print("X shape:", X.shape)

    print("y_shape", y.shape)

    return X,y

def vector_pois_opt(pois, tag2idx, n):
    X = []
    y = []

    for tag in pois["tags"]:
        if not isinstance(tag, dict) or len(tag) < n: # set the len to 1 will include empty {}
            continue

        tag_keys = list(tag.keys())

        np.random.shuffle(tag_keys)
        
        for feature_idx in range(len(tag)//2):
            input_tags = {k: tag[k] for k in tag_keys[:feature_idx] + tag_keys[feature_idx + 1:]}
            output_tags = {tag_keys[feature_idx] : tag[tag_keys[feature_idx]]}


            X.append(tags_to_vec(input_tags, tag2idx))
            y.append(tags_to_vec(output_tags, tag2idx))
    
    X = np.stack(X)
    #print(X)
    y = np.stack(y)
    print("X shape:", X.shape)

    print("y_shape", y.shape)

    return X,y

def vector_pois_test(poisX, poisy, tag2idx, n):
    X = []
    y = []

    for tag in poisX:
        if len(tag) < n: # set the len to 1 will include empty {}
            print(tag)
            continue
        X.append(tags_to_vec(tag, tag2idx))
    for tag in poisy:
        y.append(tags_to_vec_singular(tag, tag2idx))

    
    X = np.stack(X)
    y = np.stack(y)
    print("X_shape:", X.shape)

    print("y_shape", y.shape)
    
    return X,y

In [102]:
# fp = pyrosm.get_data("Amsterdam")
# osm = pyrosm.OSM(fp)
# pois = osm.get_pois()
# pois["tags"] = pois["tags"].apply(parse_tags) #only need to run it once

with open('trainingset', 'rb') as fp:
    pois = pickle.load(fp)
with open('testset_questions', 'rb') as fp:
    test_pois_X = pickle.load(fp)
with open('testset_answer', 'rb') as fp:
    test_pois_y = pickle.load(fp)

In [103]:
n_good_tags = 10 # Frequency of tags for them to be allowed
n_per_instance = 5 # Amount of tags an instance needs to have to be part of the data
                   # If set to 1 it will include empty cells


good_tags, tag2idx = create_tag_lists(pois, n_good_tags)
remove_bad_tags(good_tags, pois)
#X, Y = vector_pois_opt(pois, tag2idx, n_per_instance)
X, Y = vector_pois(pois, tag2idx, n_per_instance)

All tags sorted by frequency: {'surface': 17633, 'smoothness': 11035, 'parking_space': 10687, 'access': 8889, 'capacity': 8692, 'fee': 7354, 'orientation': 6272, 'source:date': 5013, 'wheelchair': 3692, 'brand': 3571, 'brand:wikidata': 3456, 'lit': 2499, 'maxstay:conditional': 2375, 'check_date': 2178, 'material': 1939, 'operator:wikidata': 1917, 'brand:wikipedia': 1726, 'cuisine': 1554, 'backrest': 1403, 'recycling_type': 1334, 'colour': 1034, 'covered': 955, 'ref:bag': 933, 'seats': 865, 'outdoor_seating': 853, 'brand:website': 850, 'wikidata': 781, 'takeaway': 685, 'operator:wikipedia': 665, 'recycling:paper': 664, 'artwork_type': 639, 'wikimedia_commons': 557, 'level': 552, 'artist_name': 539, 'recycling:glass_bottles': 508, 'branch': 501, 'description': 499, 'layer': 484, 'indoor_seating': 476, 'note': 446, 'socket:type2': 434, 'payment:cash': 434, 'toilets:wheelchair': 408, 'contact:facebook': 398, 'denomination': 362, 'capacity:disabled': 361, 'contact:instagram': 348, 'min_age'

In [104]:
X_test, y_test = vector_pois_test(test_pois_X, test_pois_y, tag2idx, 1)

X_shape: (8291, 353)
y_shape (8291, 353)


In [137]:
#X_train, X_test, y_train, y_test = train_test_split(X, Y)
#X_val, _, y_val, _ = train_test_split(X, Y)

X_train, y_train, X_val, y_val = iterative_train_test_split(X, Y, test_size=0.2)
#_, _, X_val, y_val = iterative_train_test_split(X, Y, test_size = 0.2)

# Just checking, ive had some issues
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_train shape: (9595, 353)
y_train shape: (9595, 353)
X_test shape: (8291, 353)
y_test shape: (8291, 353)
X_val shape: (2474, 353)
y_val shape: (2474, 353)


In [138]:
check_model = False
if check_model:
    model = RandomForestClassifier(random_state=42)

    # Define search space
    param_dist = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    # Wrap in RandomizedSearchCV
    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=20,  # Try 20 random combinations
        cv=3,       # 3-fold cross-validation
        scoring='f1_micro',  # Or accuracy, or a custom metric
        n_jobs=-1,  # Use all cores
        verbose=1
    )

    search.fit(X_train, y_train)

    # Best model after search
    best_model = search.best_estimator_
    print(best_model)

In [139]:
if not check_model:
    best_model = RandomForestClassifier(max_features='log2', n_estimators=300, random_state=42) # Thepreviousbestmodel

In [140]:
best_model.fit(X_train, y_train)
#best_model.score(X_test, y_test)

MemoryError: could not allocate 46268416 bytes

In [None]:
def find_best_thresholds(y_true, y_probs, thresholds=np.arange(0.1, 0.9, 0.05)):
    best_thresholds = []
    for i in range(y_true.shape[1]):
        best_f1 = 0
        best_thresh = 0.5
        for thresh in thresholds:
            preds = (y_probs[:, i] >= thresh).astype(int)
            f1 = f1_score(y_true[:, i], preds, zero_division=0)
            if f1 > best_f1:
                best_f1 = f1
                best_thresh = thresh
        best_thresholds.append(best_thresh)
    return np.array(best_thresholds)


In [None]:
Y_probs_val = best_model.predict_proba(X_val)      
prob_matrix_val = np.array([
    probs[:, 1] if probs.shape[1] > 1 else np.zeros(probs.shape[0])
    for probs in Y_probs_val]).T

thresholds = find_best_thresholds(y_val, prob_matrix_val)

In [None]:
Y_probs_test = best_model.predict_proba(X_test)
probs_list = []
for probs in Y_probs_test:
    if probs.shape[1] == 2:
        probs_list.append(probs[:, 1])  # P(class 1)
    else:
        # Only class 0 was seen — so class 1 probability is 0
        probs_list.append(np.zeros(probs.shape[0]))

prob_matrix = np.array(probs_list).T
# prob_matrix = np.array([probs[:, 1] for probs in Y_probs_test]).T
# prob_matrix_test = np.array([
#     probs[:, 1] if probs.shape[1] > 1 else np.zeros(probs.shape[0])
#     for probs in Y_probs_test]).T

Y_preds = (prob_matrix >= thresholds).astype(int)
#Y_preds = (prob_matrix >= 0.01).astype(int)

In [None]:
print("Min prob:", prob_matrix.min())
print("Max prob:", prob_matrix.max())

print("Min prob:", thresholds.min())
print("Max prob:", thresholds.max())


Min prob: 0.0
Max prob: 0.2491307189542484
Min prob: 0.1
Max prob: 0.8000000000000002


In [None]:
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("prob_matrix shape:", prob_matrix.shape)
print("thresholds shape:", thresholds.shape)
print("Y_preds shape:", Y_preds.shape)


X_test shape: (8291, 353)
y_test shape: (8291, 353)
prob_matrix shape: (8291, 353)
thresholds shape: (353,)
Y_preds shape: (8291, 353)


In [None]:
print("Total labels in full dataset:", Y_preds.sum())
print("Per-tag label count:", y_test.sum(axis=0))

print("Total labels in full dataset:", Y.sum())
print("Per-tag label count:", Y.sum(axis=0))

Total labels in full dataset: 654989
Per-tag label count: [4.21e+02 7.79e+02 8.40e+02 1.17e+03 1.30e+01 1.79e+02 3.50e+01 1.95e+02
 2.86e+02 5.09e+02 4.88e+02 3.04e+02 3.00e+00 5.00e+00 1.00e+01 1.10e+01
 3.30e+01 2.90e+01 1.80e+02 1.47e+02 1.15e+02 1.05e+02 3.00e+00 1.21e+02
 6.00e+00 1.00e+02 4.10e+01 3.00e+00 5.50e+01 6.00e+00 8.30e+01 2.40e+01
 1.10e+01 1.00e+01 7.00e+00 2.20e+01 1.50e+01 5.00e+01 3.70e+01 2.40e+01
 9.40e+01 1.59e+02 6.90e+01 6.70e+01 1.70e+01 4.00e+00 1.70e+01 1.00e+00
 1.00e+00 0.00e+00 7.60e+01 1.20e+01 1.50e+01 2.00e+00 3.20e+01 3.40e+01
 1.10e+01 3.00e+00 7.00e+00 8.50e+01 1.00e+00 0.00e+00 2.00e+00 0.00e+00
 0.00e+00 2.00e+00 4.00e+00 1.00e+00 4.00e+00 1.30e+01 3.00e+00 1.20e+01
 0.00e+00 1.00e+00 5.00e+00 1.50e+01 2.30e+01 1.90e+01 7.00e+00 1.30e+01
 0.00e+00 8.00e+00 3.00e+00 7.00e+00 1.10e+01 0.00e+00 2.00e+00 1.00e+00
 2.70e+01 1.40e+01 0.00e+00 2.00e+00 5.00e+00 1.00e+00 1.80e+01 7.00e+00
 2.90e+01 1.00e+00 1.70e+01 1.00e+01 7.00e+00 3.00e+00 0.00e+00 4.

In [None]:
# Macro = average over tags, treating each equally
print(len(y_test), len(Y_preds), len(X_test))
f1 = f1_score(y_test, Y_preds, average='macro')
precision = precision_score(y_test, Y_preds, average='macro')
recall = recall_score(y_test, Y_preds, average='macro')

print("Macro F1 Score:", f1)
print("Macro Precision:", precision)
print("Macro Recall:", recall)

8291 8291 8291


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Macro F1 Score: 0.0037687287357157334
Macro Precision: 0.00199813921577136
Macro Recall: 0.2096317280453258


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
for i in range(len(y_test)):
    #print(f"Prediction = {Y_preds[i]} \t Actual = {y_test[i]}")
    #print(f"length pred = {len(Y_preds[i])} \t len actual = {len(y_test[i])}")
    if len(Y_preds[i]) != len(y_test[i]):
        print("oops")
        print(i)

#print(len(Y_preds))
#print(len(y_test))

In [None]:
print(classification_report(y_test, Y_preds, zero_division=0))

              precision    recall  f1-score   support

           0       0.05      1.00      0.10       421
           1       0.00      0.00      0.00       779
           2       0.10      1.00      0.18       840
           3       0.14      1.00      0.25      1170
           4       0.00      0.00      0.00        13
           5       0.02      1.00      0.04       179
           6       0.00      1.00      0.01        35
           7       0.02      1.00      0.05       195
           8       0.03      1.00      0.07       286
           9       0.06      1.00      0.12       509
          10       0.06      1.00      0.11       488
          11       0.00      0.00      0.00       304
          12       0.00      0.00      0.00         3
          13       0.00      0.00      0.00         5
          14       0.00      1.00      0.00        10
          15       0.00      1.00      0.00        11
          16       0.00      1.00      0.01        33
          17       0.00    