In [225]:
#!pip install scikit-multilearn

In [226]:
from pyrosm.data import sources
import pyrosm
from collections import Counter, defaultdict
import json
import pandas as pd
import ast
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from skmultilearn.model_selection import iterative_train_test_split
import pickle

In [227]:
#Parse tags

def parse_tags(val):
    if pd.isna(val) or val in ["None", "nan", None]:
        return {}
    try:
        return ast.literal_eval(val) if isinstance(val, str) else val
    
    except json.JSONDecodeError:
        return {}

#pois["tags"] = pois["tags"].apply(parse_tags) #only need to run it once

In [228]:
def create_tag_lists(pois, n):
    tag_freq = defaultdict(int)
    #print(tag_freq)



    for idx, row in pois.iterrows(): 
        tags = row.get("tags", {})
        #print(f"Row {idx}, Tags: {tags}, Type: {type(tags)}")  # Check type of tags

        if isinstance(tags, dict) and tags:
            for tag_key in tags:
                tag_freq[tag_key] += 1


    # print(tag_freq)
    all_tags = list(tag_freq.keys())
    all_good_tags = [tag for tag in tag_freq if tag_freq[tag] > n]
    tag2idx = {tag: i for i, tag in enumerate(all_good_tags)}
    idx2tag = {i: tag for tag, i in tag2idx.items()}

    tag_freq = dict(sorted(tag_freq.items(), key = lambda x: x[1], reverse = True))
    print(f"All tags sorted by frequency: {tag_freq}")
    print(f" All allowed tags: {all_good_tags}")
    # print(tag2idx)
    # print(idx2tag)
    print(f"Len all tags: {len(all_tags)}, Len good tags: {len(all_good_tags)}")
    #print("\n Returning (all_good_tags, tag2idx)")


    return (all_good_tags, tag2idx)

In [229]:
def remove_bad_tags(good_tags, pois):
    for i in pois["tags"]:
        remove_list = []
        for j in i:
            if j not in good_tags:
                remove_list.append(j)
        for removable in remove_list:
            #print(f"removing {removable} from {i}")
            i.pop(removable)
            #print(f"after remnoval: {i}")     

def remove_bad_tags_test(good_tags, poisX, poisy):
    for i in poisX:
        remove_list = []
        for j in i:
            if j not in good_tags:
                remove_list.append(j)
        for removable in remove_list:
            #print(f"removing {removable} from {i}")
            print(i, removable)
            i.pop(removable)
            #print(f"after remnoval: {i}")
    for i in poisy:
        remove_list = []
        for j in i:
            if j not in good_tags:
                remove_list.append(j)
        for removable in remove_list:
            #print(f"removing {removable} from {i}")
            i.pop(removable)
            #print(f"after remnoval: {i}")      

In [250]:
def tags_to_vec(tag_dict, tag2idx):
    vector = np.zeros(len(tag2idx), dtype= np.float32)
    if isinstance(tag_dict, dict):
        for tag in tag_dict:
            if tag in tag2idx:
                vector[tag2idx[tag]] = 1
    return vector

def tags_to_vec_singular(solo_tag, tag2idx):
    vector = np.zeros(len(tag2idx), dtype= np.float32)
    #if isinstance(tag_dict, dict):
    if solo_tag in tag2idx:
        vector[tag2idx[solo_tag]] = 1
    return vector

In [269]:
def vector_pois(pois, tag2idx, n):
    X = []
    y = []

    for tag in pois["tags"]:
        if not isinstance(tag, dict) or len(tag) < n: # set the len to 1 will include empty {}
            continue

        tag_keys = list(tag.keys())

        np.random.shuffle(tag_keys)
        
        mid_idx = len(tag_keys) // 2
        #print(mid_idx)
        input_tags = {k: tag[k] for k in tag_keys[:mid_idx]}
        output_tags = {k: tag[k] for k in tag_keys[mid_idx:]}

        # print("input_tags", input_tags)
        # print("   ")
        # print("output_tags", output_tags)
        # print("--------------------------------")
        #vec = tags_to_vec(input_tags, tag2idx)
        #print("vector",vec) #tag to vector testing


        X.append(tags_to_vec(input_tags, tag2idx))
        y.append(tags_to_vec(output_tags, tag2idx))

    X = np.stack(X)
    #print(X)
    y = np.stack(y)
    print("X shape:", X.shape)

    print("y_shape", y.shape)

    return X,y

def vector_pois_opt(pois, tag2idx, n):
    X = []
    y = []

    for tag in pois["tags"]:
        if not isinstance(tag, dict) or len(tag) < n: # set the len to 1 will include empty {}
            continue

        tag_keys = list(tag.keys())

        np.random.shuffle(tag_keys)
        
        for feature_idx in range(len(tag)//2):
            input_tags = {k: tag[k] for k in tag_keys[:feature_idx] + tag_keys[feature_idx + 1:]}
            output_tags = {tag_keys[feature_idx] : tag[tag_keys[feature_idx]]}


            X.append(tags_to_vec(input_tags, tag2idx))
            y.append(tags_to_vec(output_tags, tag2idx))
    
    X = np.stack(X)
    #print(X)
    y = np.stack(y)
    print("X shape:", X.shape)

    print("y_shape", y.shape)

    return X,y

def vector_pois_test(poisX, poisy, tag2idx, n):
    X = []
    y = []

    for tag in poisX:
        if len(tag) < n: # set the len to 1 will include empty {}
            print(tag)
            continue
        X.append(tags_to_vec(tag, tag2idx))
    for tag in poisy:
        y.append(tags_to_vec_singular(tag, tag2idx))

    
    X = np.stack(X)
    y = np.stack(y)
    print("X_shape:", X.shape)

    print("y_shape", y.shape)
    
    return X,y

In [232]:
# fp = pyrosm.get_data("Amsterdam")
# osm = pyrosm.OSM(fp)
# pois = osm.get_pois()
# pois["tags"] = pois["tags"].apply(parse_tags) #only need to run it once

with open('trainingset', 'rb') as fp:
    pois = pickle.load(fp)
with open('testset_questions', 'rb') as fp:
    test_pois_X = pickle.load(fp)
with open('testset_answer', 'rb') as fp:
    test_pois_y = pickle.load(fp)

In [233]:
n_good_tags = 10 # Frequency of tags for them to be allowed
n_per_instance = 5 # Amount of tags an instance needs to have to be part of the data
                   # If set to 1 it will include empty cells


good_tags, tag2idx = create_tag_lists(pois, n_good_tags)
remove_bad_tags(good_tags, pois)
#X, Y = vector_pois_opt(pois, tag2idx, n_per_instance)
X, Y = vector_pois(pois, tag2idx, n_per_instance)

All tags sorted by frequency: {'source:date': 1450, 'brand': 864, 'brand:wikidata': 833, 'access': 713, 'operator:wikidata': 664, 'fee': 582, 'capacity': 570, 'wheelchair': 499, 'brand:wikipedia': 377, 'material': 354, 'colour': 345, 'operator:wikipedia': 344, 'cuisine': 308, 'check_date': 300, 'operator:type': 282, 'backrest': 263, 'collection_times': 252, 'covered': 246, 'ref:bag': 239, 'orientation': 237, 'operator:website': 220, 'outdoor_seating': 208, 'contact:phone': 208, 'brand:website': 206, 'contact:website': 181, 'surface': 179, 'takeaway': 168, 'payment:cash': 164, 'contact:email': 161, 'branch': 138, 'level': 136, 'artwork_type': 132, 'seats': 130, 'artist_name': 129, 'check_date:opening_hours': 123, 'recycling_type': 120, 'denomination': 111, 'payment:cards': 109, 'capacity:disabled': 108, 'wikidata': 104, 'charge': 98, 'panoramax': 98, 'armrest': 94, 'direction': 84, 'fee:conditional': 83, 'indoor_seating': 83, 'delivery': 81, 'indoor': 78, 'self_service': 72, 'descriptio

In [270]:
#remove_bad_tags_test(good_tags, test_pois_X, test_pois_y)
X_test, y_test = vector_pois_test(test_pois_X, test_pois_y, tag2idx, 1)

X_shape: (941, 164)
y_shape (941, 164)


In [254]:
#X_train, X_test, y_train, y_test = train_test_split(X, Y)
#X_val, _, y_val, _ = train_test_split(X, Y)

X_train, y_train, X_val, y_val = iterative_train_test_split(X, Y, test_size=0.2)
#X_val, y_val, _, _ = iterative_train_test_split(X, Y, test_size = 0.2)

# Just checking, ive had some issues
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_train shape: (1075, 164)
y_train shape: (1075, 164)
X_test shape: (941, 164)
y_test shape: (941, 164)
X_val shape: (294, 164)
y_val shape: (294, 164)


In [255]:
check_model = False
if check_model:
    model = RandomForestClassifier(random_state=42)

    # Define search space
    param_dist = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    # Wrap in RandomizedSearchCV
    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=20,  # Try 20 random combinations
        cv=3,       # 3-fold cross-validation
        scoring='f1_micro',  # Or accuracy, or a custom metric
        n_jobs=-1,  # Use all cores
        verbose=1
    )

    search.fit(X_train, y_train)

    # Best model after search
    best_model = search.best_estimator_
    print(best_model)

In [256]:
if not check_model:
    best_model = RandomForestClassifier(max_features='log2', n_estimators=300, random_state=42) # Thepreviousbestmodel

In [257]:
best_model.fit(X_train, y_train)
#best_model.score(X_test, y_test)

In [258]:
def find_best_thresholds(y_true, y_probs, thresholds=np.linspace(0.1, 0.9, 9)):
    best_thresholds = []
    for i in range(y_probs.shape[1]):
        best_f1 = 0
        best_thresh = 0.5
        for t in thresholds:
            preds = (y_probs[:, i] >= t).astype(int)
            score = f1_score(y_true[:, i], preds)
            if score > best_f1:
                best_f1 = score
                best_thresh = t
        best_thresholds.append(best_thresh)
    return best_thresholds

In [259]:
def apply_thresholds(y_probs, thresholds):
    thresholds = np.array(thresholds)
    return (y_probs >= thresholds).astype(int)

In [260]:
#print(Y_probs_val[0][0])

In [261]:
Y_probs_val = best_model.predict_proba(X_val)      
prob_matrix_val = np.array([
    probs[:, 1] if probs.shape[1] > 1 else np.zeros(probs.shape[0])
    for probs in Y_probs_val]).T

thresholds = find_best_thresholds(y_val, prob_matrix_val)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [262]:
# print("Min and max probabilities in prob_matrix_test:", prob_matrix_test.min(), prob_matrix_test.max())
# print("Thresholds range:", min(thresholds), max(thresholds))


In [263]:
# print("Number of samples in test set:", y_test.shape[0])
# print("Number of tags:", y_test.shape[1])
# print("Total positive labels in y_test:", y_test.sum())
# print("Total positive predictions in Y_preds:", Y_preds.sum())


In [264]:
print("Total labels in full dataset:", y_test.sum())
print("Per-tag label count:", Y.sum(axis=0))


Total labels in full dataset: 885.0
Per-tag label count: [253. 101. 271.  55.  19.  34. 316.  98. 122. 243. 134. 139. 126.  13.
 139.  53.  39.  81.  67.  68. 122. 127.  31.  38.   6.   9.  16. 157.
  11.   6.  16.   7.  21.  45.  77.  45.  37.   0.  11.  17.   6.   1.
  11.   2.  17.  13.  10.  23.  55.  65.  46.  28.   3.  15.   7.  15.
  84.  53.  14.  25.  26.   9.  21.  24.  10.  28.  15.   0. 167.  11.
  20.  19.  27.  55.  31.  44.   3.   2.   2.   9.   3.   9.  23.  18.
   4.  10.  12.  23.  21.  20.   2.   6.   3.   1.  10.  25.   4.   7.
   7.   4.  23.   1.   0.   6.   7.   7.  13.   6.  23.  10.  16.   0.
   0.   1.   0.   9.  11.   6.  16.   0.   0.  10.   7.  10.   8.   0.
   8.   7.   1.   0.   9.   5.   2.   4.   4.  14.   6.  12.  12.   1.
  13.   5.   7.   6.   7.   8.  12.  12.   4.   6.   5.   2.   4.   3.
   9.   9.   3.   4.   8.   2.   9.   8.   3.   3.]


In [265]:
Y_probs_test = best_model.predict_proba(X_test)
prob_matrix_test = np.array([
    probs[:, 1] if probs.shape[1] > 1 else np.zeros(probs.shape[0])
    for probs in Y_probs_test]).T

Y_preds = apply_thresholds(prob_matrix_test, np.array(thresholds))

In [266]:
# Macro = average over tags, treating each equally
print(len(y_test), len(Y_preds), len(X_test))
f1 = f1_score(y_test, Y_preds, average='macro')
precision = precision_score(y_test, Y_preds, average='macro')
recall = recall_score(y_test, Y_preds, average='macro')

print("Macro F1 Score:", f1)
print("Macro Precision:", precision)
print("Macro Recall:", recall)

941 941 941
Macro F1 Score: 0.0010103171207150668
Macro Precision: 0.0005507892485938674
Macro Recall: 0.006097560975609756


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [267]:
for i in range(len(y_test)):
    #print(f"Prediction = {Y_preds[i]} \t Actual = {y_test[i]}")
    #print(f"length pred = {len(Y_preds[i])} \t len actual = {len(y_test[i])}")
    if len(Y_preds[i]) != len(y_test[i]):
        print("oops")
        print(i)

#print(len(Y_preds))
#print(len(y_test))

In [268]:
print(classification_report(y_test, Y_preds, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        43
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00        49
           3       0.00      0.00      0.00        15
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         1
           6       0.09      1.00      0.17        85
           7       0.00      0.00      0.00        16
           8       0.00      0.00      0.00        13
           9       0.00      0.00      0.00        25
          10       0.00      0.00      0.00        23
          11       0.00      0.00      0.00        48
          12       0.00      0.00      0.00        31
          13       0.00      0.00      0.00        20
          14       0.00      0.00      0.00        38
          15       0.00      0.00      0.00         3
          16       0.00      0.00      0.00         1
          17       0.00    