In [1]:
#!pip install scikit-multilearn

In [2]:
from pyrosm.data import sources
import pyrosm
from collections import Counter, defaultdict
import json
import pandas as pd
import ast
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from skmultilearn.model_selection import iterative_train_test_split
import pickle
from openpyxl import Workbook
from convert_report2excel import convert_report2excel

In [3]:
#Parse tags

def parse_tags(val):
    if pd.isna(val) or val in ["None", "nan", None]:
        return {}
    try:
        return ast.literal_eval(val) if isinstance(val, str) else val
    
    except json.JSONDecodeError:
        return {}

#pois["tags"] = pois["tags"].apply(parse_tags) #only need to run it once

In [4]:
def create_tag_lists(pois, n):
    tag_freq = defaultdict(int)
    #print(tag_freq)



    for idx, row in pois.iterrows(): 
        tags = row.get("tags", {})
        #print(f"Row {idx}, Tags: {tags}, Type: {type(tags)}")  # Check type of tags

        if isinstance(tags, dict) and tags:
            for tag_key in tags:
                tag_freq[tag_key] += 1


    # print(tag_freq)
    all_tags = list(tag_freq.keys())
    all_good_tags = [tag for tag in tag_freq if tag_freq[tag] > n]
    tag2idx = {tag: i for i, tag in enumerate(all_good_tags)}
    idx2tag = {i: tag for tag, i in tag2idx.items()}

    tag_freq = dict(sorted(tag_freq.items(), key = lambda x: x[1], reverse = True))
    print(f"All tags sorted by frequency: {tag_freq}")
    print(f" All allowed tags: {all_good_tags}")
    # print(tag2idx)
    # print(idx2tag)
    print(f"Len all tags: {len(all_tags)}, Len good tags: {len(all_good_tags)}")
    #print("\n Returning (all_good_tags, tag2idx)")


    return (all_good_tags, tag2idx)

In [5]:
def remove_bad_tags(good_tags, pois):
    for i in pois["tags"]:
        remove_list = []
        for j in i:
            if j not in good_tags:
                remove_list.append(j)
        for removable in remove_list:
            #print(f"removing {removable} from {i}")
            i.pop(removable)
            #print(f"after remnoval: {i}")     

def remove_bad_tags_test(good_tags, poisX, poisy):
    for i in poisX:
        remove_list = []
        for j in i:
            if j not in good_tags:
                remove_list.append(j)
        for removable in remove_list:
            #print(f"removing {removable} from {i}")
            print(i, removable)
            i.remove(removable)
            #print(f"after remnoval: {i}")
    for i in poisy:
        remove_list = []
        for j in i:
            if j not in good_tags:
                remove_list.append(j)
        for removable in remove_list:
            #print(f"removing {removable} from {i}")
            i.remove(removable)
            #print(f"after remnoval: {i}")      

In [6]:
def tags_to_vec(tag_dict, tag2idx):
    vector = np.zeros(len(tag2idx), dtype= np.float32)
    if isinstance(tag_dict, dict) or isinstance(tag_dict, list):
        for tag in tag_dict:
            if tag in tag2idx:
                vector[tag2idx[tag]] = 1
    return vector

def tags_to_vec_singular(solo_tag, tag2idx):
    vector = np.zeros(len(tag2idx), dtype= np.float32)
    #if isinstance(tag_dict, dict):
    if solo_tag in tag2idx:
        vector[tag2idx[solo_tag]] = 1
    return vector

In [7]:
def vector_pois(pois, tag2idx, n):
    X = []
    y = []

    for tag in pois["tags"]:
        if not isinstance(tag, dict) or len(tag) < n: # set the len to 1 will include empty {}
            continue

        tag_keys = list(tag.keys())

        np.random.shuffle(tag_keys)
        
        feature_idx = len(tag_keys) // 2
        #print(mid_idx)
        input_tags = {k: tag[k] for k in tag_keys[:feature_idx] + tag_keys[feature_idx + 1:]}
        output_tags = {tag_keys[feature_idx] : tag[tag_keys[feature_idx]]}

        # print("input_tags", input_tags)
        # print("   ")
        # print("output_tags", output_tags)
        # print("--------------------------------")
        #vec = tags_to_vec(input_tags, tag2idx)
        #print("vector",vec) #tag to vector testing


        X.append(tags_to_vec(input_tags, tag2idx))
        y.append(tags_to_vec(output_tags, tag2idx))

    X = np.stack(X)
    #print(X)
    y = np.stack(y)
    print("X shape:", X.shape)

    print("y_shape", y.shape)

    return X,y

def vector_pois_opt(pois, tag2idx, n):
    X = []
    y = []

    for tag in pois["tags"]:
        if not isinstance(tag, dict) or len(tag) < n: # set the len to 1 will include empty {}
            continue

        tag_keys = list(tag.keys())

        np.random.shuffle(tag_keys)
        
        for feature_idx in range(len(tag)//2):
            input_tags = {k: tag[k] for k in tag_keys[:feature_idx] + tag_keys[feature_idx + 1:]}
            output_tags = {tag_keys[feature_idx] : tag[tag_keys[feature_idx]]}


            X.append(tags_to_vec(input_tags, tag2idx))
            y.append(tags_to_vec(output_tags, tag2idx))
    
    X = np.stack(X)
    #print(X)
    y = np.stack(y)
    print("X shape:", X.shape)

    print("y_shape", y.shape)

    return X,y

def vector_pois_test(poisX, poisy, tag2idx, n):
    X = []
    y = []

    for tag in poisX:
        
        if len(tag) < n: # set the len to 1 will include empty {}
            print(tag)
            continue
        X.append(tags_to_vec(tag, tag2idx))
    for tag in poisy:
        y.append(tags_to_vec_singular(tag, tag2idx))

    
    X = np.stack(X)
    y = np.stack(y)
    print("X_shape:", X.shape)

    print("y_shape", y.shape)
    
    return X,y

In [8]:
# fp = pyrosm.get_data("Amsterdam")
# osm = pyrosm.OSM(fp)
# pois = osm.get_pois()
# pois["tags"] = pois["tags"].apply(parse_tags) #only need to run it once

with open('trainingset', 'rb') as fp:
    pois = pickle.load(fp)
with open('testset_questions', 'rb') as fp:
    test_pois_X = pickle.load(fp)
with open('testset_answer', 'rb') as fp:
    test_pois_y = pickle.load(fp)

In [9]:
n_good_tags = 10 # Frequency of tags for them to be allowed
n_per_instance = 1 # Amount of tags an instance needs to have to be part of the data
                   # If set to 1 it will include empty cells


good_tags, tag2idx = create_tag_lists(pois, n_good_tags)
remove_bad_tags(good_tags, pois)
#X, Y = vector_pois_opt(pois, tag2idx, n_per_instance)
X, Y = vector_pois(pois, tag2idx, n_per_instance)

All tags sorted by frequency: {'surface': 17675, 'smoothness': 11149, 'parking_space': 10813, 'access': 8860, 'capacity': 8660, 'fee': 7387, 'orientation': 6220, 'source:date': 5042, 'wheelchair': 3638, 'brand': 3590, 'brand:wikidata': 3478, 'lit': 2491, 'maxstay:conditional': 2379, 'check_date': 2155, 'operator:wikidata': 1938, 'material': 1909, 'brand:wikipedia': 1725, 'cuisine': 1524, 'backrest': 1385, 'recycling_type': 1315, 'colour': 1024, 'covered': 970, 'ref:bag': 954, 'seats': 866, 'outdoor_seating': 859, 'brand:website': 840, 'wikidata': 767, 'takeaway': 686, 'recycling:paper': 661, 'operator:wikipedia': 651, 'artwork_type': 610, 'level': 546, 'wikimedia_commons': 538, 'artist_name': 519, 'description': 510, 'recycling:glass_bottles': 509, 'indoor_seating': 499, 'branch': 485, 'layer': 474, 'payment:cash': 442, 'socket:type2': 434, 'note': 430, 'contact:facebook': 415, 'denomination': 388, 'toilets:wheelchair': 383, 'contact:instagram': 366, 'diet:vegetarian': 340, 'location':

In [10]:
print(type(test_pois_X))
print(test_pois_X)
X_test, y_test = vector_pois_test(test_pois_X, test_pois_y, tag2idx, 1)

<class 'list'>
[['recycling:glass_bottles', 'recycling_type'], ['access', 'smoothness', 'surface'], ['parking_space', 'smoothness', 'surface'], ['access', 'orientation', 'surface'], ['brand', 'brand:website', 'brand:wikidata', 'outdoor_seating', 'self_service', 'source:date'], ['location', 'recycling:glass_bottles', 'recycling:paper'], ['smoothness', 'surface'], ['access', 'parking_space'], ['backrest', 'material', 'seats'], ['access', 'capacity', 'surface'], ['access', 'capacity', 'fee', 'surface'], ['parking_space', 'smoothness', 'surface'], ['access', 'fee', 'orientation', 'surface'], ['capacity', 'fee', 'orientation', 'surface'], ['parking_space', 'smoothness'], ['access', 'fee', 'orientation', 'surface'], ['smoothness', 'surface'], ['access', 'capacity', 'covered', 'fee'], ['capacity', 'paving_stones:length', 'paving_stones:shape', 'paving_stones:width', 'smoothness', 'surface'], ['access', 'capacity', 'fee', 'surface'], ['brand', 'brand:wikipedia', 'check_date', 'source:date', 'w

In [11]:
#X_train, X_test, y_train, y_test = train_test_split(X, Y)
#X_val, _, y_val, _ = train_test_split(X, Y)

X_train, y_train, X_val, y_val = iterative_train_test_split(X, Y, test_size=0.2)
#X_val, y_val, _, _ = iterative_train_test_split(X, Y, test_size = 0.2)

# Just checking, ive had some issues
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_train shape: (26527, 345)
y_train shape: (26527, 345)
X_test shape: (8291, 345)
y_test shape: (8291, 345)
X_val shape: (6636, 345)
y_val shape: (6636, 345)


In [12]:
# print(" Comp ar ison time \n")
# print("Xtest shape  ", X_test.shape, "  Xtest1 shape  ", X_test1.shape)
# print("Ytest shape  ", y_test.shape, "  ytest1 shape  ", y_test1.shape, "\n")
# print("Sum Xtest  ", X_test.sum(), "  Sum Xtest1  ", X_test1.sum())
# print("Sum ytest  ", y_test.sum(), "  Sum ytest1  ", y_test1.sum(), "\n")

In [13]:
check_model = False
if check_model:
    model = RandomForestClassifier(random_state=42)

    # Define search space
    param_dist = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    # Wrap in RandomizedSearchCV
    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=20,  # Try 20 random combinations
        cv=3,       # 3-fold cross-validation
        scoring='f1_micro',  # Or accuracy, or a custom metric
        n_jobs=-1,  # Use all cores
        verbose=1
    )

    search.fit(X_train, y_train)

    # Best model after search
    best_model = search.best_estimator_
    print(best_model)

In [14]:
if not check_model:
    best_model = RandomForestClassifier(max_features='log2', n_estimators=300, random_state=42) # Thepreviousbestmodel

In [15]:
best_model.fit(X_train, y_train)
#best_model.score(X_test, y_test)

In [16]:
def find_best_thresholds(y_true, y_probs, thresholds=np.arange(0.1, 0.9, 9)):
    best_thresholds = []
    for i in range(y_true.shape[1]):
        best_f1 = 0
        best_thresh = 0.5
        for thresh in thresholds:
            preds = (y_probs[:, i] >= thresh).astype(int)
            f1 = f1_score(y_true[:, i], preds, zero_division=0)
            if f1 > best_f1:
                best_f1 = f1
                best_thresh = thresh
        best_thresholds.append(best_thresh)
    return np.array(best_thresholds)


In [17]:
Y_probs_val = best_model.predict_proba(X_val)      
prob_matrix_val = np.array([
    probs[:, 1] if probs.shape[1] > 1 else np.zeros(probs.shape[0])
    for probs in Y_probs_val]).T

thresholds = find_best_thresholds(y_val, prob_matrix_val)

In [18]:
Y_probs_test = best_model.predict_proba(X_test)
# probs_list = []
# for probs in Y_probs_test:
#     if probs.shape[1] == 2:
#         probs_list.append(probs[:, 1])  # P(class 1)
#     else:
#         # Only class 0 was seen — so class 1 probability is 0
#         probs_list.append(np.zeros(probs.shape[0]))

# rob_matrix = np.array(probs_list).T
# prob_matrix = np.array([probs[:, 1] for probs in Y_probs_test]).T
prob_matrix_test = np.array([
     probs[:, 1] if probs.shape[1] > 1 else np.zeros(probs.shape[0])
     for probs in Y_probs_test]).T

Y_preds = (prob_matrix_test >= thresholds).astype(int)
#Y_preds = (prob_matrix >= 0.01).astype(int)

In [19]:
# print("Min prob:", prob_matrix_test.min())
# print("Max prob:", prob_matrix_test.max())

# print("Min prob:", thresholds.min())
# print("Max prob:", thresholds.max())


In [20]:
# print("X_test shape:", X_test.shape)
# print("y_test shape:", y_test.shape)
# print("prob_matrix shape:", prob_matrix.shape)
# print("thresholds shape:", thresholds.shape)
# print("Y_preds shape:", Y_preds.shape)


In [21]:
print("Total labels in full dataset:", Y_preds.sum())
print("Per-tag label count:", y_test.sum(axis=0))

print("Total labels in full dataset:", Y.sum())
print("Per-tag label count:", Y.sum(axis=0))

Total labels in full dataset: 13309
Per-tag label count: [9.600e+01 3.000e+00 1.580e+02 4.950e+02 8.080e+02 1.123e+03 5.050e+02
 1.600e+01 1.000e+00 3.910e+02 3.550e+02 7.350e+02 1.000e+00 1.570e+02
 1.190e+02 6.200e+01 9.000e+00 2.100e+02 2.400e+01 8.400e+01 7.000e+00
 1.600e+01 2.000e+01 4.600e+01 2.900e+01 2.000e+01 4.900e+01 3.380e+02
 2.600e+01 8.000e+00 6.000e+00 7.700e+01 7.000e+00 1.050e+02 5.000e+01
 1.150e+02 0.000e+00 1.810e+02 1.760e+02 1.000e+00 3.400e+01 3.000e+01
 2.700e+01 3.000e+00 6.700e+01 7.000e+00 2.400e+01 1.000e+01 9.000e+00
 9.200e+01 2.500e+01 4.000e+01 2.500e+01 1.700e+01 3.000e+00 7.000e+01
 4.200e+01 3.000e+00 1.500e+01 1.100e+01 2.300e+01 7.000e+00 0.000e+00
 4.600e+01 4.300e+01 7.000e+00 0.000e+00 3.000e+00 0.000e+00 2.000e+00
 1.000e+00 0.000e+00 1.900e+01 1.900e+01 1.600e+01 4.000e+00 2.000e+01
 7.000e+00 5.000e+00 5.000e+00 2.000e+00 1.900e+01 1.400e+01 1.900e+01
 1.600e+01 1.000e+00 3.000e+00 1.000e+00 1.000e+00 0.000e+00 1.600e+01
 2.200e+01 6.000e+00

In [22]:
# Macro = average over tags, treating each equally
print(len(y_test), len(Y_preds), len(X_test))
f1 = f1_score(y_test, Y_preds, average='macro')
precision = precision_score(y_test, Y_preds, average='macro')
recall = recall_score(y_test, Y_preds, average='macro')

print("Macro F1 Score:", f1)
print("Macro Precision:", precision)
print("Macro Recall:", recall)

8291 8291 8291


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Macro F1 Score: 0.2815623881123322
Macro Precision: 0.24819259322615203
Macro Recall: 0.3874249551521193


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
for i in range(len(y_test)):
    #print(f"Prediction = {Y_preds[i]} \t Actual = {y_test[i]}")
    #print(f"length pred = {len(Y_preds[i])} \t len actual = {len(y_test[i])}")
    if len(Y_preds[i]) != len(y_test[i]):
        print("oops")
        print(i)

#print(len(Y_preds))
#print(len(y_test))

In [24]:
print(classification_report(y_test, Y_preds, zero_division=0))

              precision    recall  f1-score   support

           0       0.66      0.99      0.79        96
           1       0.00      0.00      0.00         3
           2       0.83      0.97      0.89       158
           3       0.60      0.91      0.73       495
           4       0.88      0.98      0.92       808
           5       0.89      0.99      0.93      1123
           6       0.53      0.93      0.67       505
           7       0.06      0.06      0.06        16
           8       0.00      0.00      0.00         1
           9       0.45      0.85      0.59       391
          10       0.73      0.95      0.82       355
          11       0.86      0.99      0.92       735
          12       0.00      0.00      0.00         1
          13       0.71      0.92      0.80       157
          14       0.57      0.98      0.72       119
          15       0.28      0.84      0.42        62
          16       0.22      0.44      0.30         9
          17       0.26    

In [25]:
workbook = Workbook()
workbook.remove(workbook.active) # Delete default sheet.

report = classification_report(
    y_test,
    Y_preds,
    zero_division=0,
    output_dict=True
)

workbook = convert_report2excel(
    workbook=workbook,
    report=report,
    sheet_name = "randomforest10_report"
)
workbook.save("randomforest10_report.xlsx")