This notebook contains all the code used for fetching, processing and splitting the data. The data and results can be found in the officially used data folder. This file is a cleaned-up version of the file the experiments were run in. I kept that file accessible in the legacy code folder as "traintest_legacy.ipynb", but this file contains all the useful code with none of the errant test functions and print statements. I also added more comments to this file for readability.

This file was created so that both models would have the exact same training and test data, and so that the test data would be split in the same way into x_test and y_test for both of them.

In [None]:
from pyrosm.data import sources
import pyrosm
from collections import Counter, defaultdict
import json
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import random

In [None]:
def parse_tags(val):
    """
    Normalise one raw value of the "tags" column.

    Missing values (None, NaN, or the strings "None" / "nan") and strings that
    cannot be parsed as a Python literal become an empty dict.
    Any other string is parsed with ast.literal_eval.
    Non-string values (for example an already parsed dict) are returned unchanged.
    """
    if pd.isna(val) or val in ["None", "nan", None]:
        return {}

    # Anything that is not a string has nothing left to parse
    if not isinstance(val, str):
        return val

    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError):
        # ast.literal_eval reports malformed input with these exceptions.
        # It never raises json.JSONDecodeError, so catching only that one
        # let a single bad string abort the whole parsing step.
        return {}

def create_tag_lists(pois, n):
    """
    Counts how often every tag key occurs in pois["tags"] and selects the frequent ones.

    pois: table with a "tags" column holding one dict of tags per object.
          Values that are not a dict are ignored, and so is a missing "tags" column.
    n: frequency threshold, a tag is only allowed if it occurs more than n times

    Returns all_good_tags, a list of the tags that appear more than n times (in order of first appearance)
    Returns tag2idx, a dictionary linking every allowed tag to an index number

    Also prints the frequency of every tag, the allowed tags and the size of both groups.
    """
    tag_freq = defaultdict(int)

    # Reading the column directly avoids iterrows, which builds a whole Series for every row
    tag_column = pois["tags"] if "tags" in pois else ()

    # Counting the frequency of tags
    # This was also used for creating some graphs for analysis
    for tags in tag_column:
        if isinstance(tags, dict):
            for tag_key in tags:
                tag_freq[tag_key] += 1

    # All good tags are the tags that have a higher frequency than n
    all_good_tags = [tag for tag, count in tag_freq.items() if count > n]

    # tag2idx isn't used in this notebook
    # This function was originally only for random forest, where it would also vectorize objects
    tag2idx = {tag: i for i, tag in enumerate(all_good_tags)}

    sorted_freq = dict(sorted(tag_freq.items(), key=lambda item: item[1], reverse=True))
    print(f"All tags sorted by frequency: {sorted_freq}")
    print(f" All allowed tags: {all_good_tags}")
    print(f"Len all tags: {len(tag_freq)}, Len good tags: {len(all_good_tags)}")

    # We only return these two objects
    # During testing and in other versions I have returned tag_freq or idx2tag but that wasn't needed for the experiments on random forest
    return (all_good_tags, tag2idx)

def remove_bad_tags(good_tags, pois):
    """
    This function removes every tag from pois that is not in good_tags

    good_tags: the tags that are allowed to stay
    pois: table with a "tags" column holding one dict of tags per object

    The dicts inside pois["tags"] are changed in place and nothing is returned.
    Entries that are not a dict (for example None) are left untouched.
    """
    # A set makes every membership check constant time instead of a scan through the whole list
    allowed = set(good_tags)

    for tags in pois["tags"]:
        if not isinstance(tags, dict):
            continue
        # Collect the keys first, a dict cannot change size while it is being iterated
        for bad_key in [key for key in tags if key not in allowed]:
            del tags[bad_key]
             

In [None]:
# Loading the Amsterdam data through pyrosm
# I am unsure how often pyrosm or even OSM gets updated but we fetched the data on the 15th of June 2025
# NOTE(review): get_data presumably returns the path of the local Amsterdam extract - confirm against the pyrosm docs
fp = pyrosm.get_data("Amsterdam")
osm = pyrosm.OSM(fp)
# The points of interest; every object carries its tags in the "tags" column
pois = osm.get_pois()
# Run every raw "tags" value through parse_tags so that missing or odd values end up as an empty dict
pois["tags"] = pois["tags"].apply(parse_tags) # only needs to run once after loading

In [None]:
# Frequency threshold: a tag is only allowed if it occurs more than this many times
n_good_tags = 10 

# Finding the good tags and removing all other tags from every object
# remove_bad_tags changes the dicts inside pois["tags"] in place
good_tags, tag2idx = create_tag_lists(pois, n_good_tags)
remove_bad_tags(good_tags, pois)

# Keep only the rows whose 'tags' dict still holds more than two tags (so at least three)
pois = pois[pois["tags"].apply(lambda x: isinstance(x, dict) and len(x) > 2)]


In [None]:
# Splitting our data into train and test
# Both sources of randomness are seeded so that re-running this cell produces the same files
# NOTE(review): the files in the officially used data folder were presumably created by an
# earlier run without a seed, so re-running this cell will not reproduce those exact files
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
final_train, final_test = train_test_split(pois, test_size=0.2, random_state=RANDOM_SEED)


test_questions = []
test_answers = []

# Taking one randomly chosen tag off of every object in the test set
for tags in final_test['tags']:
    tag_names = list(tags)
    held_out = random.randrange(len(tag_names))
    # The held out tag is put into answers while the rest goes into questions
    test_answers.append(tag_names[held_out])
    test_questions.append(tag_names[:held_out] + tag_names[held_out + 1:])

print("q-a", len(test_questions), len(test_answers))


# Saving the data through pickle
# The data used for the experiments is in the officially used data folder
with open('trainingset', 'wb') as train_file:
    pickle.dump(final_train, train_file)
with open('testset_questions', 'wb') as questions_file:
    pickle.dump(test_questions, questions_file)
with open('testset_answer', 'wb') as answers_file:
    pickle.dump(test_answers, answers_file)