*OpenStreetMap new tag recommendations*

possible improvements:    
-   use library pyrosm (https://pyrosm.readthedocs.io/en/latest/)    
-   write the results of queries into a file (e.g. results.json) instead of just printing them      
-   create a function that takes lines from the tsv file and deletes tags for use of evaluation     
-   create a function that takes as input an evaluation set (list of lists of strings) and just runs all the queries together    

Ideas for RQ or topics:


In [75]:
import osmium
import csv
import subprocess
import time
import pickle
import json
from sklearn.metrics import classification_report
import re
from openpyxl import Workbook
from convert_report2excel import convert_report2excel
import pandas as pd
import numpy as np

The function convert_tsv takes the path to a .osm.pbf file with geodata and converts it into a tsv file (written to filename) that the RecommenderServer can use.
It uses osmium to read the file, then, for every object that has at least one tag, writes the object's tag keys as one line of the tsv file.

In [13]:
def convert_tsv(path: str, filename: str):
    """
    Converts a .osm.pbf file with geodata from osm to a tsv file usable for RecommenderServer

    Every object read by osmium that carries at least one tag becomes one row.
    A row holds only the tag keys of that object (the values are dropped),
    separated by tabs.

    path: location of the .osm.pbf file, handed to osmium.FileProcessor
    filename: location of the tsv file to write (overwritten if it exists)
    """
    # newline="" is what the csv module asks for when handing it a file object;
    # without it every row ends in "\r\r\n" on Windows. The encoding is fixed to
    # utf-8 so tag keys with non-ASCII characters do not depend on the platform
    # default code page.
    with open(filename, "w", newline="", encoding="utf-8") as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')

        for obj in osmium.FileProcessor(path):
            if len(obj.tags) > 0:
                # Read the key straight from the tag instead of splitting
                # str(tag) on "=", which depends on the tag's string format.
                tsv_writer.writerow([tag.k for tag in obj.tags])
    # No explicit close(): the with-statement already closes the file.

The function create_tree calls convert_tsv to create a tsv, then runs the recommenderserver build-tree command to build a tree from that.

In [14]:
def create_tree(path_to_source_file: str, tsvfilename: str, path_to_server_dir: str):
    """
    Calls convert_tsv and then creates a tree with it

    path_to_source_file: .osm.pbf file to convert
    tsvfilename: tsv file to write; a relative name is resolved against the
        current working directory before it is handed to the server
    path_to_server_dir: directory the RecommenderServer command is run from
    """
    # Local import: os is not among the notebook's top-level imports.
    import os

    convert_tsv(path_to_source_file, tsvfilename)

    # The build command runs with cwd=path_to_server_dir, so it needs an
    # absolute path to the tsv. os.path.abspath replaces the former
    # `cmd /c cd` subprocess, which only worked on Windows and produced a
    # wrong path when tsvfilename was already absolute.
    tsv_path = os.path.abspath(tsvfilename)
    subprocess.run(['RecommenderServer', 'build-tree', 'from-tsv', tsv_path], cwd=path_to_server_dir)

def create_tree_train_set(training_set:dict, tsvfilename:str, path_to_server_dir: str):
    """
    Converts to tsv and creates a tree
    Takes as input a pois["tags"] column of a pandas geodata object
    Meant for use in final experiments

    training_set: iterated element by element; each element must offer
        .keys() (despite the `dict` annotation, the notebook passes a column
        whose entries are tag dictionaries). Only the keys are written.
    tsvfilename: tsv file to write; a relative name is resolved against the
        current working directory before it is handed to the server
    path_to_server_dir: directory the RecommenderServer command is run from
    """
    # Local import: os is not among the notebook's top-level imports.
    import os

    # newline="" avoids the doubled "\r" the csv module otherwise produces on
    # Windows; utf-8 keeps non-ASCII tag keys independent of the code page.
    with open(tsvfilename, "w", newline="", encoding="utf-8") as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for obj in training_set:
            listed = list(obj.keys())
            # Objects without any tag would only produce an empty row.
            if listed:
                tsv_writer.writerow(listed)
    # No explicit close(): the with-statement already closes the file.

    # The build command runs with cwd=path_to_server_dir, so it needs an
    # absolute path; this replaces the Windows-only `cmd /c cd` subprocess.
    tsv_path = os.path.abspath(tsvfilename)
    subprocess.run(['RecommenderServer', 'build-tree', 'from-tsv', tsv_path], cwd=path_to_server_dir)


The query function queries a recommender tree that was already built from the file tsvfilename and lives in the RecommenderServer directory path_to_server_dir. It sends one list of properties and prints the n most probable recommendations.

In [15]:
def query(tsvfilename: str, path_to_server_dir: str, property_list: list[str], n:int = 1):
    """
    Opens a recommenderserver and queries it with a property list.

    tsvfilename: name the tree was built from; the server is started on
        tsvfilename + '.schemaTree.typed.pb'
    path_to_server_dir: directory the RecommenderServer command is run from
    property_list: either a JSON array already serialised to a string (what
        the notebook passes) or a real list of strings, which is serialised here
    n: number of recommendations to print

    Prints the first n recommendation fragments of the raw response; returns
    nothing. Raises ConnectionError when the server never becomes reachable.
    """
    # Local imports: urllib is not among the notebook's top-level imports.
    import urllib.error
    import urllib.request

    if not isinstance(property_list, str):
        property_list = json.dumps(list(property_list))
    body = '{"properties": ' + property_list + ',"types":[]}'
    # Plain HTTP from Python instead of a PowerShell one-liner: no quoting
    # problems when a property contains a quote, and not tied to Windows.
    request = urllib.request.Request(
        "http://localhost:8080/recommender",
        data=body.encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    open_server = subprocess.Popen(['RecommenderServer', 'serve', tsvfilename + '.schemaTree.typed.pb'], cwd= path_to_server_dir)
    try:
        output_string = None
        # Poll until the freshly started server accepts connections instead
        # of hoping a fixed one-second sleep is long enough.
        for _ in range(20):
            try:
                with urllib.request.urlopen(request, timeout=30) as response:
                    output_string = response.read().decode("utf-8")
                break
            except urllib.error.HTTPError:
                # The server answered with an error status; retrying won't help.
                raise
            except OSError:
                time.sleep(0.5)
        if output_string is None:
            raise ConnectionError("RecommenderServer did not answer on http://localhost:8080/recommender")

        # Splitting on "{" leaves one fragment per recommendation from index 2
        # on; index 0 is the text before the outer object, index 1 its header.
        recommendations_list = output_string.split("{")
        for i in recommendations_list[2:n+2]:
            # A possible improvement is to not print, but store it (perhaps write it in a file)
            print(i)
    finally:
        # Always stop the server, even when the request or parsing failed,
        # and wait for it so the port is released before returning.
        open_server.terminate()
        open_server.wait()


In [16]:
def multiquery_test(tsvfilename: str, path_to_server_dir: str, query_list: list[list[str]], n:int = 1) -> list[str]:
    """
    Debugging variant of multiquery.
    Opens a server and queries it multiple times without closing the server
    Stores the query results in order and prints a trace for every query:
    the raw response, the query, the stored predictions, the output of a
    separate query() call and, when available, the expected answer.

    query_list: each entry is either a JSON array serialised to a string
        (what the notebook passes) or a real list of strings
    n: number of recommendations stored per query; every query contributes
        exactly n entries, missing ones are filled with the string "None"

    Returns the flat list of stored predictions.
    Raises ConnectionError when the server never becomes reachable.
    """
    # Local imports: urllib is not among the notebook's top-level imports.
    import urllib.error
    import urllib.request

    def _post(property_list) -> str:
        """POST one property list to the running server, return the raw body."""
        if not isinstance(property_list, str):
            property_list = json.dumps(list(property_list))
        body = '{"properties": ' + property_list + ',"types":[]}'
        request = urllib.request.Request(
            "http://localhost:8080/recommender",
            data=body.encode("utf-8"),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        # Retry while the server is still starting; the old code slept once
        # and then parsed the same failed output anyway.
        for _ in range(20):
            try:
                with urllib.request.urlopen(request, timeout=30) as response:
                    return response.read().decode("utf-8")
            except urllib.error.HTTPError:
                raise
            except OSError:
                print("Taking a nap")
                time.sleep(0.5)
        raise ConnectionError("RecommenderServer did not answer on http://localhost:8080/recommender")

    # The notebook keeps the expected answers in a module-level `answers`;
    # look it up defensively so the function also runs before it is loaded.
    expected_answers = globals().get("answers")

    open_server = subprocess.Popen(['RecommenderServer', 'serve', tsvfilename + '.schemaTree.typed.pb'], cwd= path_to_server_dir)
    response_list = []
    try:
        for i, property_list in enumerate(query_list):
            raw_response = _post(property_list)
            print(raw_response)
            parsed = json.loads(raw_response)

            predicted = []
            # `or []` covers a null recommendations field.
            for rec in (parsed["recommendations"] or [])[:n]:
                # rec has to be checked before it is subscripted.
                if rec is None or rec["property"] is None:
                    predicted.append("None")
                    print("None")
                else:
                    predicted.append(rec["property"])
            # Keep the output aligned with query_list: n entries per query.
            predicted += ["None"] * (n - len(predicted))
            response_list.extend(predicted)

            print(i)
            print(property_list)
            print(predicted)
            print("\n")
            print(i)
            # NOTE(review): query() launches its own server process while the
            # one above is still running - confirm this is intended.
            query(tsvfilename, path_to_server_dir, property_list)
            if expected_answers is not None and i < len(expected_answers):
                print("axtual ansewer", expected_answers[i])
    finally:
        # Always stop the server and wait until it has exited.
        open_server.terminate()
        open_server.wait()

    return response_list


In [17]:
def multiquery(tsvfilename: str, path_to_server_dir: str, query_list: list[list[str]], n:int = 1) -> list[str]:
    """
    Opens a server and queries it multiple times without closing the server
    Stores the query results in order

    tsvfilename: name the tree was built from; the server is started on
        tsvfilename + '.schemaTree.typed.pb'
    path_to_server_dir: directory the RecommenderServer command is run from
    query_list: each entry is either a JSON array serialised to a string
        (what the notebook passes) or a real list of strings
    n: number of recommendations stored per query

    Returns a flat list with exactly n entries per query, in query order.
    A missing or null recommendation is stored as the string "None", so the
    result stays aligned with query_list (entry k*n belongs to query k).
    Raises ConnectionError when the server never becomes reachable.
    """
    # Local imports: urllib is not among the notebook's top-level imports.
    import urllib.error
    import urllib.request

    def _post(property_list) -> str:
        """POST one property list to the running server, return the raw body."""
        if not isinstance(property_list, str):
            property_list = json.dumps(list(property_list))
        body = '{"properties": ' + property_list + ',"types":[]}'
        request = urllib.request.Request(
            "http://localhost:8080/recommender",
            data=body.encode("utf-8"),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        # Retry while the server is still starting. The previous version
        # slept once and then parsed the same failed output, which made
        # json.loads raise.
        for _ in range(20):
            try:
                with urllib.request.urlopen(request, timeout=30) as response:
                    return response.read().decode("utf-8")
            except urllib.error.HTTPError:
                # The server answered with an error status; retrying won't help.
                raise
            except OSError:
                print("Taking a nap")
                time.sleep(0.5)
        raise ConnectionError("RecommenderServer did not answer on http://localhost:8080/recommender")

    open_server = subprocess.Popen(['RecommenderServer', 'serve', tsvfilename + '.schemaTree.typed.pb'], cwd= path_to_server_dir)
    response_list = []
    try:
        for property_list in query_list:
            parsed = json.loads(_post(property_list))

            predicted = []
            # `or []` covers a null recommendations field.
            for rec in (parsed["recommendations"] or [])[:n]:
                if rec is None or rec["property"] is None:
                    predicted.append("None")
                    print("None")
                else:
                    predicted.append(rec["property"])
            # A query without (enough) recommendations used to add nothing,
            # shifting every later prediction against its answer.
            predicted += ["None"] * (n - len(predicted))
            response_list.extend(predicted)
    finally:
        # Stop the server even on errors and wait for it to exit, so the
        # next call can start a fresh server without the old one lingering.
        open_server.terminate()
        open_server.wait()

    return response_list


Testing the code and running early experiments

In [18]:
# Change for use:

# Path to a geodata file (.osm.pbf format)
path_to_source_file = 'C:/Users/jotan/Downloads/groningen-latest.osm.pbf'
# What to call your file (and your tree)
tsvfilename = "groningen.tsv"
# Path to the RecommenderServer folder
path_to_server_dir = 'C:/Users/jotan/SchoolStuffs/2024-25/BachelorProject/RecommenderServer'

# For querying: every query is a JSON array of tag keys, serialised to a string.
# Building them with json.dumps guarantees valid JSON (the hand-written
# version of example_q3 carried a stray tab character).
# NOTE(review): "traffic sign" contains a space - confirm the intended key.
example_q1 = json.dumps(["name", "traffic sign", "type"])
example_q2 = json.dumps(["type"])
example_q3 = json.dumps(["brand", "brand:wikidata", "brand:wikipedia", "operator:wikidata"])

# Several queries to be sent to one server instance
multi_q1 = [example_q1, example_q2]
# Matches a "{...}" span anywhere in a text; re.S lets "." cross line breaks.
_JSON_RE = re.compile(r"\{.*\}", re.S)

In [19]:
# Convert the source file to tsvfilename and build the recommender tree from it
create_tree(path_to_source_file, tsvfilename, path_to_server_dir)

In [20]:
# Print the five most probable recommendations for two of the example queries
for example in (example_q3, example_q2):
    query(tsvfilename, path_to_server_dir, example, 5)

# Send all queries of multi_q1 to a single server instance and keep the top answer of each
responses = multiquery(tsvfilename, path_to_server_dir, multi_q1, 1)
print(responses)

"property":"name","probability":1},
"property":"operator","probability":1},
"property":"amenity","probability":0.5616438356164384},
"property":"network","probability":0.5068493150684932},
"property":"network:wikidata","probability":0.4931506849315068},
"property":"ref","probability":0.43542393874563673},
"property":"network","probability":0.4324963405021957},
"property":"route","probability":0.4317081409751154},
"property":"network:type","probability":0.36223398265961043},
"property":"source","probability":0.2830762301542619},
['operator', 'ref']


Running the experiments

In [21]:
# File names used for the trees in the experiments
newtsvfilename = "amsterdam.tsv"
testtsv = "test.tsv"

# NOTE: pickle.load runs whatever the file tells it to; only load files you created yourself.

# Training set: used to build the tree
with open('trainingset', 'rb') as fp:
    trainingloaded = pickle.load(fp)

# Test set: the questions are sent as queries ...
with open('testset_questions', 'rb') as fp:
    questions = pickle.load(fp)

# ... and the answers are what the predictions are checked against
with open('testset_answer', 'rb') as fp:
    answers = pickle.load(fp)

In [22]:
# Write the tag keys of the training set to testtsv and build the tree from it
create_tree_train_set(trainingloaded["tags"], testtsv, path_to_server_dir)

In [23]:
# Send a single query to check if the tree works:
# prints the five most probable recommendations for example_q3 from the testtsv tree
query(testtsv, path_to_server_dir, example_q3, 5)

"property":"operator:wikipedia","probability":0.6875},
"property":"wheelchair","probability":0.2708333333333333},
"property":"fee","probability":0.19444444444444445},
"property":"source:date","probability":0.1875},
"property":"self_service","probability":0.16666666666666666},


In [24]:
# Creating a multiquery from our test questions set
questions_multi = [json.dumps(q) for q in questions]
len(questions_multi)


8291

In [25]:
#response = multiquery_test(testtsv, path_to_server_dir, questions_multi)

In [26]:
# Inspect the raw questions and how one of them looks after serialisation
print(questions)
print(questions_multi[1])
# NOTE(review): this queries the tree of newtsvfilename ("amsterdam.tsv"), while the
# cells above only build the tree for testtsv - confirm that tree exists.
query(newtsvfilename, path_to_server_dir, questions_multi[0])
# repr shows the exact string, including quotes, that is sent as the query
print(repr(questions_multi[1]))


[['recycling:glass_bottles', 'recycling_type'], ['access', 'smoothness', 'surface'], ['parking_space', 'smoothness', 'surface'], ['access', 'orientation', 'surface'], ['brand', 'brand:website', 'brand:wikidata', 'outdoor_seating', 'self_service', 'source:date'], ['location', 'recycling:glass_bottles', 'recycling:paper'], ['smoothness', 'surface'], ['access', 'parking_space'], ['backrest', 'material', 'seats'], ['access', 'capacity', 'surface'], ['access', 'capacity', 'fee', 'surface'], ['parking_space', 'smoothness', 'surface'], ['access', 'fee', 'orientation', 'surface'], ['capacity', 'fee', 'orientation', 'surface'], ['parking_space', 'smoothness'], ['access', 'fee', 'orientation', 'surface'], ['smoothness', 'surface'], ['access', 'capacity', 'covered', 'fee'], ['capacity', 'paving_stones:length', 'paving_stones:shape', 'paving_stones:width', 'smoothness', 'surface'], ['access', 'capacity', 'fee', 'surface'], ['brand', 'brand:wikipedia', 'check_date', 'source:date', 'wheelchair'], ['

In [27]:
# One list of predictions per batch; flattened in a later cell
full_results = []

# Query in batches of 100; every multiquery call starts and stops its own server
for i in range(0, len(questions_multi), 100):
    # Progress output: index range of the current batch
    print(i, i+100)
    batch_results = multiquery(testtsv, path_to_server_dir, questions_multi[i:i+100])
    #break
    full_results.append(batch_results)
# Only the last batch is shown as a sanity check
print(batch_results)
print(len(batch_results))

0 100
100 200
200 300
300 400
400 500
500 600
600 700
700 800
800 900
900 1000
1000 1100
1100 1200
1200 1300
1300 1400
1400 1500
1500 1600
1600 1700
1700 1800
1800 1900
1900 2000
2000 2100
2100 2200
2200 2300
2300 2400
2400 2500
2500 2600
2600 2700
2700 2800
2800 2900
2900 3000
3000 3100
3100 3200
3200 3300
3300 3400
3400 3500
3500 3600
3600 3700
3700 3800
3800 3900
3900 4000
4000 4100
4100 4200
4200 4300
4300 4400
4400 4500
4500 4600
4600 4700
4700 4800
4800 4900
4900 5000
5000 5100
5100 5200
5200 5300
5300 5400
5400 5500
5500 5600
5600 5700
5700 5800
5800 5900
5900 6000
6000 6100
6100 6200
6200 6300
6300 6400
6400 6500
6500 6600
6600 6700
6700 6800
6800 6900
6900 7000
7000 7100
7100 7200
7200 7300
7300 7400
7400 7500
7500 7600
7600 7700
7700 7800
7800 7900
7900 8000
8000 8100
8100 8200
8200 8300
['recycling:clothes', 'access', 'capacity', 'surface', 'access', 'surface', 'fee', 'orientation', 'smoothness', 'brand:wikidata', 'lit', 'recycling:paper', 'parking_space', 'brand:wikipedia',

In [33]:
# Set to True to overwrite the stored predictions with the ones computed above
save_predictions = False
if save_predictions:
    with open('predicted_answers_schematree_test', 'wb') as fb:
        pickle.dump(full_results, fb)

# Reload the stored predictions; this replaces whatever full_results held before
with open('predicted_answers_schematree_test', 'rb') as fb:
    full_results = pickle.load(fb)


In [68]:
# Flatten the per-batch predictions into one list.
all_results = []

for batch in full_results:
    all_results.extend(batch)
    # Batches shorter than 100 are filled up with "None". The padding goes
    # into all_results only; the previous `i += [...]` changed the lists
    # inside full_results as well.
    all_results.extend(["None"] * (100 - len(batch)))

In [74]:
print(len(all_results))
print(len(answers))
print(len(batch_results))
# all_results is padded to full batches, so cut it to the number of answers
# (instead of a hard-coded 8291). zero_division=0 reports 0 for labels without
# predictions or support without emitting a warning for each of them.
print(classification_report(answers, all_results[:len(answers)], zero_division=0))

8300
8291
91
                                 precision    recall  f1-score   support

                           None       0.00      0.00      0.00         0
                         access       0.60      0.62      0.61       505
               air_conditioning       0.00      0.00      0.00         2
                        airside       0.00      0.00      0.00         3
                       alt_name       0.17      0.05      0.08        20
                       amperage       0.60      0.75      0.67         4
                      architect       0.00      0.00      0.00         1
                           area       0.00      0.00      0.00         4
                        armrest       1.00      0.33      0.50         6
               artist:wikipedia       0.00      0.00      0.00         3
                    artist_name       0.65      0.71      0.68        42
                artwork_subject       0.00      0.00      0.00         3
                   artwork_type      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# workbook = Workbook()
# workbook.remove(workbook.active) # Delete default sheet.

# report = classification_report(
#     answers,
#     all_results[:8291],
#     zero_division=0,
#     output_dict=True
# )

# workbook = convert_report2excel(
#     workbook=workbook,
#     report=report,
#     sheet_name = "schematree1_report"
# )
# workbook.save("schematree1_report.xlsx")

In [32]:
# Print every prediction next to its expected answer.
# The predictions are stored per batch while answers is one flat list, so the
# batches are flattened first; indexing answers with the position inside a
# batch compared every batch against the first answers only.
flat_predictions = [prediction for batch in full_results for prediction in batch]
for prediction, answer in zip(flat_predictions, answers):
    print(prediction, answer)

recycling:paper recycling:glass
parking_space parking_space
maxstay:conditional access
capacity capacity
ref:rustpunt ref:rustpunt
recycling_type recycling_type
parking_space parking_space
surface capacity
colour colour
orientation orientation
orientation orientation
maxstay:conditional access
capacity capacity
access access
surface surface
capacity capacity
parking_space parking_space
surface surface
access access
orientation orientation
brand:wikidata brand:wikidata
brand:wikidata brand:wikidata
indoor payment:contactless
surface surface
source:date wheelchair
service:bicycle:second_hand check_date
parking_space parking_space
surface fee
colour colour
operator:wikidata operator:wikidata
brand:wikidata brand:wikidata
material material
source:date wikimedia_commons
description description
map_type map_type
surface surface
smoothness smoothness
brand:website brand:website
source:date check_date
operator:website wikimedia_commons
material material
smoothness smoothness
access access
park