*Openstreetmap new tag recommendations*

possible improvements:    
-   use library pyrosm (https://pyrosm.readthedocs.io/en/latest/)    
-   write the results of queries into a file (results.json or smth) instead of just printing      
-   create a function that takes lines from the tsv file and deletes tags for use of evaluation     
-   create a function that takes as input an evaluation set (list of lists of strings) and just runs all the queries together    

Ideas for RQ or topics:


In [141]:
import osmium
import csv
import subprocess
import time
import pickle
import json
from sklearn.metrics import classification_report

The function convert_tsv takes the path to a .osm.pbf file with geodata and will convert it into a tsv file (called filename) usable for the recommenderserver.
It uses osmium to open it, then takes all the points with tags and adds the tags in a new line.

In [142]:
def convert_tsv(path: str, filename: str):
    """
    Converts a .osm.pbf file with geodata from osm to a tsv file usable for RecommenderServer
    """
    with open(filename, "w") as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        

        for obj in osmium.FileProcessor(path):
            if len(obj.tags) > 0:
                object = []
                for i in obj.tags:
                    i = str(i).split("=")
                    object.append(i[0])
                tsv_writer.writerow(object)
        
        tsvfile.close()

The function create_tree calls convert_tsv to create a tsv, then runs the recommenderserver build-tree command to build a tree from that.

In [143]:
def create_tree(path_to_source_file: str, tsvfilename: str, path_to_server_dir: str):
    """
    Calls convert_tsv and and then creates a tree with it
    """
    convert_tsv(path_to_source_file, tsvfilename)
    result = subprocess.run(['cmd', '/c', 'cd'], capture_output=True, text=True)
    subprocess.run(['RecommenderServer', 'build-tree', 'from-tsv', result.stdout.strip() + '/' + tsvfilename], cwd= path_to_server_dir)

def create_tree_train_set(training_set:dict, tsvfilename:str, path_to_server_dir: str):
    """
    Converts to tsv and creates a tree
    Takes as input a pois["tags"] column of a pandas geodata object
    Meant for use in final experiments
    """
    
    with open(tsvfilename, "w") as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for obj in training_set:
            listed = list(obj.keys())
            if len(listed) > 0:
                tsv_writer.writerow(listed)
        
        tsvfile.close()
    result = subprocess.run(['cmd', '/c', 'cd'], capture_output=True, text=True)
    subprocess.run(['RecommenderServer', 'build-tree', 'from-tsv', result.stdout.strip() + '/' + tsvfilename], cwd= path_to_server_dir)


The query function will query a recommender tree that was already created from file filename, and in the recommenderserver directory path_to_server_dir. It will query a list of properties, and print the n most probable recommendations

In [144]:
def query(tsvfilename: str, path_to_server_dir: str, property_list: list[str], n:int = 1):
    """
    Opens a recommenderserver and queries it with a property list. 
    n: number of recommendations to print
    """
    open_server = subprocess.Popen(['RecommenderServer', 'serve', tsvfilename + '.schemaTree.typed.pb'], cwd= path_to_server_dir)
    time.sleep(1)
    powershell_command = """
    $body = '{"properties": """ + property_list + ""","types":[]}'
    $response = Invoke-WebRequest -Uri "http://localhost:8080/recommender" -Method POST -Body $body -ContentType "application/json"
    $response.Content
    """
    result = subprocess.run(["powershell", "-Command", powershell_command], capture_output=True, text=True)
        
    output_string = result.stdout
    recommendations_list = output_string.split("{")
    for i in recommendations_list[2:n+2]:
        # A possible improvement is to not print, but store it (perhaps write it in a file)
        print(i)
    
    open_server.terminate()


In [145]:
def multiquery(tsvfilename: str, path_to_server_dir: str, query_list: list[list[str]], n:int = 1) -> list[str]:
    """
    Opens a server and queries it multiple times without closing the server
    Stores the query results in order
    This function in its current form only works for n = 1 (if it only takes the first and most likely response)
    """

    open_server = subprocess.Popen(['RecommenderServer', 'serve', tsvfilename + '.schemaTree.typed.pb'], cwd= path_to_server_dir)
    response_list = []

    for property_list in query_list:
        
        #time.sleep(.5)
        powershell_command = """
        $body = '{"properties": """ + property_list + ""","types":[]}'
        $response = Invoke-WebRequest -Uri "http://localhost:8080/recommender" -Method POST -Body $body -ContentType "application/json"
        $response.Content
        """
        result = subprocess.run(["powershell", "-Command", powershell_command], capture_output=True, text=True)
        
        parsed = json.loads(result.stdout)
        for rec in parsed["recommendations"][:n]:
            if rec["property"] is None:
                response_list.append("None")
                print("None")
            else: 
                response_list.append(rec["property"])
                #print(rec['property'])
        # output_string = result.stdout
        # recommendations_list = output_string.split(":")
        # print(output_string)
        # for i in recommendations_list[2:n+2]:
        #     print("Querying", i)
        #     response_list.append(i)
        
    open_server.terminate()
    
    return response_list


Testing the code and running early experiments

In [146]:
# Change for use:

# Path to a geodata file (.osm.pbf format)
path_to_source_file = 'C:/Users/jotan/Downloads/groningen-latest.osm.pbf'    
# What to call your file (and your tree)
tsvfilename = "groningen.tsv"
# Path to the RecommenderServer folder
path_to_server_dir = 'C:/Users/jotan/SchoolStuffs/2024-25/BachelorProject/RecommenderServer'

# For querying: must be a stringed lists of strings
example_q1 = '["name", "traffic sign", "type"]'
example_q2 = '["type"]'
example_q3 = '["brand",	"brand:wikidata", "brand:wikipedia", "operator:wikidata"]'

multi_q1 = [example_q1, example_q2]

In [147]:
create_tree(path_to_source_file, tsvfilename, path_to_server_dir)

In [148]:
query(tsvfilename, path_to_server_dir, example_q3, 5)
# query(tsvfilename, path_to_server_dir, example_q2, 5)

# responses = multiquery(tsvfilename, path_to_server_dir, multi_q1, 1)
# print(responses)

"property":"name","probability":1},
"property":"operator","probability":1},
"property":"amenity","probability":0.5616438356164384},
"property":"network","probability":0.5068493150684932},
"property":"network:wikidata","probability":0.4931506849315068},


Running the experimetns

In [149]:
# Importing the training set to create the tree
with open('trainingset', 'rb') as fp:
    trainingloaded = pickle.load(fp)
# And the test set for querying and answers for checking
with open('testset_questions', 'rb') as fp:
    questions = pickle.load(fp)
with open('testset_answer', 'rb') as fp:
    answers = pickle.load(fp)

newtsvfilename = "amsterdam.tsv"

In [150]:
create_tree_train_set(trainingloaded["tags"], newtsvfilename, path_to_server_dir)

In [151]:
# Send a single query to check if the tree works
query(newtsvfilename, path_to_server_dir, example_q3, 5)

"property":"operator","probability":1},
"property":"name","probability":1},
"property":"amenity","probability":0.5616438356164384},
"property":"network","probability":0.5068493150684932},
"property":"network:wikidata","probability":0.4931506849315068},


In [152]:
# Creating a multiquery from our test questions set
questions_multi = [json.dumps(q) for q in questions]
len(questions_multi)


8291

In [153]:
full_results = []

for i in range(0, len(questions_multi), 100):
    print(i, i+100)
    batch_results = multiquery(newtsvfilename, path_to_server_dir, questions_multi[i:i+100])
    #break
    full_results.append(batch_results)
print(batch_results)
print(len(batch_results))

0 100
100 200
200 300
300 400
400 500
500 600
600 700
700 800
800 900
900 1000
1000 1100
1100 1200
1200 1300
1300 1400
1400 1500
1500 1600
1600 1700
1700 1800
1800 1900
1900 2000
2000 2100
2100 2200
2200 2300
2300 2400
2400 2500
2500 2600
2600 2700
2700 2800
2800 2900
2900 3000
3000 3100
3100 3200
3200 3300
3300 3400
3400 3500
3500 3600
3600 3700
3700 3800
3800 3900
3900 4000
4000 4100
4100 4200
4200 4300
4300 4400
4400 4500
4500 4600
4600 4700
4700 4800


JSONDecodeError: Expecting value: line 2 column 1 (char 1)

In [None]:
# with open('predicted_answers_schematree', 'wb') as fb:
#     pickle.dump(full_results, fb)

In [154]:
all_results = []

for i in full_results:
    all_results += i

In [155]:
print(len(all_results))
print(len(answers))
print(len(batch_results))
print(classification_report(answers[:97], batch_results[:97]))

4587
8291
98
                         precision    recall  f1-score   support

                 access       0.00      0.00      0.00       5.0
       addr:housenumber       0.00      0.00      0.00       0.0
                amenity       0.00      0.00      0.00       0.0
           artwork_type       0.00      0.00      0.00       1.0
                bicycle       0.00      0.00      0.00       0.0
          brand:website       0.00      0.00      0.00       1.0
         brand:wikidata       0.00      0.00      0.00       4.0
                    bus       0.00      0.00      0.00       0.0
               capacity       0.00      0.00      0.00       7.0
             check_date       0.00      0.00      0.00       2.0
                 colour       0.00      0.00      0.00       2.0
                covered       0.00      0.00      0.00       1.0
                cuisine       0.00      0.00      0.00       1.0
            description       0.00      0.00      0.00       1.0
           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print(answers[0])
query(newtsvfilename, path_to_server_dir, json.dumps(questions[0]), 5)

recycling:glass
"property":"amenity","probability":1},
"property":"recycling:paper","probability":0.35},
"property":"opening_hours","probability":0.2},
"property":"recycling:clothes","probability":0.15833333333333333},
"property":"check_date:recycling","probability":0.11666666666666667},


In [None]:
for i in full_results:
    for j in range(len(i)):
        print(i[j], answers[j])