In [1]:
import pandas as pd
import numpy as np
import json
import glob

# Parsing polygon shaped poins of interest (mostly buildings)

In [140]:
# Writing all data from OSM "ways" into one dataframe
all_way_files = glob.glob('poi_data/' + '/*way.json')

li = []

for filename in all_way_files:
    data = json.load(open(filename))
    df = pd.DataFrame(data["elements"])
    li.append(df)

ways_df = pd.concat(li, axis=0, ignore_index=True)

ways_df

Unnamed: 0,type,id,nodes,tags,lat,lon
0,way,322936864,"[943686247, 3297316598, 3297316599, 3297316597...","{'amenity': 'atm', 'building': 'yes', 'name': ...",,
1,way,502703971,"[4930919833, 4930919832, 2776837140, 493091983...","{'amenity': 'atm', 'brand': 'Euronet', 'brand:...",,
2,way,601748205,"[5718064674, 5718064673, 5718064672, 571806467...","{'amenity': 'atm', 'building': 'yes'}",,
3,way,628330908,"[6286985933, 6286985934, 6286985935, 628698593...","{'amenity': 'atm', 'building': 'yes'}",,
4,node,4476883190,,,52.519416,13.407335
...,...,...,...,...,...,...
37223,node,6910636867,,,52.202738,13.092623
37224,node,6910636868,,,52.202741,13.092537
37225,node,9338022401,,,52.417329,13.625106
37226,node,9338022402,,,52.417332,13.625071


In [141]:
# Dropping duplicates (converting nodes and tags to strings first, because lists and dicts aren't hashable)
ways_df['tags_string'] = ways_df['tags'].astype(str)
ways_df['nodes_string'] = ways_df['nodes'].astype(str)
ways_df = ways_df.drop_duplicates(['nodes_string', 'tags_string', 'lat', 'lon'])

ways_df.shape


(19352, 8)

In [142]:
# Creating a subset of the df for easier coding
ways_df = ways_df.iloc[0:300,:]

### Extracting all the nodes that are part of a polygon by comparing the ids

In [143]:
# creating an empty df to start with
polygon_nodes_df = pd.DataFrame({'type': [],'id':[],'nodes':[],'tags':[],'lat':[],'lon':[]})

# filtering for all the polygons
polygon_df = ways_df[ways_df['type'] == 'way']

# appending all nodes with ids matching a polygon into a new df
for list in polygon_df['nodes']:
    for node_id in list:
        new_df = ways_df[ways_df['id'] == node_id]
        new_df['polygon_id'] = polygon_df[polygon_df['nodes'].apply(lambda x: x == list)]['id'].item() # storing the polygon_id for each node
        new_df['tags'] = polygon_df[polygon_df['nodes'].apply(lambda x: x == list)]['tags_string'].item() # storing the polygon tags for each node
        polygon_nodes_df = pd.concat([new_df, polygon_nodes_df])

polygon_nodes_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['polygon_id'] = polygon_df[polygon_df['nodes'].apply(lambda x: x == list)]['id'].item() # storing the polygon_id for each node
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = polygon_df[polygon_df['nodes'].apply(lambda x: x == list)]['tags_string'].item() # storing the polygon tags for each node


Unnamed: 0,type,id,nodes,tags,lat,lon,tags_string,nodes_string,polygon_id
308,node,6.341448e+09,,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.535762,13.492009,,,982009425.0
309,node,6.341448e+09,,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.535788,13.492029,,,982009425.0
310,node,6.341448e+09,,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.535795,13.492140,,,982009425.0
311,node,6.341448e+09,,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.535787,13.492146,,,982009425.0
312,node,6.341448e+09,,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.535780,13.492154,,,982009425.0
...,...,...,...,...,...,...,...,...,...
27,node,9.436862e+08,,"{'amenity': 'atm', 'building': 'yes', 'name': ...",52.443939,13.530523,,,322936864.0
28,node,3.297317e+09,,"{'amenity': 'atm', 'building': 'yes', 'name': ...",52.443972,13.530583,,,322936864.0
30,node,3.297317e+09,,"{'amenity': 'atm', 'building': 'yes', 'name': ...",52.444072,13.530439,,,322936864.0
29,node,3.297317e+09,,"{'amenity': 'atm', 'building': 'yes', 'name': ...",52.444040,13.530378,,,322936864.0


In [144]:
# Shaping the df into the same format as all_nodes_uncleaned.csv, so that it fits into Emilio's before created data cleaning procedure
polygon_nodes_df.drop(['id','nodes', 'tags_string', 'nodes_string'], axis=1, inplace=True)
polygon_nodes_df.rename({'polygon_id': 'id'}, axis=1, inplace=True)
polygon_nodes_df.sort_values(by=['id'], inplace=True)

polygon_nodes_df

Unnamed: 0,type,tags,lat,lon,id
119,node,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.433261,13.191137,35739760.0
121,node,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.433283,13.191331,35739760.0
125,node,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.433377,13.191304,35739760.0
123,node,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.433357,13.191103,35739760.0
126,node,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.433382,13.191047,35739760.0
...,...,...,...,...,...
322,node,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.535724,13.491996,982009425.0
323,node,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.535752,13.492045,982009425.0
308,node,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.535762,13.492009,982009425.0
312,node,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",52.535780,13.492154,982009425.0
