In [1]:
import pandas as pd
import numpy as np
import json
import glob

In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

#### If you don't have a merged poi dataset to read in, read in uncleaned Points of Interest data and merge into a single pandas dataframe:

In [3]:
#this code assumes you have the raw data in a folder titled 'poi_data', which should be in the same folder as this file
all_node_files = glob.glob('poi_data/' + '/*node.json')

li = []

for filename in all_node_files:
    data = json.load(open(filename))
    df = pd.DataFrame(data["elements"])
    li.append(df)

all_nodes = pd.concat(li, axis=0, ignore_index=True)

# Cleaning Points of Interest data

#### Exploring the data (first pass):

In [5]:
all_nodes.isna().sum()

type    0
id      0
lat     0
lon     0
tags    0
dtype: int64

There are no missing values

## Extracting relevant features

#### Extracting the name of each POI and adding it as a separate feature 

In [8]:
# extracting the 'name' data and adding it to a new 'name' column 
def extract_name(df):
    new_df = df
    for i, dict in enumerate(new_df.tags):
        try:
            all_nodes.loc[i, 'name'] = dict['name']
        except:
            pass
    return new_df


#### Extracting relevant categories

In [7]:
"""
Function to extract categories from the 'tags' column of any version of the dataframe (must be parsed in as argument)

It contains the dictionary and loop from earlier category extraction.

A 'tags' column in its original raw form is a prerequisite. 
Pandas must be imported as pd and numpy as np.

Looping through keys should be enough, so I have added the extra lines back in.

I have included a line that will record 'historic: yes' categories as 'historic' rather than 'yes'.

"""

def extract_categories(df):
    
    # make a copy of the dataframe
    new_df = df
    
    # create a new 'category' column / fill it with empty lists
    new_df['category'] = np.empty((len(new_df), 0)).tolist()
    
    # dictionary of category key/value pairs that were scraped
    categories = {'amenity': ['atm',
                          'bar',
                          'bench',
                          'cafe',
                          'drinking_water',
                          'ice_cream',
                          'nightclub',
                          'restaurant'
                         ],
              
              'historic': ['yes'], # fixed bug to put all values into lists to avoid looping on strings
              'leisure': ['picnic_table'],
              'natural': ['tree'],
              'shop': ['convenience'],
              'tourism': ['art_work',
                          'attraction',
                          'gallery',
                          'museum',
                          'viewpoint',
                          ] 
              }

    # for the new df copy, loop through the 'tags' column, keeping track of the row index
    for index, row in enumerate(new_df.tags):        
        # for each row, loop through every key in the 'categories' dictionary
        for key in categories:
            # if the row contains a matching key
            if key in row:
                # if the key's value is equal to 'historic'
                if key == 'historic':
                    # for the current row, append 'historic' to the 'category' column
                    new_df.loc[index, 'category'].append('historic')
                else:
                    # check the key's value from the row against the key's values from the 'categories' variable
                    for value in categories[key]:
                        # if there is a match
                        if value in row[key]:
                            # for the current row, append the key's value to the 'category' column
                            new_df.loc[index, 'category'].append(value) # fixed bug to add matched value only
                     
    return new_df


#### Making a new 'lat_lgt' column with coordinate tuples

In [6]:
"""Extracts lat and lon values for each row and adds them to a new 'lat_lgt' column as a tuple
This matches the format of lat/lon data in the routes dataset """
def extract_lat_lgt_tuple(df):
    # create a copy of df
    new_df = df
    # create a new empty column 'lat_lgt' with dtype object (otherwise tuples won't be accepted as values)
    df['lat_lgt'] = pd.Series(dtype='object')
    # for every row
    for index in new_df.index:
        lat = df.loc[index, 'lat']
        lgt = df.loc[index, 'lon']
        # add the lat/lgt pairs as a tuple to the new column
        new_df.at[index, 'lat_lgt'] = (lat, lgt)                    
    return new_df


## Creating the new dataframes

In [9]:
#'all_nodes' is the merged, uncleaned dataframe read in at the start of the file

all_nodes_extracted = extract_name(all_nodes)
all_nodes_extracted = extract_categories(all_nodes)
all_nodes_extracted = extract_lat_lgt_tuple(all_nodes)
all_nodes_extracted

Unnamed: 0,type,id,lat,lon,tags,name,category,lat_lgt
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",Bank für Sozialwirtschaft,[atm],"(52.5237445, 13.3986266)"
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br...",Sparda-Bank,[atm],"(52.5329853, 13.3842822)"
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le...",Bankhaus August Lenz,[atm],"(52.5180249, 13.4069563)"
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87...",,[atm],"(52.5421697, 13.4411367)"
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse...",Berliner Sparkasse,[atm],"(52.5427503, 13.3928618)"
...,...,...,...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'},,[viewpoint],"(52.4879893, 13.2753934)"
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'},,[viewpoint],"(52.506772, 13.3345627)"
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}",Alpengipfel,[viewpoint],"(52.4017038, 13.3669598)"
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}",,[viewpoint],"(52.4821326, 13.2919107)"


## Inspecting the full dataframe

#### Checking for double categories

In [11]:
double_categories = []
for index, value in enumerate(all_nodes_extracted.category):
    if len(value) > 1:
      
        double_categories.append(all_nodes_extracted.iloc[index])
        
print(len(double_categories))
print(double_categories)


65
[type                                                     node
id                                                  429741495
lat                                                 52.528762
lon                                                 13.408603
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                            Kaffee Burger
category                                      [bar, historic]
lat_lgt                               (52.528762, 13.4086025)
Name: 1041, dtype: object, type                                                     node
id                                                  746853061
lat                                                 52.580255
lon                                                 13.398819
tags        {'amenity': 'bar', 'internet_access': 'termina...
name                                         Babylon Café Bar
category                                   [bar, convenience]
lat_lgt                              (5

*There are 65 rows with multiple categories, all of which are instances of the desired scraped categories!*

#### Checking for null values

In [12]:
# checking for missing values
all_nodes_extracted.isna().sum()

type             0
id               0
lat              0
lon              0
tags             0
name        202443
category         0
lat_lgt          0
dtype: int64

*There are 202443 rows that don't have a name - this will need to be accounted for if names will be displayed in the final product*

#### Check the number of categories (double categories are excluded)

In [16]:
all_nodes_extracted['category'].value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[tree]                       182285
[bench]                       19024
[restaurant]                   4685
[cafe]                         2475
[atm]                          1011
[convenience]                   869
[bar]                           818
[picnic_table]                  434
[ice_cream]                     289
[viewpoint]                     256
[gallery]                       189
[museum]                        156
[drinking_water]                148
[nightclub]                     141
[attraction]                     94
[historic]                       72
[tree, attraction]               18
[cafe, convenience]              16
[historic, attraction]           13
[bench, viewpoint]                4
[bar, historic]                   2
[bench, picnic_table]             2
[historic, tree]                  2
[restaurant, convenience]         2
[bar, convenience]                2
[bench, tree]                     2
[bar, viewpoint]                  2
Name: category, dtype: int64

### Inspecting the cleaned dataset

In [17]:
all_nodes_extracted.sort_values(by='lat')

Unnamed: 0,type,id,lat,lon,tags,name,category,lat_lgt
25859,node,1767860179,52.049367,12.897805,"{'addr:city': 'Treuenbrietzen', 'addr:housenum...",Zur alten Eiche,[restaurant],"(52.0493672, 12.8978051)"
16458,node,7809140205,52.056699,12.890166,"{'amenity': 'bench', 'backrest': 'yes', 'mater...",,[bench],"(52.056699, 12.8901664)"
136263,node,4867051858,52.056718,12.947945,"{'leaf_cycle': 'deciduous', 'leaf_type': 'broa...",,[tree],"(52.0567179, 12.9479447)"
136261,node,4867051856,52.056942,12.947483,"{'leaf_cycle': 'deciduous', 'leaf_type': 'broa...",,[tree],"(52.0569422, 12.9474833)"
136262,node,4867051857,52.056995,12.947478,"{'leaf_cycle': 'deciduous', 'leaf_type': 'broa...",,[tree],"(52.056995, 12.947478)"
...,...,...,...,...,...,...,...,...
177138,node,6210317133,52.726197,13.504110,"{'leaf_cycle': 'deciduous', 'leaf_type': 'broa...",100,[tree],"(52.726197, 13.50411)"
118784,node,4069649001,52.738323,13.294171,"{'name': 'Kaisereiche', 'natural': 'tree'}",Kaisereiche,[tree],"(52.738323, 13.2941708)"
118785,node,4069649010,52.738795,13.290276,"{'name': 'Zareneiche', 'natural': 'tree'}",Zareneiche,[tree],"(52.7387954, 13.2902757)"
79524,node,3647555641,52.749586,13.289174,"{'description': 'markanter toter Baum', 'descr...",,[tree],"(52.7495864, 13.2891739)"


In [18]:
all_nodes_extracted.sort_values(by='lon')

Unnamed: 0,type,id,lat,lon,tags,name,category,lat_lgt
190098,node,8357706309,52.113896,12.813895,"{'leaf_cycle': 'deciduous', 'leaf_type': 'broa...",,[tree],"(52.1138959, 12.8138946)"
190097,node,8357706308,52.113984,12.814045,"{'leaf_cycle': 'deciduous', 'leaf_type': 'broa...",,[tree],"(52.1139844, 12.8140453)"
190099,node,8357706310,52.113648,12.814063,"{'leaf_cycle': 'deciduous', 'leaf_type': 'broa...",,[tree],"(52.1136476, 12.8140628)"
190096,node,8357706307,52.114133,12.814331,"{'leaf_cycle': 'deciduous', 'leaf_type': 'broa...",,[tree],"(52.1141327, 12.8143305)"
190100,node,8357706311,52.113413,12.814358,"{'leaf_cycle': 'deciduous', 'leaf_type': 'broa...",,[tree],"(52.1134134, 12.8143578)"
...,...,...,...,...,...,...,...,...
123335,node,4456998785,52.591379,13.973305,{'natural': 'tree'},,[tree],"(52.5913789, 13.973305)"
123331,node,4456986483,52.613533,13.974341,{'natural': 'tree'},,[tree],"(52.6135334, 13.9743414)"
123332,node,4456986498,52.607256,13.978784,{'natural': 'tree'},,[tree],"(52.6072561, 13.9787835)"
123336,node,4457068147,52.575151,13.986716,{'natural': 'tree'},,[tree],"(52.5751515, 13.9867155)"


*A lot of the data points are outside the boundary of Berlin!*

#### (Optional!) Function to filter non-Berlin POIs

Unsure how much computation this would save. Likely that we could leave this unfiltered, since all poi that do not match a route are unlikely to be kept at merging.

In [1]:
# function to filter most POIs falling outside Berlin boundaries
def berlin_only(df):
    
    new_df = df
    
    filt = new_df['lat'] > 52.3
    new_df = new_df.loc[filt]
    
    filt = new_df['lat'] < 52.7
    new_df = new_df.loc[filt]
    
    filt = new_df['lon'] > 12.9
    new_df = new_df.loc[filt]
    
    filt = new_df['lon'] < 13.8
    new_df = new_df.loc[filt]
    
    return new_df


In [21]:
all_nodes_extracted_berlin_only = berlin_only(all_nodes_extracted)
all_nodes_extracted_berlin_only

Unnamed: 0,type,id,lat,lon,tags,name,category,lat_lgt
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",Bank für Sozialwirtschaft,[atm],"(52.5237445, 13.3986266)"
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br...",Sparda-Bank,[atm],"(52.5329853, 13.3842822)"
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le...",Bankhaus August Lenz,[atm],"(52.5180249, 13.4069563)"
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87...",,[atm],"(52.5421697, 13.4411367)"
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse...",Berliner Sparkasse,[atm],"(52.5427503, 13.3928618)"
...,...,...,...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'},,[viewpoint],"(52.4879893, 13.2753934)"
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'},,[viewpoint],"(52.506772, 13.3345627)"
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}",Alpengipfel,[viewpoint],"(52.4017038, 13.3669598)"
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}",,[viewpoint],"(52.4821326, 13.2919107)"


In [22]:
213011-207531

5480

*The berlin_only filter with current parameters filters out 5480 POIs. This is likely to be negligible in terms of computational cost*

## Writing the cleaned dataframe

#### 1. Make a copy with desired columns

In [24]:
# Columns based on latest stand that geopandas requires lat and lon as separate features
# Dataframe is not currently filtered for lat/lon boundaries
cleaned_all_poi_data = all_nodes_extracted[['category', 'name', 'id', 'lat', 'lon']] 
cleaned_all_poi_data

Unnamed: 0,category,name,id,lat,lon
0,[atm],Bank für Sozialwirtschaft,78252154,52.523744,13.398627
1,[atm],Sparda-Bank,87036263,52.532985,13.384282
2,[atm],Bankhaus August Lenz,89275133,52.518025,13.406956
3,[atm],,213106623,52.542170,13.441137
4,[atm],Berliner Sparkasse,213113204,52.542750,13.392862
...,...,...,...,...,...
213006,[viewpoint],,8931299152,52.487989,13.275393
213007,[viewpoint],,9024702237,52.506772,13.334563
213008,[viewpoint],Alpengipfel,9026936271,52.401704,13.366960
213009,[viewpoint],,9038673666,52.482133,13.291911


#### 2. Write the cleaned dataset to file (choose json to retain list functionality)

In [25]:
cleaned_all_poi_data.to_csv('cleaned_all_poi_data.csv')
cleaned_all_poi_data.to_json('cleaned_all_poi_data.json')