In [1]:
import pandas as pd
import numpy as np
import json
import glob

In [14]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

# Cleaning Points of Interest data

#### Read in uncleaned Points of Interest data and merge into a single pandas dataframe:

In [2]:
#this code assumes you have the raw data in a folder titled 'poi_data', which should be in the same folder as this file
all_node_files = glob.glob('poi_data/' + '/*node.json')

li = []

for filename in all_node_files:
    data = json.load(open(filename))
    df = pd.DataFrame(data["elements"])
    li.append(df)

all_nodes = pd.concat(li, axis=0, ignore_index=True)

#### Exploring the data (first pass):

In [7]:
all_nodes.dropna()

Unnamed: 0,type,id,lat,lon,tags
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ..."
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br..."
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le..."
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87..."
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse..."
...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'}
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'}
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}"
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}"


There are no missing values

## Extracting relevant features from 'tags' attribute

#### Extracting the name of each POI and adding it as a separate feature 

In [3]:
# extracting the 'name' data and adding it to a new 'name' column 
for i, dict in enumerate(all_nodes.tags):
    try:
        all_nodes.loc[i, 'name'] = dict['name']
    except:
        pass

all_nodes

Unnamed: 0,type,id,lat,lon,tags,name
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",Bank für Sozialwirtschaft
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br...",Sparda-Bank
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le...",Bankhaus August Lenz
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87...",
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse...",Berliner Sparkasse
...,...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'},
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'},
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}",Alpengipfel
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}",


# Category extraction

### Extracting relevant categories

In [23]:
# dictionary of category key/value pairs that were scraped
categories = {'amenity': ['atm',
                          'bar',
                          'bench',
                          'cafe',
                          'drinking_water',
                          'ice_cream',
                          'nightclub',
                          'restaurant'
                         ],
              
              'historic': 'yes',
              'leisure': 'picnic_table',
              'natural': 'tree',
              'shop':'convenience',
              'tourism': ['art_work',
                          'attraction',
                          'gallery',
                          'museum',
                          'viewpoint',
                          ]
              }


# Version 4

In [4]:
"""
Function to extract categories from the 'tags' column of any version of the dataframe (must be parsed in as argument)

It contains the dictionary and loop from earlier category extraction.

A 'tags' column in its original raw form is a prerequisite. 
Pandas must be imported as pd and numpy as np.

Looping through keys should be enough, so I have added the extra lines back in.

I have included a line that will record 'historic: yes' categories as 'historic' rather than 'yes'.

"""

def extract_categories(df):
    
    # make a copy of the dataframe
    new_df = df
    
    # create a new 'category' column / fill it with empty lists
    new_df['category'] = np.empty((len(new_df), 0)).tolist()
    
    # dictionary of category key/value pairs that were scraped
    categories = {'amenity': ['atm',
                          'bar',
                          'bench',
                          'cafe',
                          'drinking_water',
                          'ice_cream',
                          'nightclub',
                          'restaurant'
                         ],
              
              'historic': ['yes'], # fixed bug to put all values into lists to avoid looping on strings
              'leisure': ['picnic_table'],
              'natural': ['tree'],
              'shop': ['convenience'],
              'tourism': ['art_work',
                          'attraction',
                          'gallery',
                          'museum',
                          'viewpoint',
                          ] 
              }

    # for the new df copy, loop through the 'tags' column, keeping track of the row index
    for index, row in enumerate(new_df.tags):        
        # for each row, loop through every key in the 'categories' dictionary
        for key in categories:
            # if the row contains a matching key
            if key in row:
                # if the key's value is equal to 'historic'
                if key == 'historic':
                    # for the current row, append 'historic' to the 'category' column
                    new_df.loc[index, 'category'].append('historic')
                else:
                    # check the key's value from the row against the key's values from the 'categories' variable
                    for value in categories[key]:
                        # if there is a match
                        if value in row[key]:
                            # for the current row, append the key's value to the 'category' column
                            new_df.loc[index, 'category'].append(value) # fixed bug to add matched value only
                     
    return new_df


In [73]:
version4 = extract_categories(all_nodes)
all_nodes

Unnamed: 0,type,id,lat,lon,tags,name,category
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",Bank für Sozialwirtschaft,[atm]
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br...",Sparda-Bank,[atm]
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le...",Bankhaus August Lenz,[atm]
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87...",,[atm]
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse...",Berliner Sparkasse,[atm]
...,...,...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'},,[viewpoint]
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'},,[viewpoint]
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}",Alpengipfel,[viewpoint]
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}",,[viewpoint]


In [74]:
double_categories = []
for index, value in enumerate(version.category):
    if len(value) > 1:
      
        double_categories.append(version.iloc[index])
        
print(double_categories)
len(double_categories)

[type                                                     node
id                                                  429741495
lat                                                 52.528762
lon                                                 13.408603
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                            Kaffee Burger
category                                      [bar, historic]
Name: 1041, dtype: object, type                                                     node
id                                                  746853061
lat                                                 52.580255
lon                                                 13.398819
tags        {'amenity': 'bar', 'internet_access': 'termina...
name                                         Babylon Café Bar
category                                   [bar, convenience]
Name: 1098, dtype: object, type                                                     node
id             

65

There are 65 rows with multiple categories, all of which are from the 'categories' dictionary (scraped categories)! 

In [75]:
version4.iloc[1041]

type                                                     node
id                                                  429741495
lat                                                 52.528762
lon                                                 13.408603
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                            Kaffee Burger
category                                      [bar, historic]
Name: 1041, dtype: object

In [76]:
version4.iloc[1083]

type                                                     node
id                                                  667858502
lat                                                 52.503265
lon                                                 13.412501
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                            Hürrem Sultan
category                                                [bar]
Name: 1083, dtype: object

In [77]:
version4.iloc[1303]

type                                                     node
id                                                 2541047698
lat                                                 52.470828
lon                                                 13.441197
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                               Café & Bar
category                                                [bar]
Name: 1303, dtype: object

Rows that previously contained undesired categories now display desired categories only!

In [5]:
# Creating the updated dataframe with categories
all_nodes_extracted = extract_categories(all_nodes)
all_nodes_extracted

Unnamed: 0,type,id,lat,lon,tags,name,category
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",Bank für Sozialwirtschaft,[atm]
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br...",Sparda-Bank,[atm]
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le...",Bankhaus August Lenz,[atm]
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87...",,[atm]
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse...",Berliner Sparkasse,[atm]
...,...,...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'},,[viewpoint]
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'},,[viewpoint]
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}",Alpengipfel,[viewpoint]
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}",,[viewpoint]


### Making a new 'lat_lgt' column with coordinate tuples

In [63]:
"""Extracts lat and lon values for each row and adds them to a new 'lat_lgt' column as a tuple
This matches the format of lat/lon data in the routes dataset """
def add_lat_lgt_tuple(df):
    # create a copy of df
    new_df = df
    # create a new empty column 'lat_lgt' with dtype object (otherwise tuples won't be accepted as values)
    df['lat_lgt'] = pd.Series(dtype='object')
    # for every row
    for index in new_df.index:
        lat = df.loc[index, 'lat']
        lgt = df.loc[index, 'lon']
        # add the lat/lgt pairs as a tuple to the new column
        new_df.at[index, 'lat_lgt'] = (lat, lgt)                    
    return new_df


In [64]:
all_nodes_extracted_lat_lgt = add_lat_lgt_tuple(all_nodes_extracted)

In [65]:
all_nodes_extracted_lat_lgt

Unnamed: 0,type,id,lat,lon,tags,name,category,lat_lgt
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",Bank für Sozialwirtschaft,[atm],"(52.5237445, 13.3986266)"
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br...",Sparda-Bank,[atm],"(52.5329853, 13.3842822)"
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le...",Bankhaus August Lenz,[atm],"(52.5180249, 13.4069563)"
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87...",,[atm],"(52.5421697, 13.4411367)"
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse...",Berliner Sparkasse,[atm],"(52.5427503, 13.3928618)"
...,...,...,...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'},,[viewpoint],"(52.4879893, 13.2753934)"
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'},,[viewpoint],"(52.506772, 13.3345627)"
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}",Alpengipfel,[viewpoint],"(52.4017038, 13.3669598)"
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}",,[viewpoint],"(52.4821326, 13.2919107)"


In [66]:
# checking for missing values
all_nodes_extracted_lat_lgt['lat_lgt'].isna().sum()

0

### Making a new clean copy with useful columns only

In [67]:
# Making a copy of the dataset with needed columns only
cleaned_all_poi_data = all_nodes_extracted_lat_lgt[['lat_lgt', 'category', 'name', 'id']]
cleaned_all_poi_data

Unnamed: 0,lat_lgt,category,name,id
0,"(52.5237445, 13.3986266)",[atm],Bank für Sozialwirtschaft,78252154
1,"(52.5329853, 13.3842822)",[atm],Sparda-Bank,87036263
2,"(52.5180249, 13.4069563)",[atm],Bankhaus August Lenz,89275133
3,"(52.5421697, 13.4411367)",[atm],,213106623
4,"(52.5427503, 13.3928618)",[atm],Berliner Sparkasse,213113204
...,...,...,...,...
213006,"(52.4879893, 13.2753934)",[viewpoint],,8931299152
213007,"(52.506772, 13.3345627)",[viewpoint],,9024702237
213008,"(52.4017038, 13.3669598)",[viewpoint],Alpengipfel,9026936271
213009,"(52.4821326, 13.2919107)",[viewpoint],,9038673666


### Writing the cleaned dataset to file

In [74]:
cleaned_all_poi_data.to_csv('cleaned_all_poi_data.csv')
cleaned_all_poi_data.to_json('cleaned_all_poi_data.json')

### Inspecting the cleaned dataset

In [73]:
all_nodes_cleaned['category'].value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[tree]                       182285
[bench]                       19024
[restaurant]                   4685
[cafe]                         2475
[atm]                          1011
[convenience]                   869
[bar]                           818
[picnic_table]                  434
[ice_cream]                     289
[viewpoint]                     256
[gallery]                       189
[museum]                        156
[drinking_water]                148
[nightclub]                     141
[attraction]                     94
[historic]                       72
[tree, attraction]               18
[cafe, convenience]              16
[historic, attraction]           13
[bench, viewpoint]                4
[bar, historic]                   2
[bench, picnic_table]             2
[historic, tree]                  2
[restaurant, convenience]         2
[bar, convenience]                2
[bench, tree]                     2
[bar, viewpoint]                  2
Name: category, dtype: int64