In [3]:
import pandas as pd
import numpy as np
import json
import glob

# Cleaning Points of Interest data

#### Read in uncleaned Points of Interest data and merge into a single pandas dataframe:

In [6]:
#this code assumes you have the raw data in a folder titled 'poi_data', which should be in the same folder as this file
all_node_files = glob.glob('poi_data/' + '/*node.json')

li = []

for filename in all_node_files:
    data = json.load(open(filename))
    df = pd.DataFrame(data["elements"])
    li.append(df)

all_nodes = pd.concat(li, axis=0, ignore_index=True)

#### Exploring the data with basic cleaning methods (first pass):

In [7]:
all_nodes.dropna()

Unnamed: 0,type,id,lat,lon,tags
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ..."
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br..."
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le..."
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87..."
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse..."
...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'}
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'}
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}"
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}"


There are no missing values

## Extracting relevant features from 'tags' attribute

#### Extracting the name of each POI and adding it as a separate feature 

In [8]:
# extracting the 'name' data and adding it to a new 'name' column 
for i, dict in enumerate(all_nodes.tags):
    try:
        all_nodes.loc[i, 'name'] = dict['name']
    except:
        pass

all_nodes

Unnamed: 0,type,id,lat,lon,tags,name
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",Bank für Sozialwirtschaft
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br...",Sparda-Bank
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le...",Bankhaus August Lenz
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87...",
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse...",Berliner Sparkasse
...,...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'},
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'},
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}",Alpengipfel
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}",


### Extracting relevant categories

In [23]:
# dictionary of category key/value pairs that were scraped
categories = {'amenity': ['atm',
                          'bar',
                          'bench',
                          'cafe',
                          'drinking_water',
                          'ice_cream',
                          'nightclub',
                          'restaurant'
                         ],
              
              'historic': 'yes',
              'leisure': 'picnic_table',
              'natural': 'tree',
              'shop':'convenience',
              'tourism': ['art_work',
                          'attraction',
                          'gallery',
                          'museum',
                          'viewpoint',
                          ]
              }


In [32]:
# loop through every row in all_nodes.tags, keeping track of the index
for index, row in enumerate(all_nodes.tags):
    # for each row, loop through every key from the 'categories' variable 
    for key in categories:
        # if the row contains a matching key    
        if key in row:
            # check the key's value from the row against the key's values from the 'categories' variable
            for value in categories[key]:
                # if these check out
                if value in row[key]:
                    # for the current row, add the value to a new 'category' column
                    all_nodes.loc[index, 'category'] = row[key]            

## Inspecting the data

In [14]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

In [33]:
all_nodes.category

0               atm
1               atm
2               atm
3               atm
4               atm
            ...    
213006    viewpoint
213007    viewpoint
213008    viewpoint
213009    viewpoint
213010    viewpoint
Name: category, Length: 213011, dtype: object

In [38]:
all_nodes.loc[150000]

type                                                     node
id                                                 5322155730
lat                                                 52.475372
lon                                                 13.289718
tags        {'leaf_cycle': 'deciduous', 'leaf_type': 'broa...
name                                                      NaN
category                                                 tree
Name: 150000, dtype: object

#### Writing the updated dataframe to csv and json  

In [39]:
all_nodes.to_json('all_nodes_with_categories.json')

In [40]:
all_nodes.to_csv('all_nodes_with_categories.csv')

# Continue from here
Reading in all_nodes_with_categories

In [4]:
all_nodes_with_categories = pd.read_json('all_nodes_with_categories.json')

In [15]:
all_nodes_with_categories

Unnamed: 0,type,id,lat,lon,tags,name,category
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",Bank für Sozialwirtschaft,atm
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br...",Sparda-Bank,atm
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le...",Bankhaus August Lenz,atm
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87...",,atm
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse...",Berliner Sparkasse,atm
...,...,...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'},,viewpoint
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'},,viewpoint
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}",Alpengipfel,viewpoint
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}",,viewpoint


In [19]:
all_nodes_with_categories.name.isna().sum()

202443

202443 missing values in 'name'

In [20]:
all_nodes_with_categories.category.isna().sum()

0

No missing values in 'category'

# Category extraction rerun

I realised that if any rows had multiple categories, this would not have been capture by the previous code. Therefore I decided to edit the code and rerun the category extraction.

# Version 2

In [43]:
"""
Function to extract categories from the 'tags' column of any version of the dataframe (must be parsed in as argument)

It contains the dictionary and loop from earlier category extraction.

A 'tags' column in its original raw form is a prerequisite. 
Pandas must be imported as pd and numpy as np.

To reduce computational cost, I have removed lines from the loop looping for the specific category value, 
since the data should contain only values that were scraped for. 
Looping through keys should be enough.

I have also added a line that will record 'historic: yes' categories as 'historic' rather than 'yes'.

"""

def extract_categories(df):
    
    # make a copy of the dataframe
    new_df = df
    
    # create a new 'category' column / fill it with empty lists
    new_df['category'] = np.empty((len(new_df), 0)).tolist()
    
    # dictionary of category key/value pairs that were scraped
    categories = {'amenity': ['atm',
                          'bar',
                          'bench',
                          'cafe',
                          'drinking_water',
                          'ice_cream',
                          'nightclub',
                          'restaurant'
                         ],
              
              'historic': 'yes',
              'leisure': 'picnic_table',
              'natural': 'tree',
              'shop':'convenience',
              'tourism': ['art_work',
                          'attraction',
                          'gallery',
                          'museum',
                          'viewpoint',
                          ]
              }

    # for the new df copy, loop through the 'tags' column, keeping track of the row index
    for index, row in enumerate(new_df.tags):        
        # for each row, loop through every key from the 'categories' variable
        for key in categories:
            # if the row contains a matching key
            if key in row:
                # if the key's value is equal to 'historic'
                if key == 'historic':
                    # for the current row, append 'historic' to the 'category' column
                    new_df.loc[index, 'category'].append('historic')
                else:
                    # for the current row, append the key's value to the 'category' column
                    new_df.loc[index, 'category'].append(row[key])
 
    return new_df


In [44]:
version2 = extract_categories(all_nodes)
version2

Unnamed: 0,type,id,lat,lon,tags,name,category
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",Bank für Sozialwirtschaft,[atm]
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br...",Sparda-Bank,[atm]
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le...",Bankhaus August Lenz,[atm]
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87...",,[atm]
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse...",Berliner Sparkasse,[atm]
...,...,...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'},,[viewpoint]
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'},,[viewpoint]
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}",Alpengipfel,[viewpoint]
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}",,[viewpoint]


In [45]:
version1 = all_nodes_with_categories
version1

Unnamed: 0,type,id,lat,lon,tags,name,category
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",Bank für Sozialwirtschaft,atm
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br...",Sparda-Bank,atm
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le...",Bankhaus August Lenz,atm
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87...",,atm
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse...",Berliner Sparkasse,atm
...,...,...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'},,viewpoint
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'},,viewpoint
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}",Alpengipfel,viewpoint
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}",,viewpoint


In [50]:
double_categories = []
for index, value in enumerate(version2.category):
    if len(value) > 1:
        
        double_categories.append(version2.iloc[index])
        
print(double_categories)
len(double_categories)

[type                                                     node
id                                                  429741495
lat                                                 52.528762
lon                                                 13.408603
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                            Kaffee Burger
category                                      [bar, historic]
Name: 1041, dtype: object, type                                                     node
id                                                  667858502
lat                                                 52.503265
lon                                                 13.412501
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                            Hürrem Sultan
category                                        [bar, tailor]
Name: 1083, dtype: object, type                                                     node
id             

310

'version2' contains 310 entries with double categories. Some of these categories are not from the scraped categories.

# Version 3

In [61]:
"""
Function to extract categories from the 'tags' column of any version of the dataframe (must be parsed in as argument)

It contains the dictionary and loop from earlier category extraction.

A 'tags' column in its original raw form is a prerequisite. 
Pandas must be imported as pd and numpy as np.

Looping through keys should be enough, so I have added the extra lines back in.

I have included a line that will record 'historic: yes' categories as 'historic' rather than 'yes'.

"""

def extract_categories(df):
    
    # make a copy of the dataframe
    new_df = df
    
    # create a new 'category' column / fill it with empty lists
    new_df['category'] = np.empty((len(new_df), 0)).tolist()
    
    # dictionary of category key/value pairs that were scraped
    categories = {'amenity': ['atm',
                          'bar',
                          'bench',
                          'cafe',
                          'drinking_water',
                          'ice_cream',
                          'nightclub',
                          'restaurant'
                         ],
              
              'historic': 'yes',
              'leisure': 'picnic_table',
              'natural': 'tree',
              'shop':'convenience',
              'tourism': ['art_work',
                          'attraction',
                          'gallery',
                          'museum',
                          'viewpoint',
                          ]
              }

    # for the new df copy, loop through the 'tags' column, keeping track of the row index
    for index, row in enumerate(new_df.tags):        
        # for each row, loop through every key in the 'categories' dictionary
        for key in categories:
            # if the row contains a matching key
            if key in row:
                # if the key's value is equal to 'historic'
                if key == 'historic':
                    # for the current row, append 'historic' to the 'category' column
                    new_df.loc[index, 'category'].append('historic')
                else:
                    # check the key's value from the row against the key's values from the 'categories' variable
                    for value in categories[key]:
                        # if there is a match
                        if value in row[key]:
                            # for the current row, append the key's value to the 'category' column
                            new_df.loc[index, 'category'].append(row[key])
                     
    return new_df


In [None]:
# loop through every row in all_nodes.tags, keeping track of the index
for index, row in enumerate(all_nodes.tags):
    # for each row, loop through every key from the 'categories' variable 
    for key in categories:
        # if the row contains a matching key    
        if key in row:
            # check the key's value from the row against the key's values from the 'categories' variable
            for value in categories[key]:
                # if these check out
                if value in row[key]:
                    # for the current row, add the value to a new 'category' column
                    all_nodes.loc[index, 'category'] = row[key]     

In [62]:
version3 = extract_categories(all_nodes)
version3

Unnamed: 0,type,id,lat,lon,tags,name,category
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",Bank für Sozialwirtschaft,[atm]
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br...",Sparda-Bank,[atm]
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le...",Bankhaus August Lenz,[atm]
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87...",,[atm]
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse...",Berliner Sparkasse,[atm]
...,...,...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'},,[viewpoint]
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'},,[viewpoint]
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}",Alpengipfel,[viewpoint]
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}",,[viewpoint]


In [56]:
double_categories2 = []
for index, value in enumerate(version3.category):
    if len(value) > 1:
      
        double_categories2.append(version3.iloc[index])
        
print(double_categories2)
len(double_categories2)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



183816

In [63]:
version3.iloc[1041]

type                                                     node
id                                                  429741495
lat                                                 52.528762
lon                                                 13.408603
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                            Kaffee Burger
category                                      [bar, historic]
Name: 1041, dtype: object

In [64]:
version3.iloc[1083]

type                                                     node
id                                                  667858502
lat                                                 52.503265
lon                                                 13.412501
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                            Hürrem Sultan
category                                [bar, tailor, tailor]
Name: 1083, dtype: object

In [65]:
version3.iloc[1303]

type                                                     node
id                                                 2541047698
lat                                                 52.470828
lon                                                 13.441197
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                               Café & Bar
category    [bar, adult_gaming_center, adult_gaming_center...
Name: 1303, dtype: object

# Version 4

In [72]:
"""
Function to extract categories from the 'tags' column of any version of the dataframe (must be parsed in as argument)

It contains the dictionary and loop from earlier category extraction.

A 'tags' column in its original raw form is a prerequisite. 
Pandas must be imported as pd and numpy as np.

Looping through keys should be enough, so I have added the extra lines back in.

I have included a line that will record 'historic: yes' categories as 'historic' rather than 'yes'.

"""

def extract_categories4(df):
    
    # make a copy of the dataframe
    new_df = df
    
    # create a new 'category' column / fill it with empty lists
    new_df['category'] = np.empty((len(new_df), 0)).tolist()
    
    # dictionary of category key/value pairs that were scraped
    categories = {'amenity': ['atm',
                          'bar',
                          'bench',
                          'cafe',
                          'drinking_water',
                          'ice_cream',
                          'nightclub',
                          'restaurant'
                         ],
              
              'historic': ['yes'], # fixed bug to put all values into lists to avoid looping on strings
              'leisure': ['picnic_table'],
              'natural': ['tree'],
              'shop': ['convenience'],
              'tourism': ['art_work',
                          'attraction',
                          'gallery',
                          'museum',
                          'viewpoint',
                          ] 
              }

    # for the new df copy, loop through the 'tags' column, keeping track of the row index
    for index, row in enumerate(new_df.tags):        
        # for each row, loop through every key in the 'categories' dictionary
        for key in categories:
            # if the row contains a matching key
            if key in row:
                # if the key's value is equal to 'historic'
                if key == 'historic':
                    # for the current row, append 'historic' to the 'category' column
                    new_df.loc[index, 'category'].append('historic')
                else:
                    # check the key's value from the row against the key's values from the 'categories' variable
                    for value in categories[key]:
                        # if there is a match
                        if value in row[key]:
                            # for the current row, append the key's value to the 'category' column
                            new_df.loc[index, 'category'].append(value) # fixed bug to add matched value only
                     
    return new_df


In [73]:
version4 = extract_categories4(all_nodes)
all_nodes

Unnamed: 0,type,id,lat,lon,tags,name,category
0,node,78252154,52.523744,13.398627,"{'addr:city': 'Berlin', 'addr:country': 'DE', ...",Bank für Sozialwirtschaft,[atm]
1,node,87036263,52.532985,13.384282,"{'amenity': 'atm', 'brand': 'Sparda-Bank', 'br...",Sparda-Bank,[atm]
2,node,89275133,52.518025,13.406956,"{'amenity': 'atm', 'name': 'Bankhaus August Le...",Bankhaus August Lenz,[atm]
3,node,213106623,52.542170,13.441137,"{'addr:country': 'DE', 'addr:housenumber': '87...",,[atm]
4,node,213113204,52.542750,13.392862,"{'amenity': 'atm', 'name': 'Berliner Sparkasse...",Berliner Sparkasse,[atm]
...,...,...,...,...,...,...,...
213006,node,8931299152,52.487989,13.275393,{'tourism': 'viewpoint'},,[viewpoint]
213007,node,9024702237,52.506772,13.334563,{'tourism': 'viewpoint'},,[viewpoint]
213008,node,9026936271,52.401704,13.366960,"{'name': 'Alpengipfel', 'tourism': 'viewpoint'}",Alpengipfel,[viewpoint]
213009,node,9038673666,52.482133,13.291911,"{'direction': 'E', 'tourism': 'viewpoint'}",,[viewpoint]


In [74]:
double_categories4 = []
for index, value in enumerate(version4.category):
    if len(value) > 1:
      
        double_categories4.append(version4.iloc[index])
        
print(double_categories4)
len(double_categories4)

[type                                                     node
id                                                  429741495
lat                                                 52.528762
lon                                                 13.408603
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                            Kaffee Burger
category                                      [bar, historic]
Name: 1041, dtype: object, type                                                     node
id                                                  746853061
lat                                                 52.580255
lon                                                 13.398819
tags        {'amenity': 'bar', 'internet_access': 'termina...
name                                         Babylon Café Bar
category                                   [bar, convenience]
Name: 1098, dtype: object, type                                                     node
id             

65

In [75]:
version4.iloc[1041]

type                                                     node
id                                                  429741495
lat                                                 52.528762
lon                                                 13.408603
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                            Kaffee Burger
category                                      [bar, historic]
Name: 1041, dtype: object

In [76]:
version4.iloc[1083]

type                                                     node
id                                                  667858502
lat                                                 52.503265
lon                                                 13.412501
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                            Hürrem Sultan
category                                                [bar]
Name: 1083, dtype: object

In [77]:
version4.iloc[1303]

type                                                     node
id                                                 2541047698
lat                                                 52.470828
lon                                                 13.441197
tags        {'addr:city': 'Berlin', 'addr:country': 'DE', ...
name                                               Café & Bar
category                                                [bar]
Name: 1303, dtype: object