# We first import all modules we will need

In [1]:
import time

import json
import requests

import pandas as pd

from shapely.geometry.polygon import Polygon
from shapely.geometry.point import Point

print("Import succesful!")

Import succesful!


# Setting up the Foursquare credentials

In [2]:
def set_foursquare_credentials():
    """
    set_foursquare_credentials()

    This function returns the client ID, the client secret and the Foursquare version.
    This is needed to get results from the Foursquare API.

    The client ID and client secret are read from the environment variables
    FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET, so they never have to be
    written into the notebook. If a variable is not set, None is returned in its place.

    Returns
    -------
    Tuple of str
        These are the credentials (CLIENT_ID, CLIENT_SECRET, VERSION) needed to make
        requests to the Foursquare API.

    """
    # Imported locally so that this cell does not depend on a change to the import cell.
    import os

    CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID')  # SECRET!
    CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET')  # SECRET!
    VERSION = '20180605'

    return CLIENT_ID, CLIENT_SECRET, VERSION

# Extracting venues in Hamburg

Unfortunately, we can extract at most 100 venues per request from the Foursquare API. We surround each area spanning a postal code by a rectangle. We split this rectangle into 16 smaller rectangles each with the same area and shape. For each of these smaller rectangles we send a request to the Foursquare API to return 100 venues.

In [5]:
def find_venues_Hamburg():
    """
    find_venues_Hamburg()

    This function returns all venues found by splitting the rectangle bounding each postal code
    area into a 4x4 grid of equally sized rectangles and using the explore function of the
    Foursquare API with a limit of 100 on each of these 16 rectangles.

    Returns
    -------
    DataFrame
        This dataframe consists of the name, latitude and longitude, category and venue id of
        each venue together with a tuple (i, j, k) specifying which rectangle it was found in:
        i is the position of the postal code area in the geojson file, j the latitudinal and
        k the longitudinal position of the rectangle in the grid.

    Raises
    ------
    requests.HTTPError
        If the Foursquare API answers a request with an HTTP error status.

    """

    LIMIT = 100
    GRID_SIZE = 4
    COLUMNS = ['Name', 'Latitude', 'Longitude', 'Category', 'Venue_Id', 'Rectangle']

    # We open the file containing the coordinates of the areas spanning the postal codes in Hamburg.
    with open("Postal_Codes_Coordinates_Hamburg.geojson") as file:
        base = json.load(file)['features']

    # The credentials are the same for every request, so we look them up only once.
    credentials = set_foursquare_credentials()

    # We collect one row per venue and build the dataframe once at the end.
    # Appending to a dataframe row by row copies the whole frame on every append.
    rows = []

    for i, feature in enumerate(base):

        coordinates = feature['geometry']['coordinates']

        # We calculate the latitude and longitude of the most south-western and most north-eastern points
        # of the rectangle surrounding the postal code area.
        lon_most_sw, lat_most_sw, lon_most_ne, lat_most_ne = Polygon(coordinates[0]).bounds

        # We calculate the latitudinal and longitudinal differences between neighbouring rectangles.
        lat_diff = (lat_most_ne - lat_most_sw)/GRID_SIZE
        lon_diff = (lon_most_ne - lon_most_sw)/GRID_SIZE

        for j in range(GRID_SIZE):
            for k in range(GRID_SIZE):
                coord_sw = (lat_most_sw + j*lat_diff, lon_most_sw + k*lon_diff)
                coord_ne = (lat_most_sw + (j+1)*lat_diff, lon_most_sw + (k+1)*lon_diff)

                url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&sw={},{}&ne={},{}&limit={}'.format(
                    *credentials,
                    *coord_sw,
                    *coord_ne,
                    LIMIT
                )

                # We stop on an HTTP error instead of failing later on a missing key.
                response = requests.get(url)
                response.raise_for_status()

                # We structure the results of the Foursquare API as a json file.
                results = response.json()
                root = results['response']['groups'][0]['items']

                # We extract the name, latitude, longitude, category name and venue id of each venue.
                for item in root:
                    venue = item['venue']

                    # We introduce a rectangle entry to keep track of which results belong to which rectangle.
                    rows.append([
                        venue['name'],
                        venue['location']['lat'],
                        venue['location']['lng'],
                        venue['categories'][0]['name'],
                        venue['id'],
                        (i, j, k)
                    ])

    return pd.DataFrame(rows, columns=COLUMNS)

In [6]:
# perf_counter is the clock intended for measuring elapsed time.
tic = time.perf_counter()
df = find_venues_Hamburg()
toc = time.perf_counter()

# We round to whole seconds before splitting, so the message can never read "... and 60 seconds".
minutes, seconds = divmod(round(toc - tic), 60)

print("Requesting the information from Foursquare took {:.0f} minutes and {:.0f} seconds.".format(minutes, seconds))

Requesting the information from Foursquare took 13 minutes and 10 seconds.


In [7]:
# Count how many venues were returned for each (i, j, k) rectangle.
df['Rectangle'].value_counts()

(11, 3, 1)    100
(11, 3, 2)    100
(31, 3, 3)    100
(83, 1, 3)     72
(83, 2, 0)     71
             ... 
(52, 2, 0)      1
(69, 3, 3)      1
(43, 3, 0)      1
(21, 0, 1)      1
(17, 2, 3)      1
Name: Rectangle, Length: 1492, dtype: int64

We see that there are rectangles where we hit the limit of 100 venues. To ensure we really extract all venues in Hamburg, we have to look at these rectangles with a higher resolution.

In [8]:
def find_venues_in_rectangle(i, j, k):
    """
    find_venues_in_rectangle(i, j, k)

    Given a rectangle as per the split in 4x4 rectangles of each area spanning a postal code,
    we split this rectangle into an additional 6x6 rectangles
    and send a request to the Foursquare API for each smaller rectangle.

    Parameters
    ----------
    i: int
        Position of the postal code area in 'Postal_Codes_Coordinates_Hamburg.geojson'.

    j: int
        Ranges from 0 to 3 and specifies the latitudinal position of the original rectangle.

    k: int
        Ranges from 0 to 3 and specifies the longitudinal position of the original rectangle.

    Returns
    -------
    DataFrame
        This dataframe contains the name, latitude, longitude, category and venue id of each venue
        together with tuples corresponding to the rectangle (i, j, k) and subrectangle (m, n)
        it was found in.

    Raises
    ------
    requests.HTTPError
        If the Foursquare API answers a request with an HTTP error status.

    """

    LIMIT = 100
    GRID_SIZE = 4
    SUBGRID_SIZE = 6
    COLUMNS = ['Name', 'Latitude', 'Longitude', 'Category', 'Venue_Id', 'Rectangle', 'Subrectangle']

    with open("Postal_Codes_Coordinates_Hamburg.geojson") as file:
        base = json.load(file)['features']

    coordinates = base[i]['geometry']['coordinates']

    # We calculate the latitude and longitude of the south-western and north-eastern most points of the original rectangle.
    lon_most_sw_original, lat_most_sw_original, lon_most_ne_original, lat_most_ne_original = Polygon(coordinates[0]).bounds

    # We calculate the latitudinal and longitudinal differences between neighbouring rectangles inside the original rectangle.
    lat_diff_original = (lat_most_ne_original - lat_most_sw_original)/GRID_SIZE
    lon_diff_original = (lon_most_ne_original - lon_most_sw_original)/GRID_SIZE

    # We calculate the coordinates of the rectangle specified by the (i, j, k) parameters.
    lat_most_sw = lat_most_sw_original + j*lat_diff_original
    lon_most_sw = lon_most_sw_original + k*lon_diff_original

    lat_most_ne = lat_most_sw_original + (j+1)*lat_diff_original
    lon_most_ne = lon_most_sw_original + (k+1)*lon_diff_original

    # We split this rectangle into 6x6 smaller rectangles.
    lat_diff = (lat_most_ne - lat_most_sw)/SUBGRID_SIZE
    lon_diff = (lon_most_ne - lon_most_sw)/SUBGRID_SIZE

    # The credentials are the same for every request, so we look them up only once.
    credentials = set_foursquare_credentials()

    # We collect one row per venue and build the dataframe once at the end.
    # Appending to a dataframe row by row copies the whole frame on every append.
    rows = []

    for m in range(SUBGRID_SIZE):
        for n in range(SUBGRID_SIZE):
            coord_sw = (lat_most_sw + m*lat_diff, lon_most_sw + n*lon_diff)
            coord_ne = (lat_most_sw + (m+1)*lat_diff, lon_most_sw + (n+1)*lon_diff)

            # The url is kept in a single string literal. A backslash continuation inside
            # the literal would make the indentation of the next line part of the url.
            url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&sw={},{}&ne={},{}&limit={}'.format(
                *credentials,
                *coord_sw,
                *coord_ne,
                LIMIT
            )

            # We stop on an HTTP error instead of failing later on a missing key.
            response = requests.get(url)
            response.raise_for_status()

            # We structure the results of the Foursquare API as a json file.
            results = response.json()
            root = results['response']['groups'][0]['items']

            # We extract the name, latitude, longitude, category name and venue id of each venue.
            for item in root:
                venue = item['venue']

                # We introduce rectangle and subrectangle entries to keep track of which results belong to which rectangle.
                rows.append([
                    venue['name'],
                    venue['location']['lat'],
                    venue['location']['lng'],
                    venue['categories'][0]['name'],
                    venue['id'],
                    (i, j, k),
                    (m, n)
                ])

    return pd.DataFrame(rows, columns=COLUMNS)

In [9]:
# We look again, with a higher resolution, at the three rectangles that returned 100 venues.
df_1, df_2, df_3 = [
    find_venues_in_rectangle(*rectangle)
    for rectangle in [(31, 3, 3), (11, 3, 1), (11, 3, 2)]
]

In [10]:
# For each refined rectangle, we print the largest number of venues found in a single subrectangle.
for refined_df in (df_1, df_2, df_3):
    venues_per_subrectangle = refined_df.groupby('Subrectangle').size()
    print(venues_per_subrectangle.max())

58
24
17


Because every subrectangle returned fewer than 100 venues, none of these requests hit the limit, so we can be certain that we have found all venues in Hamburg. Now we only have to put all of these dataframes together.

In [27]:
# For each refined rectangle, we list the venue ids that the original search found
# but the refined search did not. An empty list means that no venue was lost.
for rectangle, refined_df in [((31, 3, 3), df_1), ((11, 3, 1), df_2), ((11, 3, 2), df_3)]:
    coarse_ids = df.loc[df['Rectangle'] == rectangle, 'Venue_Id']

    # isin tests all ids at once instead of rebuilding a list for every single venue id.
    print(coarse_ids[~coarse_ids.isin(refined_df['Venue_Id'])].tolist())

[]
[]
[]


Indeed, we find that the dataframes df_1, df_2 and df_3 contain all the venues in the appropriate rectangles plus more. We can therefore be sure that we have looked at the three rectangles (31, 3, 3), (11, 3, 1) and (11, 3, 2) with a higher resolution and found all venues in Hamburg from the Foursquare API.

In [41]:
# We stack the original results and the three refined results into one dataframe.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# The 'Subrectangle' column only exists in the refined results, so we drop it first.
venue_df = pd.concat(
    [df] + [refined_df.drop(columns=['Subrectangle']) for refined_df in (df_1, df_2, df_3)],
    ignore_index=True
)

In [42]:
# Display the combined dataframe of the original and the refined results.
venue_df

Unnamed: 0,Name,Latitude,Longitude,Category,Venue_Id,Rectangle
0,Himmelsleiter,53.545188,9.901605,Trail,4c28eb679eb1952149a12959,"(0, 0, 0)"
1,Oevelgönner Seekiste,53.544703,9.905130,Museum,4c28eaab9eb195212aa12959,"(0, 0, 0)"
2,H Liebermannstraße,53.546030,9.898941,Bus Stop,51dc0e9b498e6d7db55df732,"(0, 0, 0)"
3,Hafenrundfahrt HADAG,53.542231,9.902829,Boat or Ferry,4b05885df964a52017c122e3,"(0, 0, 0)"
4,62,53.542080,9.903445,Boat or Ferry,56afdc4c498ec155d3b31b79,"(0, 0, 0)"
...,...,...,...,...,...,...
11620,ZEIT Café,53.549735,9.998512,Café,5915c31f86f4cc184336e318,"(11, 3, 2)"
11621,Bären-Treff,53.550794,9.997854,Candy Store,4b56e91ff964a520f01e28e3,"(11, 3, 2)"
11622,Dr. Martens Store HH,53.550603,9.999493,Shoe Store,4e999ab36c252a15c17fa804,"(11, 3, 2)"
11623,WMF,53.550690,9.999983,Kitchen Supply Store,4b3f47fef964a52030a525e3,"(11, 3, 2)"


In [43]:
# Save the combined venues as a csv file; index=False leaves the row index out of the file.
venue_df.to_csv('Venues_Hamburg.csv', index=False)

# Before we can use this data, we have to clean it

## Dropping duplicates

In [98]:
# Read the venues back from the csv file written above
# (presumably so that cleaning can start without repeating the requests).
venue_df = pd.read_csv('Venues_Hamburg.csv')

Because the rectangles bounding different postal codes overlap, some venues appear in the dataframe more than once. We have to ensure we are left with unique venues by dropping duplicates.

In [99]:
# The 'Rectangle' column differs between copies of the same venue, so it has to go
# before identical rows can be recognised and dropped.
venue_df = (
    venue_df
    .drop(columns=['Rectangle'])
    .drop_duplicates(ignore_index=True)
)

In [100]:
# Count the rows per venue id, largest count first; a largest count of 1 means every venue id is unique.
venue_df.groupby('Venue_Id').size().sort_values(ascending=False)

Venue_Id
5e81b55fc0fe2a0008f30981    1
4c25f908a852c928beb6e56c    1
4c233d5213c00f4720a988de    1
4c239a13905a0f473b635e60    1
4c246be0db5195215cf72b3a    1
                           ..
4fb75caee4b037af18e095c3    1
4fb78694e4b07562f76371e2    1
4fb795f2e4b07562f76901dc    1
4fb7d5a7e4b05a81c1eb5819    1
4adcda7af964a520ed4621e3    1
Length: 6024, dtype: int64

By looking at the venue id, we see that dropping duplicates left only one venue per venue id. We may therefore be certain that our dataframe does not contain duplicate venues anymore.

## Finding super categories for each venue

Foursquare comes with a four-leveled category structure which encompasses all categories in the dataframe returned by the request sent to the Foursquare API. For each venue category, we want to find its super categories. At the highest level, the following categories exist:
- Arts & Entertainment
- College & University
- Food
- Nightlife Spot
- Outdoors & Recreation
- Professional & Other Places
- Residence
- Shop & Service
- Travel & Transport

In [101]:
def get_category_structure():
    """
    get_category_structure()

    This function sends a request to the Foursquare API to get a json-file of the level structure
    of its categories and saves it as 'Category_Structure.json'.

    Raises
    ------
    requests.HTTPError
        If the Foursquare API answers with an HTTP error status. In that case no file is written.

    """
    url = 'https://api.foursquare.com/v2/venues/categories?&client_id={}&client_secret={}&v={}'.format(
        *set_foursquare_credentials()
    )

    response = requests.get(url)

    # We stop on an HTTP error so that an error message is never saved in place of the categories.
    response.raise_for_status()

    results = response.json()

    with open('Category_Structure.json', 'w') as file:
        json.dump(results, file)

In [102]:
# Cache holding the index built from the category file, keyed by the file's
# absolute path, modification time and size, so the file is parsed only when it changes.
_CATEGORY_INDEX_CACHE = {}


def _index_categories(nodes, ancestors, index, max_depth=4):
    """Record, depth first, the chain of category names leading to every category in nodes."""
    for node in nodes:
        chain = ancestors + [node['name']]

        # setdefault keeps the first occurrence if a name appears more than once in the tree.
        # Levels below the category itself are filled with the string 'None'.
        index.setdefault(node['name'], chain + ['None'] * (max_depth - len(chain)))

        if len(chain) < max_depth:
            _index_categories(node.get('categories', []), chain, index, max_depth)


def _load_category_index(path='Category_Structure.json'):
    """Return a dict mapping each category name to its chain of four category levels."""
    import os

    stamp = (os.path.abspath(path), os.path.getmtime(path), os.path.getsize(path))

    if stamp not in _CATEGORY_INDEX_CACHE:
        with open(path, 'r') as file:
            results = json.load(file)

        index = {}
        _index_categories(results['response']['categories'], [], index)

        # Only the index of the current version of the file is kept.
        _CATEGORY_INDEX_CACHE.clear()
        _CATEGORY_INDEX_CACHE[stamp] = index

    return _CATEGORY_INDEX_CACHE[stamp]


def find_super_categories(category_name):
    """
    find_super_categories(category_name)

    For a given venue category, this function returns all its supercategories.
    The category structure is read from the file 'Category_Structure.json'.

    Parameters
    ----------
    category_name: str
        Name of the venue category

    Returns
    -------
    list
        Contains four entries: the names of the categories from the highest level down to
        the given category, followed by the string 'None' for every remaining level.
        If the category is not found in the first four levels, None is returned instead of a list.

    """

    chain = _load_category_index().get(category_name)

    if chain is None:
        return None

    # We return a copy so that callers cannot change the cached index.
    return list(chain)

In [103]:
# We get the category structure from the Foursquare API and save it under 'Category_Structure.json'.
# This file has to exist before find_super_categories is called, because that function reads it.
get_category_structure()

In [104]:
# For every venue we look up the chain of its categories and spread it over the columns
# 'Category_L1' to 'Category_L4', where 'Category_L1' is one of the nine main categories.
venue_df['Supercategory'] = venue_df['Category'].apply(find_super_categories)

for level in range(4):
    column_name = 'Category_L{}'.format(level + 1)
    venue_df[column_name] = venue_df['Supercategory'].apply(lambda chain: list(chain)[level])

# The helper column and the original category are no longer needed.
venue_df = venue_df.drop(columns=['Supercategory', 'Category'])

In [105]:
# Display the dataframe with the four category level columns.
venue_df

Unnamed: 0,Name,Latitude,Longitude,Venue_Id,Category_L1,Category_L2,Category_L3,Category_L4
0,Himmelsleiter,53.545188,9.901605,4c28eb679eb1952149a12959,Outdoors & Recreation,Trail,,
1,Oevelgönner Seekiste,53.544703,9.905130,4c28eaab9eb195212aa12959,Arts & Entertainment,Museum,,
2,H Liebermannstraße,53.546030,9.898941,51dc0e9b498e6d7db55df732,Travel & Transport,Bus Stop,,
3,Hafenrundfahrt HADAG,53.542231,9.902829,4b05885df964a52017c122e3,Travel & Transport,Boat or Ferry,,
4,62,53.542080,9.903445,56afdc4c498ec155d3b31b79,Travel & Transport,Boat or Ferry,,
...,...,...,...,...,...,...,...,...
6019,Genuss Speicher,53.544611,9.996850,51d6c6f6498e460c4a3b0a46,Food,Coffee Shop,,
6020,Fleetschlösschen,53.544418,9.997967,4bbb2ad57421a5939f79c440,Nightlife Spot,Bar,,
6021,Happy Vietnam,53.545454,9.991364,4c9729d90341370428358aef,Food,Asian Restaurant,Vietnamese Restaurant,
6022,Wochenmarkt St. Katarienen,53.545402,9.993996,55115eeb498ef5b1e1f503f1,Shop & Service,Market,,


## Finally, we want to find the postal code associated with each venue

Although we recovered the latitude and longitude of each venue, we do not yet know the postal code associated with each venue.

In [106]:
# Cache holding the polygons built from the geojson file, keyed by the file's
# absolute path, modification time and size, so the file is parsed only when it changes.
_POSTAL_POLYGON_CACHE = {}


def _load_postal_polygons(path='Postal_Codes_Coordinates_Hamburg.geojson'):
    """Return a list of (postal code, Polygon) pairs, one for each feature in the geojson file."""
    import os

    stamp = (os.path.abspath(path), os.path.getmtime(path), os.path.getsize(path))

    if stamp not in _POSTAL_POLYGON_CACHE:
        with open(path) as file:
            geography = json.load(file)

        # Only the first coordinate ring of each feature is used to build the polygon.
        polygons = [
            (feature['properties']['plz'], Polygon(feature['geometry']['coordinates'][0]))
            for feature in geography['features']
        ]

        # Only the polygons of the current version of the file are kept.
        _POSTAL_POLYGON_CACHE.clear()
        _POSTAL_POLYGON_CACHE[stamp] = polygons

    return _POSTAL_POLYGON_CACHE[stamp]


def find_postal_code(latitude, longitude):
    """
    find_postal_code(latitude, longitude)

    Find the postal code (in Hamburg) of a given point from its latitudinal and longitudinal coordinates.
    The file 'Postal_Codes_Coordinates_Hamburg.geojson' is used for this calculation.

    Parameters
    ----------
    latitude: float
        The latitudinal coordinate of the point

    longitude: float
        The longitudinal coordinate of the point

    Returns
    -------
    The value of the 'plz' property of the first postal code area that contains the point,
    None if the point lies within none of the areas.

    """

    # The geojson coordinates are ordered (longitude, latitude), so the point is built the same way.
    point = Point([longitude, latitude])

    for postal_code, polygon in _load_postal_polygons():
        if point.within(polygon):
            return postal_code

    return None

In [107]:
# Look up the postal code of every venue from its coordinates, one row at a time.
# Venues that lie within none of the postal code areas get a missing value.
venue_df['Postal_Code'] = venue_df.apply(lambda x: find_postal_code(x['Latitude'], x['Longitude']), axis=1)

In [108]:
# Count the missing values in each column.
venue_df.isna().sum()

Name             0
Latitude         0
Longitude        0
Venue_Id         0
Category_L1      0
Category_L2      0
Category_L3      0
Category_L4      0
Postal_Code    253
dtype: int64

It is now clear that not all venues lie in one of the postal codes in Hamburg. These venues were found when we considered a rectangle bounding a postal code area on the outskirts of Hamburg, because such a rectangle also covers ground outside the city. We have to remove these venues.

In [109]:
# Remove every row that still has a missing value in any column.
venue_df = venue_df.dropna(axis=0, how='any')

In [110]:
# Display the cleaned dataframe.
venue_df

Unnamed: 0,Name,Latitude,Longitude,Venue_Id,Category_L1,Category_L2,Category_L3,Category_L4,Postal_Code
0,Himmelsleiter,53.545188,9.901605,4c28eb679eb1952149a12959,Outdoors & Recreation,Trail,,,22605
1,Oevelgönner Seekiste,53.544703,9.905130,4c28eaab9eb195212aa12959,Arts & Entertainment,Museum,,,22605
2,H Liebermannstraße,53.546030,9.898941,51dc0e9b498e6d7db55df732,Travel & Transport,Bus Stop,,,22605
3,Hafenrundfahrt HADAG,53.542231,9.902829,4b05885df964a52017c122e3,Travel & Transport,Boat or Ferry,,,22605
4,62,53.542080,9.903445,56afdc4c498ec155d3b31b79,Travel & Transport,Boat or Ferry,,,22605
...,...,...,...,...,...,...,...,...,...
6019,Genuss Speicher,53.544611,9.996850,51d6c6f6498e460c4a3b0a46,Food,Coffee Shop,,,20457
6020,Fleetschlösschen,53.544418,9.997967,4bbb2ad57421a5939f79c440,Nightlife Spot,Bar,,,20457
6021,Happy Vietnam,53.545454,9.991364,4c9729d90341370428358aef,Food,Asian Restaurant,Vietnamese Restaurant,,20457
6022,Wochenmarkt St. Katarienen,53.545402,9.993996,55115eeb498ef5b1e1f503f1,Shop & Service,Market,,,20457


In [111]:
# Having cleaned the data, we save the new dataframe as a csv file.
# index=False leaves the row index out of the file.
venue_df.to_csv('Venues_Hamburg_Clean.csv', index=False)