In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString

Joining Datasets

The goal of training models for classifying routes on the points of interest they pass by in sight, we had first to match our cleaned routes data with our cleaned points of interest (poi) data. We chose a spacial join from the geopandas library for matching on the geometry of both datasets. 

Of cause, this being a study project, we did this for only 451 scraped routes from wandermap.net and a (satisfyingly large) subset of 18 types of poi scraped from openstreetmap.org. 

In [2]:
#reading in the routes and poi data
routes = pd.read_csv('cleaned_routes_data/cleaned_all_routes_data_long.csv')
poi = pd.read_csv('cleaned_all_poi_data.csv').iloc[0:5000, :] #using smaller sample for experimenting

In [3]:
#converting routes df into a GeoDataFrame
gdf_routes = gpd.GeoDataFrame(routes, geometry=gpd.points_from_xy(routes.latitude, routes.longitude))
gdf_routes.drop(['Unnamed: 0', 'lat_lgt'], axis=1, inplace=True)
gdf_routes

Unnamed: 0,route_id,num_of_waypoint,latitude,longitude,geometry
0,1005019,0,52.50607,13.33208,POINT (52.50607 13.33208)
1,1005019,1,52.50553,13.33163,POINT (52.50553 13.33163)
2,1005019,2,52.50525,13.33148,POINT (52.50525 13.33148)
3,1005019,3,52.50515,13.33337,POINT (52.50515 13.33337)
4,1005019,4,52.50520,13.33366,POINT (52.50520 13.33366)
...,...,...,...,...,...
186229,933359,151,52.50444,13.38246,POINT (52.50444 13.38246)
186230,933359,152,52.50525,13.38633,POINT (52.50525 13.38633)
186231,933359,153,52.50643,13.38615,POINT (52.50643 13.38615)
186232,933359,154,52.50648,13.39023,POINT (52.50648 13.39023)


In [4]:
#converting points from routes data to linestring and adding these into a new column
gdf_routes['route_linestring'] = gdf_routes['route_id'].map(gdf_routes.groupby(['route_id'])['geometry'].apply(lambda x: LineString(x.tolist())))
gdf_routes

Unnamed: 0,route_id,num_of_waypoint,latitude,longitude,geometry,route_linestring
0,1005019,0,52.50607,13.33208,POINT (52.50607 13.33208),"LINESTRING (52.50607 13.33208, 52.50553 13.331..."
1,1005019,1,52.50553,13.33163,POINT (52.50553 13.33163),"LINESTRING (52.50607 13.33208, 52.50553 13.331..."
2,1005019,2,52.50525,13.33148,POINT (52.50525 13.33148),"LINESTRING (52.50607 13.33208, 52.50553 13.331..."
3,1005019,3,52.50515,13.33337,POINT (52.50515 13.33337),"LINESTRING (52.50607 13.33208, 52.50553 13.331..."
4,1005019,4,52.50520,13.33366,POINT (52.50520 13.33366),"LINESTRING (52.50607 13.33208, 52.50553 13.331..."
...,...,...,...,...,...,...
186229,933359,151,52.50444,13.38246,POINT (52.50444 13.38246),"LINESTRING (52.50157 13.49061, 52.50300 13.484..."
186230,933359,152,52.50525,13.38633,POINT (52.50525 13.38633),"LINESTRING (52.50157 13.49061, 52.50300 13.484..."
186231,933359,153,52.50643,13.38615,POINT (52.50643 13.38615),"LINESTRING (52.50157 13.49061, 52.50300 13.484..."
186232,933359,154,52.50648,13.39023,POINT (52.50648 13.39023),"LINESTRING (52.50157 13.49061, 52.50300 13.484..."


In [5]:
#converting poi df into a GeoDataFrame
gdf_poi = gpd.GeoDataFrame(poi, geometry=gpd.points_from_xy(poi.lat, poi.lon))
gdf_poi

Unnamed: 0.1,Unnamed: 0,category,name,id,lat,lon,geometry
0,0,['atm'],Bank für Sozialwirtschaft,78252154,52.523744,13.398627,POINT (52.52374 13.39863)
1,1,['atm'],Sparda-Bank,87036263,52.532985,13.384282,POINT (52.53299 13.38428)
2,2,['atm'],Bankhaus August Lenz,89275133,52.518025,13.406956,POINT (52.51802 13.40696)
3,3,['atm'],,213106623,52.542170,13.441137,POINT (52.54217 13.44114)
4,4,['atm'],Berliner Sparkasse,213113204,52.542750,13.392862,POINT (52.54275 13.39286)
...,...,...,...,...,...,...,...
4995,4995,['bench'],,2687646399,52.407078,13.562105,POINT (52.40708 13.56210)
4996,4996,['bench'],,2687646403,52.407081,13.562048,POINT (52.40708 13.56205)
4997,4997,['bench'],,2687646405,52.407066,13.564322,POINT (52.40707 13.56432)
4998,4998,['bench'],,2687646415,52.407187,13.563088,POINT (52.40719 13.56309)


In [16]:
#spacial joining both datasets on nearest distance of any a route's waypoint from a poi
poi_routes = gpd.sjoin_nearest(gdf_poi, gdf_routes, how='left', max_distance=0.001, distance_col='distance')

#followed by some manipulation for the optics
poi_routes.drop(['Unnamed: 0', 'id', 'lat', 'lon', 'index_right', 'latitude', 'longitude'], axis=1, inplace=True) #drop info to reduce risk of overfitting
poi_routes = poi_routes.iloc[:, [3,5,4,2,0,1,6]] #rearranging columns
poi_routes.rename({'geometry': 'poi_lat_lgt', 'category': 'poi_category', 'name': 'poi_name'}, axis=1, inplace=True) #renaming cloumns
poi_routes['route_id'] = poi_routes['route_id'].astype(int, errors='ignore') #converting float to int for some reason isn't working
poi_routes.dropna(thresh=5, inplace=True) #dropping any poi without a route passing by in max_distance
poi_routes.sort_values(by=['route_id', 'num_of_waypoint'], inplace=True) #sorting by route_id and by number of waypoint to keep the order
poi_routes.reset_index(drop=True, inplace=True) #reset index
poi_routes


Unnamed: 0,route_id,route_linestring,num_of_waypoint,poi_lat_lgt,poi_category,poi_name,distance
0,113043.0,"LINESTRING (52.45147 13.69072, 52.45147 13.690...",367.0,POINT (52.45474 13.62477),['atm'],HypoVereinsbank,0.000352
1,113043.0,"LINESTRING (52.45147 13.69072, 52.45147 13.690...",368.0,POINT (52.45494 13.62547),['atm'],Berliner Volksbank,0.000191
2,113043.0,"LINESTRING (52.45147 13.69072, 52.45147 13.690...",374.0,POINT (52.45611 13.62562),['bar'],Sports Bar,0.000237
3,113104.0,"LINESTRING (52.50054 13.44143, 52.50090 13.442...",11.0,POINT (52.51104 13.42722),['bar'],Holzmarkt Pampa,0.000442
4,198322.0,"LINESTRING (52.52543 13.34187, 52.52524 13.342...",66.0,POINT (52.50566 13.33769),['bar'],Monkey Bar,0.000505
...,...,...,...,...,...,...,...
2878,3672503.0,"LINESTRING (52.51441 13.35027, 52.51438 13.350...",53.0,POINT (52.48454 13.35241),['bar'],Hannibal,0.000251
2879,3672505.0,"LINESTRING (52.51453 13.35019, 52.51459 13.350...",69.0,POINT (52.53098 13.38433),['atm'],Euronet,0.000472
2880,3672506.0,"LINESTRING (52.51448 13.35014, 52.51439 13.350...",17.0,POINT (52.51310 13.35563),['bench'],,0.000076
2881,3672507.0,"LINESTRING (52.51446 13.35019, 52.51439 13.350...",31.0,POINT (52.50650 13.33261),['atm'],,0.000098


In [10]:
#writing joint sample data into a csv file
poi_routes.to_csv('joint_sample_data.csv')

Notes on joining routes and points of interest data

For discovering at which points of interest our touristic and hiking routes pass by (which we believe justifies people to chose those routes in the first place) we had to join our routes data and our data about specific points of interest.

The geopandas method sjoin_nearest matches the coordinates of every point of interest (poi) to the nearest waypoints from our routes dataset, if they are closer than the defined max_distance. We had grouped the waypoints before and had connected a linestring for the whole route to every waypoint, so that after joining the poi are essentially matched with information on full routes.

By left joining poi and routes, we ensured - turning it round again - that we could detect several poi per each route. That gave us far more matches than joining on the linestring geometry from the routes data. 

If there are several route's waypoints equally near to a poi, sjoin_nearest matches both.

Since all calculations were only applied to a small area (Berlin), coordinate reference system (CRS) metrics for the max_distance parameter were neglectable. We found a tolerable distance value (max_distance=0.001) by (visual) experimentation. That distance to the route means that the poi might be at the other side of the street, or elseway visible to a pedestrian.