In [73]:
import geopandas as gpd
from geopy.distance import geodesic
import pandas as pd
import numpy as np
from shapely.geometry import shape 
import matplotlib.pyplot as plt
import datetime
import sys
import iris
sys.path.append('../../Scripts/') # link to scripts
import preprocessing.translate   # import translation scripts from preprocessing
import utils.spatial_utils
import utils.temporal_utils
import preprocessing.create_metrics

%matplotlib inline

In [74]:
# Relative paths to the MTL Trajet trip shapefiles for the 2016 and 2017 data sets.
path_2016 = "../../Data/mtl_trajet/mtl_trajet_2016.shp"
path_2017 = "../../Data/mtl_trajet/trajets_mtl_trajet_2017.shp"

## Translate and re-project data

In [None]:
# Load both data sets through the project's translation helper.
gdf_2016 = preprocessing.translate.translate_data(path_2016)
gdf_2017 = preprocessing.translate.translate_data(path_2017)

# Initialise the CRS on any frame that does not carry one.
# An unset CRS can show up as an empty dict or as None, so test for
# falsiness instead of comparing against `{}` only.
# The coordinates are taken to be WGS 84 lon/lat (EPSG:4326).
for gdf in (gdf_2016, gdf_2017):
    if not gdf.crs:
        print("initialising crs")
        gdf.crs = {'init': 'epsg:4326'}

In [None]:
# Bounding box (min_lon, min_lat, max_lon, max_lat) over every geometry in the chosen frame.
used_data = gdf_2017

# `total_bounds` returns [minx, miny, maxx, maxy] for the whole frame. It updates all
# four edges independently and does not depend on the frame having a 0..n-1 index,
# unlike a manual per-row comparison. `.tolist()` yields plain Python floats.
min_lon, min_lat, max_lon, max_lat = used_data.total_bounds.tolist()
        

In [None]:
# Display the bounding box as (min_lon, min_lat, max_lon, max_lat).
min_lon, min_lat, max_lon, max_lat

In [None]:
# Draw the 2017 trips and restrict the view to the bounding box held in
# min_lon / max_lon / min_lat / max_lat.
trip_axes = gdf_2017.plot()

trip_axes.set_xlim(min_lon, max_lon)
trip_axes.set_ylim(min_lat, max_lat)

In [None]:
# Display the bounding box again, after plotting: (min_lon, min_lat, max_lon, max_lat).
min_lon, min_lat, max_lon, max_lat

In [None]:
# Reproject each frame unless it already reports the target CRS.
# NOTE(review): change_projection is assumed to project into EPSG:3347 — confirm in utils.spatial_utils.
target_crs = {'init': 'epsg:3347'}

if gdf_2016.crs != target_crs:
    gdf_2016 = utils.spatial_utils.change_projection(gdf_2016)

if gdf_2017.crs != target_crs:
    gdf_2017 = utils.spatial_utils.change_projection(gdf_2017)

In [None]:
# Preview the first rows of the 2016 frame after the reprojection step.
gdf_2016.head()

In [None]:
# Preview the first rows of the 2017 frame after the reprojection step.
gdf_2017.head()

In [9]:
# Remove timezone information from both frames via the project helper, which returns the updated frame.
# NOTE(review): the helper logs that it converts the start and end timestamps to datetime
# objects; which columns it touches is not visible here — confirm in utils.temporal_utils.
gdf_2016 = utils.temporal_utils.remove_timezone(gdf_2016)
gdf_2017 = utils.temporal_utils.remove_timezone(gdf_2017)

converting start and end timestamps to datetime objects
converting start and end timestamps to datetime objects


## Merge with weather data

In [10]:
## see weather_data notebook for details

In [11]:
# Load the weather table; later cells read its `dt`, `precipitation` and `temperature` columns.
weather_data = pd.read_csv("../../Data/supplementary_data/weather/mtl_temp_prec.csv")

In [12]:
# Parse the `dt` column and use it as the index so the table can be sliced by time.
# Guarded on the column's presence: set_index removes `dt`, so an unguarded
# re-run of this cell would raise a KeyError.
if 'dt' in weather_data.columns:
    # One vectorised parse of the whole column instead of one call per element.
    weather_data['dt'] = pd.to_datetime(weather_data['dt'])
    weather_data = weather_data.set_index('dt')

In [13]:
def extract_weather_subset(row, weather_df):
    """Return the weather observations recorded during one trip.

    Parameters
    ----------
    row : mapping or pandas.Series
        Trip record exposing ``'starttime'`` and ``'endtime'`` datetime values.
    weather_df : pandas.DataFrame
        Weather table indexed by a ``DatetimeIndex``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``weather_df`` whose index lies between the trip start and
        end, both formatted to second precision. If no row falls in that
        window, the start is moved back by one hour and the lookup is
        repeated once; the result can still be empty.

    Raises
    ------
    AssertionError
        If ``weather_df`` is not indexed by a ``DatetimeIndex``.
    """
    # isinstance on the public class accepts subclasses and does not depend on
    # pandas' internal module layout.
    assert isinstance(weather_df.index, pd.DatetimeIndex), ("the weather data needs a datetime index")

    stamp_format = "%Y-%m-%d %H:%M:%S"
    start = row['starttime']
    end_label = row['endtime'].strftime(stamp_format)

    subset_weather = weather_df.loc[start.strftime(stamp_format):end_label]
    if len(subset_weather) < 1:
        # No observation inside the trip window: widen it to start one hour earlier.
        start = start - datetime.timedelta(hours=1)
        subset_weather = weather_df.loc[start.strftime(stamp_format):end_label]
    return subset_weather

def calc_mean_weather(subset):
    """Average the weather columns of a slice of the weather table.

    Parameters
    ----------
    subset : pandas.DataFrame
        Frame with ``precipitation`` and ``temperature`` columns.

    Returns
    -------
    tuple
        ``(mean precipitation, mean temperature)`` over the rows of ``subset``.
    """
    mean_precipitation = subset["precipitation"].mean()
    mean_temperature = subset["temperature"].mean()
    return mean_precipitation, mean_temperature


In [14]:
%%time
## apply average weather for trip to each row
## takes 15 mins
gdf_2016['av_weather'] = gdf_2016.apply(lambda row: calc_mean_weather(extract_weather_subset(row, weather_data)), axis=1)
gdf_2017['av_weather'] = gdf_2017.apply(lambda row: calc_mean_weather(extract_weather_subset(row, weather_data)), axis=1)


CPU times: user 16min 9s, sys: 8.83 s, total: 16min 17s
Wall time: 16min 17s


In [15]:
# Split the (precipitation, temperature) tuples held in `av_weather` into two named columns.
# Building one DataFrame from the list of tuples avoids creating a pd.Series per row,
# which is what `.apply(pd.Series)` does. The trip index is kept so the result lines up
# with its source frame.
weather_columns = ['precip', 'temperature']

temp_prec_16 = pd.DataFrame(gdf_2016['av_weather'].tolist(), index=gdf_2016.index, columns=weather_columns)

temp_prec_17 = pd.DataFrame(gdf_2017['av_weather'].tolist(), index=gdf_2017.index, columns=weather_columns)

In [16]:
# Append the precip / temperature columns to each frame.
# Guarded so that re-running the cell does not add a second pair of identically
# named columns (after which gdf['precip'] would return a DataFrame, not a Series).
if 'precip' not in gdf_2016.columns:
    gdf_2016 = pd.concat([gdf_2016, temp_prec_16], axis=1)

if 'precip' not in gdf_2017.columns:
    gdf_2017 = pd.concat([gdf_2017, temp_prec_17], axis=1)

In [17]:
# Preview the 2016 frame with the av_weather, precip and temperature columns attached.
gdf_2016.head()

Unnamed: 0,id_trip,avg_speed,duration,mode,purpose,n_coord,segments,starttime,endtime,geometry,av_weather,precip,temperature
0,1724206,4.4,460,walking,returning_home,12,"[{""id"": 1150192, ""source"": ""geobase_mtl""}, {""i...",2016-09-07 20:37:26,2016-09-07 20:37:26,LINESTRING (7628287.236741193 1247680.17623496...,"(1.5039444824678503e-06, 28.012522481572827)",2e-06,28.012522
1,1724208,10.7,2146,combination,work,120,"[{""id"": 1140016, ""source"": ""geobase_mtl""}, {""i...",2016-09-08 07:43:23,2016-09-08 07:43:23,LINESTRING (7627830.06960756 1247172.275532002...,"(0.0, 21.00729048861388)",0.0,21.00729
2,1889461,15.4,447,public_transport,leisure,36,"[{""id"": 1390715, ""source"": ""geobase_mtl""}, {""i...",2016-09-08 19:46:14,2016-09-08 19:46:14,LINESTRING (7632055.840015979 1247584.15415876...,"(0.0001343523737672367, 25.84488639333847)",0.000134,25.844886
3,1724219,16.8,591,car,returning_home,45,"[{""id"": 1210250, ""source"": ""geobase_mtl""}, {""i...",2016-09-08 21:41:37,2016-09-08 21:41:37,LINESTRING (7630236.131991102 1247926.25343475...,"(0.00024012980236755996, 25.38936301243564)",0.00024,25.389363
4,2071985,6.9,279,combination,pick_up_drop_off,12,"[{""id"": 1140287, ""source"": ""geobase_mtl""}, {""i...",2016-09-09 16:49:12,2016-09-09 16:49:12,"LINESTRING (7628479.883114187 1247193.3864914,...","(0.001389143386973931, 23.612509824599048)",0.001389,23.61251


In [18]:
# Preview the 2017 frame with the av_weather, precip and temperature columns attached.
gdf_2017.head()

Unnamed: 0,id_trip,mode,purpose,starttime,endtime,geometry,av_weather,precip,temperature
0,1547,,,2017-09-18 04:16:58,2017-09-18 04:16:58,LINESTRING (7624015.797731054 1247372.35912242...,"(0.0, 18.951943514056154)",0.0,18.951944
1,308312,,,2017-09-18 06:17:46,2017-09-18 06:17:46,LINESTRING (7624029.565886395 1247375.82561127...,"(0.0, 18.353765796400975)",0.0,18.353766
2,384772,,,2017-09-18 09:30:24,2017-09-18 09:30:24,(LINESTRING (7607307.107749194 1253237.7069476...,"(0.0, 17.101756334982213)",0.0,17.101756
3,150744,car,pick_up_drop_off,2017-09-18 10:02:50,2017-09-18 10:02:50,LINESTRING (7624322.265039734 1247672.97005955...,"(0.0, 16.910883638032313)",0.0,16.910884
4,199011,,,2017-09-18 10:18:40,2017-09-18 10:18:40,(LINESTRING (7616402.393582943 1246769.1996916...,"(0.0, 16.910883638032313)",0.0,16.910884


## Calculate distance, direction and duration

In [25]:
# Add a `distance_m` column (geometry length) to each frame that does not have one yet.
# NOTE(review): the length is in the units of the frame's current CRS; the column name
# assumes metres — confirm the projection in use.
for trips in (gdf_2016, gdf_2017):
    if 'distance_m' in trips.columns:
        print('distance already calculated')
    else:
        print("calculating distance")
        trips['distance_m'] = trips['geometry'].apply(lambda geom: geom.length)

calculating distance
calculating distance


In [29]:
# Add a `duration` column (seconds between starttime and endtime) where it is missing.
# NOTE(review): the recorded head() outputs show starttime == endtime on every row, which
# would make a duration computed here 0 — confirm the timestamps upstream.
for trips in (gdf_2016, gdf_2017):
    if 'duration' not in trips.columns:
        elapsed = pd.to_datetime(trips['endtime']) - pd.to_datetime(trips['starttime'])
        # total_seconds() covers the whole span. Timedelta.seconds is only the seconds
        # component (0-86399): it drops whole days and wraps for negative spans.
        # The result is float, so missing timestamps stay NaN.
        trips['duration'] = elapsed.dt.total_seconds()
    else:
        print('duration already calculated')

duration already calculated
duration already calculated


In [31]:
%%time
# Add direction information via the project helper, which returns the updated frame.
# The helper's log reports a reprojection to WGS 84 followed by a mean-direction
# calculation; the recorded run took roughly 14.5 minutes for both frames.
gdf_2016 = preprocessing.create_metrics.calc_direction(gdf_2016)
gdf_2017 = preprocessing.create_metrics.calc_direction(gdf_2017)

reprojecting data into WGS_84 for direction calculation
calculating mean direction (may take 5-10 minutes)
reprojecting data into WGS_84 for direction calculation
calculating mean direction (may take 5-10 minutes)
CPU times: user 13min 13s, sys: 13.6 s, total: 13min 27s
Wall time: 14min 32s


## Calculate Spatial and Temporal Clusters

## Add Land Use and POI data

## Add rush-hour and downtown labels

## Clean & Subset data

## Encode data

## Save final preprocessed data

In [None]:
# path to the preprocessed MTL Trajet data 
# path_final_2016 = "../../Data/mtl_trajet/mtl_trajet_2016_final.shp"
# path_final_2017 = "../../Data/mtl_trajet/trajets_mtl_trajet_2017_final.shp"