# 0 prepare Milan GPS dataset

The goal of the following notebook is to download and preprocess the Milan GPS dataset used to create a realistic mobility demand (i.e., trips between locations).

This notebook is optional: you can skip it and use a precomputed Origin-Destination (OD) matrix instead.

___

### Dataset Download Instructions

1. Create an account at https://sobigdata.d4science.org/
2. Login and download the dataset at https://ckan-sobigdata.d4science.org/dataset/gps_track_milan_italy
3. Unzip the dataset and place it at the path specified in `dataset_path`
___

In [None]:
import os

import pandas as pd
import numpy as np
import skmob
import geopandas as gpd
from skmob.preprocessing import *
from skmob.measures.individual import *
from skmob.preprocessing import detection
from preprocess_utils import *

### Preprocessing parameters

In [None]:
# --- Input / output locations -------------------------------------------
dataset_path = '../data/raw/MilanoData.csv'
shape_path = "../data/shapes/Milano_big_shape.geojson"

# --- Cleaning thresholds --------------------------------------------------
# Speed threshold (km/h) passed to the noise filter.
max_speed_kmh = 270
# Radius (km) passed to the compression step.
spatial_radius_km_compress = 0.05

# --- Stop detection -------------------------------------------------------
# Radius (km) and minimum duration (minutes) passed to the stop detection.
spatial_radius_km_stops = 0.100
minutes_for_a_stop = 20

# --- Time-of-day window ---------------------------------------------------
# Only the time of day matters here; the calendar date in the strings is
# discarded by `.time()`.
lower_time = pd.Timestamp("1994-10-14 15:00:00").time()
upper_time = pd.Timestamp("1994-10-14 16:00:00").time()

### 1. Data Loading

In [None]:
# Read the raw GPS points from the CSV at `dataset_path`, parsing the
# `datetime` column into timestamps, and preview the first two rows.
df_traj = pd.read_csv(
    dataset_path,
    sep=',',
    parse_dates=['datetime'],
)
df_traj.head(2)

In [None]:
# Summary of the raw dataset: distinct users, number of GPS points and the
# time span covered by the records.
print(f"# of users: {df_traj['userid'].nunique(dropna=False)}")
print(f"# of points: {len(df_traj)}")
print(f"from: {df_traj['datetime'].min()}")
print(f"to: {df_traj['datetime'].max()}")

Convert the `DataFrame` into a `TrajDataFrame`

In [None]:
# Wrap the raw frame into a scikit-mobility TrajDataFrame, mapping the CSV
# column names to the roles skmob expects, then order the points by user
# and timestamp.
tdf_traj = skmob.TrajDataFrame(
    df_traj,
    latitude='lat',
    longitude='lon',
    datetime='datetime',
    user_id='userid',
).sort_by_uid_and_datetime()

### 2. Data Cleaning

Execution time: $\approx 2min$

In [None]:
%%time

tdf_filtered = skmob.preprocessing.filtering.filter(tdf_traj, max_speed_kmh=max_speed_kmh, 
                                    include_loops=False)

print("Filtered "+str(len(tdf_traj)-len(tdf_filtered))+" points.")
print(len(tdf_filtered))

Merge together all points that are closer than spatial_radius_km=0.05 kilometers from each other.<br>
Execution time: $\approx 4 min$

In [None]:
%%time

tdf_compressed = compression.compress(tdf_filtered, 
                                      spatial_radius_km=spatial_radius_km_compress)

print("Compressed "+str(len(tdf_filtered)-len(tdf_compressed))+" points.")
print("Radius: "+str(spatial_radius_km_compress))

### 3. Trajectory segmentation with Stop detection

#### Stop detection

A stop is detected when the individual spends at least `minutes_for_a_stop` minutes within `spatial_radius_km` kilometers of a given trajectory point (`stop_radius_factor` is passed as `None`, so the radius is taken directly from `spatial_radius_km`). The stop’s coordinates are the median latitude and longitude values of the points found within the specified distance. <br>
Parameters: <br>
`minutes_for_a_stop = 20.0` <br>
`spatial_radius_km = 0.1`<br><br>
Execution time: $\approx 2min$

In [None]:
%%time

stdf = detection.stay_locations(tdf_traj, stop_radius_factor=None, 
                           minutes_for_a_stop=minutes_for_a_stop, 
                       spatial_radius_km=spatial_radius_km_stops, leaving_time=True)

#### Trajectory segmentation
Execution time: $\approx 4 min$

In [None]:
%%time

traj_seg = split_trajectories_in_tdf(tdf_compressed, stdf)

# create an UNIQUE traj_id as uid+'_'+tid
traj_ids = []
for uid, tid in zip(traj_seg['uid'], traj_seg['tid']):
    traj_ids.append(str(uid)+"_"+str(tid))

traj_seg = traj_seg.drop("tid", axis=1)
traj_seg['traj_id'] = traj_ids

In [None]:
# Summary of the segmented dataset: distinct `uid` values, number of GPS
# points and the time span covered.
print(f"# of users: {traj_seg['uid'].nunique(dropna=False)}")
print(f"# of points: {len(traj_seg)}")
print(f"from: {traj_seg['datetime'].min()}")
print(f"to: {traj_seg['datetime'].max()}")

### 4. Filter by geographic area

In [None]:
# Read the GeoJSON file at `shape_path` into a GeoDataFrame; it is used below
# both for plotting and for the geographic filtering.
milan_medium = gpd.read_file(shape_path)

In [None]:
from skmob.utils.plot import *

# Visual style used to draw the area: blue fill, black border, 20% opacity.
tex_style = dict(fillColor='blue', color='black', opacity=0.2)

# Render the area of interest on a map.
plot_gdf(milan_medium, style_func_args=tex_style, zoom=12)

Keep only the trajectories with at least one GPS point inside the geographic area of interest.

In [None]:
%%time

# Classify the segmented trajectories against the `milan_medium` area.
# NOTE(review): judging by the variable names, the helper returns the ids
# lying entirely inside the area and the ids with at least one point inside;
# confirm against `preprocess_utils.filter_in_shape`.
# NOTE(review): later cells read an `isin` column that is not created in this
# notebook; presumably `drop=False` makes the helper add/keep it -- verify.
id_all_in, id_atleast_one_in = filter_in_shape(traj_seg, milan_medium, drop=False)

In [None]:
# Keep the ids flagged as having at least one point in the area, then discard
# those made of a single row. Note that the row count below is taken over all
# rows of an id, not only over the rows that fall inside the area.
inside_mask = traj_seg['uid'].isin(id_atleast_one_in)
traj_inside = traj_seg[inside_mask]

# Per-id row counts (non-null values of every column).
gb = traj_inside.groupby("uid", as_index=False).count()

# Ids with more than one point.
ids_traj_mobility = gb.loc[gb['lat'] > 1, 'uid']
traj_filtered_area = traj_inside[traj_inside['uid'].isin(ids_traj_mobility)]

# Number of ids that survive the filter.
print(traj_filtered_area['uid'].nunique(dropna=False))

### 5. Trajectory segmentation$^2$

Cut the trajectories with respect to the boundaries


In [None]:
%%time

res = segment_trajectories_area(traj_filtered_area)

traj_segmented_2 = traj_filtered_area.drop(['uid'], axis=1)
traj_segmented_2['uid'] = res

Filter 1. Keep only the sub-trajectories INSIDE the geographic region

In [None]:
# Ids that have at least one row whose `isin` flag equals True.
inside_rows = traj_segmented_2['isin'] == True
ids_filter1 = list(traj_segmented_2.loc[inside_rows, 'uid'].unique())

# Keep every row belonging to those ids.
df_traj_f1 = traj_segmented_2[traj_segmented_2['uid'].isin(ids_filter1)]
print(f"Trajectories: {len(ids_filter1)}")

Filter 2. Keep only the sub-trajectories with at least 2 GPS points

In [None]:
# Count rows per id (non-null values of every column).
gb = df_traj_f1.groupby("uid", as_index=False).count()

# Ids with more than one non-null `isin` value, i.e. at least two points.
ids_filter2 = list(gb.loc[gb['isin'] > 1, 'uid'].unique())

# Keep every row belonging to those ids.
df_traj_f2 = df_traj_f1[df_traj_f1['uid'].isin(ids_filter2)]
print(f"Trajectories: {len(ids_filter2)}")

Filter 3. Keep only working days

In [None]:
# One row per id, holding the first non-null value of each column; the first
# `datetime` is used as the day the trip belongs to.
trips_grouped = df_traj_f2.groupby(['uid'], as_index=False).first()

# Day of the week as an integer (Monday=0 ... Sunday=6), computed with the
# vectorized `.dt` accessor instead of a per-row Python lambda.
# `pd.to_datetime` keeps this working even if the column arrives as objects.
trips_grouped['day_number'] = pd.to_datetime(trips_grouped['datetime']).dt.weekday
trips_grouped.head(2)

In [None]:
# Ids whose first point falls on a working day (day_number 0-4, Mon-Fri).
is_working_day = trips_grouped['day_number'] < 5
tid_2_keep = trips_grouped.loc[is_working_day, 'uid'].unique()

# Keep every row belonging to those ids and report how many ids remain.
traj_d_week = df_traj_f2[df_traj_f2['uid'].isin(tid_2_keep)]
print(traj_d_week['uid'].nunique(dropna=False))

### 6. Save the pre-processed dataset

In [None]:
# Destination of the pre-processed dataset (CSV, without the index column).
output_path = "../data/preprocessed/MilanoData_big_preprocessed.csv"

# The output folder may not exist on a fresh checkout; create it first so the
# long pre-processing run is not lost to a failed write.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

traj_d_week.to_csv(output_path, index=False)