# Save to parquet as list of coordinates

In [2]:
import os
from pprint import pprint
import pandas as pd
from time import time
import geopandas as gpd
import contextily
from shapely.geometry import Point, LineString
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [3]:
col = ['taxi_id', 'date_time', 'lon', 'lat']
path = 't_drive_raw'
print('Liczba plików: ', len(os.listdir(path)))

Liczba plików:  10357


In [4]:
out_df = pd.DataFrame(columns=col)

t1 = time()
counter = 0
raw_data = []
for f in tqdm(sorted(os.listdir(path))):
    data = open(os.path.join(path, f), 'r').read().split('\n')
    data = [d.split(',') for d in data]
    data.pop()
    raw_data += data
    counter += 1
t2 = time()
print(raw_data)
print('time: ', t2 - t1, 'files converted: ', counter)

100%|██████████| 10357/10357 [00:52<00:00, 195.81it/s]
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [5]:
out_df = pd.DataFrame(raw_data, columns=col)
out_df.head()

Unnamed: 0,taxi_id,date_time,lon,lat
0,1,2008-02-02 15:36:08,116.51172,39.92123
1,1,2008-02-02 15:46:08,116.51135,39.93883
2,1,2008-02-02 15:46:08,116.51135,39.93883
3,1,2008-02-02 15:56:08,116.51627,39.91034
4,1,2008-02-02 16:06:08,116.47186,39.91248


In [6]:
# date_tiem column to datetime
out_df['date_time'] = pd.to_datetime(out_df['date_time'])

In [7]:
out_df['date_time'][0]

Timestamp('2008-02-02 15:36:08')

In [8]:
out_df.shape

(17662984, 4)

In [9]:
gdf = gpd.GeoDataFrame(out_df, geometry=gpd.points_from_xy(out_df.lon, out_df.lat), crs='EPSG:4326')

In [10]:
gdf.to_parquet('t_drive.parquet')

In [95]:
# unique_ids = out_df['taxi_id'].unique()
# train_ids, testval_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)
# val_ids, test_ids = train_test_split(testval_ids, test_size=0.5, random_state=42)
# 
# train_df = out_df[out_df['taxi_id'].isin(train_ids)]
# val_df = out_df[out_df['taxi_id'].isin(val_ids)]
# test_df = out_df[out_df['taxi_id'].isin(test_ids)]
# 
# train_df.shape, val_df.shape, test_df.shape

((14282609, 4), (1806035, 4), (1574340, 4))

In [96]:
# train_df.shape[0]/out_df.shape[0]*100, val_df.shape[0]/out_df.shape[0]*100, test_df.shape[0]/out_df.shape[0]*100

(80.86181247743869, 10.22497104679481, 8.913216475766497)

# II part

In [97]:
geodf_train = gpd.GeoDataFrame(train_df, geometry=gpd.points_from_xy(train_df.lon, train_df.lat), crs='EPSG:4326')
geodf_test = gpd.GeoDataFrame(test_df, geometry=gpd.points_from_xy(test_df.lon, test_df.lat), crs='EPSG:4326')
geodf_val = gpd.GeoDataFrame(val_df, geometry=gpd.points_from_xy(val_df.lon, val_df.lat), crs='EPSG:4326')

geodf_train.head()

Unnamed: 0,taxi_id,date_time,lon,lat,geometry
588,10,2008-02-02 13:32:03,116.44457,39.92157,POINT (116.44457 39.92157)
589,10,2008-02-02 13:33:58,116.44043,39.9219,POINT (116.44043 39.92190)
590,10,2008-02-02 13:34:25,116.4404,39.92192,POINT (116.44040 39.92192)
591,10,2008-02-02 13:35:08,116.43528,39.9228,POINT (116.43528 39.92280)
592,10,2008-02-02 13:36:03,116.43523,39.92287,POINT (116.43523 39.92287)


In [98]:
train_list = geodf_train.groupby(by = 'taxi_id').agg(list)
test_list = geodf_test.groupby(by = 'taxi_id').agg(list)
val_list = geodf_val.groupby(by = 'taxi_id').agg(list)

In [99]:
train_list.head()

Unnamed: 0_level_0,date_time,lon,lat,geometry
taxi_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,"[2008-02-02 13:32:03, 2008-02-02 13:33:58, 200...","[116.44457, 116.44043, 116.4404, 116.43528, 11...","[39.92157, 39.9219, 39.92192, 39.9228, 39.9228...","[POINT (116.44457 39.92157), POINT (116.44043 ..."
100,"[2008-02-02 18:44:59, 2008-02-02 18:46:14, 200...","[116.46956, 116.47115, 116.48475, 116.48589, 1...","[39.87211, 39.87062, 39.86908, 39.87436, 39.87...","[POINT (116.46956 39.87211), POINT (116.47115 ..."
10000,"[2008-02-02 13:39:48, 2008-02-02 13:44:50, 200...","[116.35924, 116.35929, 116.35926, 116.35928, 1...","[39.93669, 39.93688, 39.93681, 39.93646, 39.93...","[POINT (116.35924 39.93669), POINT (116.35929 ..."
10001,"[2008-02-02 13:34:36, 2008-02-02 13:36:29, 200...","[116.30375, 116.29665, 116.29957, 116.30406, 1...","[39.93778, 39.93125, 39.93099, 39.91203, 39.90...","[POINT (116.30375 39.93778), POINT (116.29665 ..."
10002,"[2008-02-02 13:40:13, 2008-02-02 13:45:15, 200...","[116.41106, 116.41104, 116.41104, 116.41085, 1...","[39.92823, 39.92822, 39.92822, 39.92821, 39.92...","[POINT (116.41106 39.92823), POINT (116.41104 ..."


In [100]:
print(train_list.shape, test_list.shape, val_list.shape)

(8268, 4) (1034, 4) (1034, 4)


In [101]:
# drop rows with geometry len = 1
train_list = train_list[train_list['geometry'].apply(lambda x: len(x) > 1)]
test_list = test_list[test_list['geometry'].apply(lambda x: len(x) > 1)]
val_list = val_list[val_list['geometry'].apply(lambda x: len(x) > 1)]


In [102]:
print(train_list.shape, test_list.shape, val_list.shape)

(8257, 4) (1032, 4) (1031, 4)


In [103]:
# reset index
train_list = train_list.reset_index()
test_list = test_list.reset_index()
val_list = val_list.reset_index()

In [104]:
train_list.head()

Unnamed: 0,taxi_id,date_time,lon,lat,geometry
0,10,"[2008-02-02 13:32:03, 2008-02-02 13:33:58, 200...","[116.44457, 116.44043, 116.4404, 116.43528, 11...","[39.92157, 39.9219, 39.92192, 39.9228, 39.9228...","[POINT (116.44457 39.92157), POINT (116.44043 ..."
1,100,"[2008-02-02 18:44:59, 2008-02-02 18:46:14, 200...","[116.46956, 116.47115, 116.48475, 116.48589, 1...","[39.87211, 39.87062, 39.86908, 39.87436, 39.87...","[POINT (116.46956 39.87211), POINT (116.47115 ..."
2,10000,"[2008-02-02 13:39:48, 2008-02-02 13:44:50, 200...","[116.35924, 116.35929, 116.35926, 116.35928, 1...","[39.93669, 39.93688, 39.93681, 39.93646, 39.93...","[POINT (116.35924 39.93669), POINT (116.35929 ..."
3,10001,"[2008-02-02 13:34:36, 2008-02-02 13:36:29, 200...","[116.30375, 116.29665, 116.29957, 116.30406, 1...","[39.93778, 39.93125, 39.93099, 39.91203, 39.90...","[POINT (116.30375 39.93778), POINT (116.29665 ..."
4,10002,"[2008-02-02 13:40:13, 2008-02-02 13:45:15, 200...","[116.41106, 116.41104, 116.41104, 116.41085, 1...","[39.92823, 39.92822, 39.92822, 39.92821, 39.92...","[POINT (116.41106 39.92823), POINT (116.41104 ..."


In [105]:
train_list['geometry'] = train_list['geometry'].apply(lambda x: LineString(x))
test_list['geometry'] = test_list['geometry'].apply(lambda x: LineString(x))
val_list['geometry'] = val_list['geometry'].apply(lambda x: LineString(x))

In [106]:
train_list.head()

Unnamed: 0,taxi_id,date_time,lon,lat,geometry
0,10,"[2008-02-02 13:32:03, 2008-02-02 13:33:58, 200...","[116.44457, 116.44043, 116.4404, 116.43528, 11...","[39.92157, 39.9219, 39.92192, 39.9228, 39.9228...","LINESTRING (116.44457 39.92157, 116.44043 39.9..."
1,100,"[2008-02-02 18:44:59, 2008-02-02 18:46:14, 200...","[116.46956, 116.47115, 116.48475, 116.48589, 1...","[39.87211, 39.87062, 39.86908, 39.87436, 39.87...","LINESTRING (116.46956 39.87211, 116.47115 39.8..."
2,10000,"[2008-02-02 13:39:48, 2008-02-02 13:44:50, 200...","[116.35924, 116.35929, 116.35926, 116.35928, 1...","[39.93669, 39.93688, 39.93681, 39.93646, 39.93...","LINESTRING (116.35924 39.93669, 116.35929 39.9..."
3,10001,"[2008-02-02 13:34:36, 2008-02-02 13:36:29, 200...","[116.30375, 116.29665, 116.29957, 116.30406, 1...","[39.93778, 39.93125, 39.93099, 39.91203, 39.90...","LINESTRING (116.30375 39.93778, 116.29665 39.9..."
4,10002,"[2008-02-02 13:40:13, 2008-02-02 13:45:15, 200...","[116.41106, 116.41104, 116.41104, 116.41085, 1...","[39.92823, 39.92822, 39.92822, 39.92821, 39.92...","LINESTRING (116.41106 39.92823, 116.41104 39.9..."


In [107]:
train_list['arrays_geometry'] = train_list['geometry'].apply(lambda x: list(x.coords))
test_list['arrays_geometry'] = test_list['geometry'].apply(lambda x: list(x.coords))
val_list['arrays_geometry'] = val_list['geometry'].apply(lambda x: list(x.coords))

In [108]:
train_list.head() 

Unnamed: 0,taxi_id,date_time,lon,lat,geometry,arrays_geometry
0,10,"[2008-02-02 13:32:03, 2008-02-02 13:33:58, 200...","[116.44457, 116.44043, 116.4404, 116.43528, 11...","[39.92157, 39.9219, 39.92192, 39.9228, 39.9228...","LINESTRING (116.44457 39.92157, 116.44043 39.9...","[(116.44457, 39.92157), (116.44043, 39.9219), ..."
1,100,"[2008-02-02 18:44:59, 2008-02-02 18:46:14, 200...","[116.46956, 116.47115, 116.48475, 116.48589, 1...","[39.87211, 39.87062, 39.86908, 39.87436, 39.87...","LINESTRING (116.46956 39.87211, 116.47115 39.8...","[(116.46956, 39.87211), (116.47115, 39.87062),..."
2,10000,"[2008-02-02 13:39:48, 2008-02-02 13:44:50, 200...","[116.35924, 116.35929, 116.35926, 116.35928, 1...","[39.93669, 39.93688, 39.93681, 39.93646, 39.93...","LINESTRING (116.35924 39.93669, 116.35929 39.9...","[(116.35924, 39.93669), (116.35929, 39.93688),..."
3,10001,"[2008-02-02 13:34:36, 2008-02-02 13:36:29, 200...","[116.30375, 116.29665, 116.29957, 116.30406, 1...","[39.93778, 39.93125, 39.93099, 39.91203, 39.90...","LINESTRING (116.30375 39.93778, 116.29665 39.9...","[(116.30375, 39.93778), (116.29665, 39.93125),..."
4,10002,"[2008-02-02 13:40:13, 2008-02-02 13:45:15, 200...","[116.41106, 116.41104, 116.41104, 116.41085, 1...","[39.92823, 39.92822, 39.92822, 39.92821, 39.92...","LINESTRING (116.41106 39.92823, 116.41104 39.9...","[(116.41106, 39.92823), (116.41104, 39.92822),..."


In [109]:
# drop 'geometry' column
train_list = train_list.drop(columns = 'geometry')
test_list = test_list.drop(columns = 'geometry')
val_list = val_list.drop(columns = 'geometry')

In [110]:
if not os.path.exists('data/train_list.parquet'):
    train_list.to_parquet('data/train_list.parquet')
    
if not os.path.exists('data/test_list.parquet'):
    test_list.to_parquet('data/test_list.parquet')
    
if not os.path.exists('data/val_list.parquet'):
    val_list.to_parquet('data/val_list.parquet')
    

# read

In [111]:
path_val = 'data/val_list.parquet'

In [112]:
val = pd.read_parquet(path_val)

In [113]:
val['geometry'] = val['arrays_geometry'].map(LineString)

In [114]:
val_gdf = gpd.GeoDataFrame(val, geometry='geometry', crs='EPSG:4326')
# set index
val_gdf = val_gdf.set_index('taxi_id')

In [115]:
print(val_gdf.head())

                                                 date_time  \
taxi_id                                                      
10006    [2008-02-02T13:37:16.000000, 2008-02-02T13:42:...   
10015    [2008-02-02T13:40:45.000000, 2008-02-02T13:45:...   
10018    [2008-02-02T13:37:21.000000, 2008-02-02T13:42:...   
10023    [2008-02-02T13:34:04.000000, 2008-02-02T13:38:...   
10027    [2008-02-02T14:04:48.000000, 2008-02-02T14:09:...   

                                                       lon  \
taxi_id                                                      
10006    [116.54249, 116.52455, 116.51205, 116.48494, 1...   
10015    [116.34893, 116.34614, 116.32941, 116.32777, 1...   
10018    [116.36999, 116.36993, 116.36996, 116.36993, 1...   
10023    [116.5127, 116.46278, 116.47643, 116.47604, 11...   
10027    [116.50482, 116.50482, 116.50482, 116.50482, 1...   

                                                       lat  \
taxi_id                                                      
10006 