In [1]:
import ast
import zipfile

import pandas as pd
from tqdm.contrib.concurrent import process_map, thread_map

In [2]:
zipfile_train = zipfile.ZipFile(file="data/train.csv.zip")

In [3]:
df_train = pd.read_csv(zipfile_train.open("train.csv"))

In [4]:
df_train.rename(
    columns={
        "TRIP_ID": "trip_id",
        "CALL_TYPE": "call_type",
        "ORIGIN_CALL": "origin_call",
        "ORIGIN_STAND": "origin_stand",
        "TAXI_ID": "taxi_id",
        "TIMESTAMP": "timestamp",
        "DAY_TYPE": "day_type",
        "MISSING_DATA": "missing_data",
        "POLYLINE": "geometry",
    },
    inplace=True,
)

In [5]:
df_train.head()

Unnamed: 0,trip_id,call_type,origin_call,origin_stand,taxi_id,timestamp,day_type,missing_data,geometry
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [6]:
df_train = df_train[df_train["missing_data"] == False]

In [7]:
df_train.drop("missing_data", axis=1, inplace=True)

In [8]:
df_train["geometry"] = process_map(
    ast.literal_eval, df_train["geometry"], chunksize=1000, max_workers=20
)

  0%|          | 0/1710660 [00:00<?, ?it/s]

In [9]:
df_train["length"] = process_map(
    len, df_train["geometry"], chunksize=1000, max_workers=20
)

  0%|          | 0/1710660 [00:00<?, ?it/s]

In [10]:
def compute_travel_time(num_points):
    return (num_points - 1) * 15

In [11]:
df_train["travel_time_seconds"] = thread_map(
    compute_travel_time, df_train["length"], chunksize=1000, max_workers=20
)

  0%|          | 0/1710660 [00:00<?, ?it/s]

In [12]:
df_train = df_train[df_train["length"] > 2].drop("length", axis=1)

In [13]:
df_train.to_parquet(path="data/porto_taxi.parquet")