In [1]:
import holidays
import pandas as pd
import os

In [2]:
def recursive_get_files(path):
    """Collect the paths of all files found under ``path``, at any depth.

    Parameters
    ----------
    path : str
        Root directory handed to ``os.walk``.

    Returns
    -------
    list of str
        One entry per file, built by joining the containing directory with
        the file name. Entries appear in the order ``os.walk`` yields them;
        the list is empty when no files are found.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
    ]

In [3]:
fldr_in = "nyctaxi/part_parquet_stage1/"

# os.walk yields directory entries in arbitrary, filesystem-dependent order.
# Sort the parquet paths so that fns[0] (the sample file used for prototyping)
# is the same file on every run.
fns = sorted(
    fn for fn in recursive_get_files(fldr_in) if fn.endswith(".parquet")
)

In [4]:
# Number of .parquet files found under fldr_in.
len(fns)

599

In [5]:
# Take the first discovered file as the sample for the single-file steps.
fn = fns[0]

In [6]:
# Load the sample file into a DataFrame and preview its first rows.
df = pd.read_parquet(fn)
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,...,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,PULocationID,DOLocationID,year
0,1,05/05/2016 10:18:52 PM,05/05/2016 10:25:35 PM,1,0.9,-73.983437,40.722141,1,N,-73.98877,...,6.5,0.5,0.5,1.55,0.0,0.3,9.35,,,2016
1,1,05/14/2016 01:00:32 PM,05/14/2016 01:07:52 PM,2,1.2,-73.994072,40.761765,1,N,-73.982452,...,7.0,0.0,0.5,0.0,0.0,0.3,7.8,,,2016
2,1,05/06/2016 02:59:36 PM,05/06/2016 03:29:22 PM,1,2.4,-73.963943,40.776794,1,N,-73.988365,...,18.5,0.0,0.5,0.0,0.0,0.3,19.3,,,2016
3,1,05/19/2016 10:13:47 PM,05/19/2016 10:32:45 PM,1,3.7,-73.987656,40.732311,1,N,-73.952148,...,15.5,0.5,0.5,4.2,0.0,0.3,21.0,,,2016
4,2,05/07/2016 02:40:26 PM,05/07/2016 02:58:03 PM,5,1.73,-73.974777,40.777763,1,N,-73.953613,...,12.5,0.0,0.5,2.66,0.0,0.3,15.96,,,2016


In [7]:
# Print column dtypes and non-null counts for the sample frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 391230 entries, 0 to 391229
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               391230 non-null  int64  
 1   tpep_pickup_datetime   391230 non-null  object 
 2   tpep_dropoff_datetime  391230 non-null  object 
 3   passenger_count        391230 non-null  int64  
 4   trip_distance          391230 non-null  float64
 5   pickup_longitude       391230 non-null  float64
 6   pickup_latitude        391230 non-null  float64
 7   RatecodeID             391230 non-null  int64  
 8   store_and_fwd_flag     391230 non-null  object 
 9   dropoff_longitude      391230 non-null  float64
 10  dropoff_latitude       391230 non-null  float64
 11  payment_type           391230 non-null  int64  
 12  fare_amount            391230 non-null  float64
 13  extra                  391230 non-null  float64
 14  mta_tax                391230 non-nu

In [8]:
# The pickup/dropoff timestamp columns; preview their raw values to see the
# text format that has to be parsed.
dt_cols = ['tpep_pickup_datetime',
           'tpep_dropoff_datetime']
df[dt_cols].head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime
0,05/05/2016 10:18:52 PM,05/05/2016 10:25:35 PM
1,05/14/2016 01:00:32 PM,05/14/2016 01:07:52 PM
2,05/06/2016 02:59:36 PM,05/06/2016 03:29:22 PM
3,05/19/2016 10:13:47 PM,05/19/2016 10:32:45 PM
4,05/07/2016 02:40:26 PM,05/07/2016 02:58:03 PM


In [9]:
def features_dt(df, col_dt):
    """Add calendar features derived from a datetime column.

    The new columns are written into ``df`` itself (the frame is modified in
    place) and are named ``{col_dt}_month``, ``{col_dt}_day``,
    ``{col_dt}_hour``, ``{col_dt}_minute``, ``{col_dt}_dow`` and
    ``{col_dt}_is_weekend``, in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_dt``.
    col_dt : str
        Name of a column that supports the ``.dt`` accessor.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    accessor = df[col_dt].dt
    derived = {
        "month": accessor.month,
        "day": accessor.day,
        "hour": accessor.hour,
        "minute": accessor.minute,
        "dow": accessor.weekday,  # Monday=0 ... Sunday=6
    }
    for suffix, values in derived.items():
        df[f"{col_dt}_{suffix}"] = values
    # Weekday numbers 5 and 6 are Saturday and Sunday.
    df[f"{col_dt}_is_weekend"] = df[f"{col_dt}_dow"] >= 5
    return df

In [10]:
def add_features(fn, fldr_in, fldr_out):
    """Parse the trip timestamps of one parquet file, add pickup-time
    features, and write the result to the mirrored location under ``fldr_out``.

    Parameters
    ----------
    fn : str
        Path of the parquet file to process; must be located inside
        ``fldr_in``.
    fldr_in : str
        Root folder of the input files.
    fldr_out : str
        Root folder for the output. The path of ``fn`` relative to
        ``fldr_in`` is reproduced below it; missing directories are created.

    Returns
    -------
    None
        The result is written to disk only.

    Raises
    ------
    ValueError
        If ``fn`` is not inside ``fldr_in``. Raised before any file is read
        or written.
    """
    # Resolve the destination first so a bad path fails before the expensive
    # read. A plain ``fn.replace(fldr_in, fldr_out)`` would rewrite every
    # occurrence of the substring and, when there is no match, leave the path
    # unchanged so that the input file gets overwritten.
    rel_path = os.path.relpath(fn, fldr_in)
    if rel_path == os.pardir or rel_path.startswith(os.pardir + os.sep):
        raise ValueError(
            f"{fn!r} is not inside the input folder {fldr_in!r}")
    fn_out = os.path.join(fldr_out, rel_path)

    df = pd.read_parquet(fn)
    dt_cols = ['tpep_pickup_datetime',
               'tpep_dropoff_datetime']
    for col in dt_cols:
        # Timestamps are stored as text such as "05/05/2016 10:18:52 PM";
        # an explicit format avoids per-value format inference.
        df[col] = pd.to_datetime(df[col],
                                 format='%m/%d/%Y %I:%M:%S %p')
    # Only the pickup timestamp is expanded into calendar features.
    df = features_dt(df, 'tpep_pickup_datetime')

    out_dir = os.path.dirname(fn_out)
    if out_dir:  # os.makedirs('') raises, so skip when writing into the cwd
        os.makedirs(out_dir, exist_ok=True)
    df.to_parquet(fn_out)

In [11]:
# Destination root passed to add_features for the feature-enriched files.
fldr_out = 'nyctaxi/part_parquet_stage1_features/'

In [12]:
%%time
# Time the transformation on the single sample file before running it on
# every file. add_features has no return statement, so `o` is None.
o = add_features(fn, fldr_in, fldr_out)

CPU times: user 5.18 s, sys: 207 ms, total: 5.39 s
Wall time: 5.34 s


In [13]:
import dask.bag as db
from dask.distributed import Client
# Create a dask.distributed client with default arguments; the bare `client`
# expression on the last line displays its summary as the cell output.
client = Client()
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 40135 instead


0,1
Client  Scheduler: tcp://127.0.0.1:39995  Dashboard: http://127.0.0.1:40135/status,Cluster  Workers: 4  Cores: 4  Memory: 16.50 GB


In [14]:
%%time
out = db.from_sequence(fns)\
        .map(lambda fn: add_features(fn, fldr_in, fldr_out))\
        .compute()

CPU times: user 3min 28s, sys: 20 s, total: 3min 48s
Wall time: 44min 20s


In [15]:
# Close the dask client now that the batch run has finished.
client.close()