In [2]:
import pandas as pd
import holidays
import pickle

In [3]:
expected_test_types = {
    "pickup_longitude": float,
    "pickup_latitude": float,
    "dropoff_longitude": float,
    "dropoff_latitude": float,
    "passenger_count": int,
    "pickup_datetime": str
}

expected_train_types = {**expected_test_types, **{"fare_amount": float}}

date_format = "%Y-%m-%d %H:%M:%S UTC"

In [4]:
nrows_train = 55000000

try:
    with open("../../data/processed/train_data_55m.pkl", 'rb') as f:
        train_data = pickle.load(f)
except FileNotFoundError:
    train_data = pd.read_csv("../../data/raw/train.csv", 
                             nrows=nrows_train,
                             usecols=range(1, len(expected_train_types)),
                             dtype=expected_train_types)
    
    train_data["pickup_datetime"] = pd.to_datetime(train_data["pickup_datetime"], format=date_format)
    # Let's save a pickle with the dates parsed, it is the slow action there
    with open("../../data/processed/train_data_55m.pkl", 'wb') as f:
        pickle.dump(train_data, f)

In [5]:
import sys

sys.path.append("../../python/")

In [6]:
from nytf.holidays_extractor import HolidayExtractor

In [7]:
h_ex = HolidayExtractor(date_col="pickup_datetime", state="NY")

In [16]:
subset = train_data.iloc[:6000000]
subset = h_ex.fit_transform(X=subset)

In [17]:
h_ex._relative_holiday_importance

normal                                         1.000000
Lincoln's Birthday                             1.076923
Susan B. Anthony Day                           1.061617
Veterans Day                                   0.979199
Martin Luther King, Jr. Day                    0.808870
Columbus Day                                   0.874019
Washington's Birthday                          0.859498
New Year's Day                                 0.715463
Memorial Day                                   0.684066
Labor Day                                      0.702512
Independence Day                               0.689560
Thanksgiving                                   0.689168
Christmas Day                                  0.491758
Election Day                                   0.927786
Lincoln's Birthday (Observed)                  1.103218
New Year's Day (Observed)                      0.815542
Independence Day (Observed)                    0.738619
Christmas Day (Observed)                       0

In [18]:
subset["holidays_score"].describe()

count    6.000000e+06
mean     9.952773e-01
std      3.886105e-02
min      4.917582e-01
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.103218e+00
Name: holidays_score, dtype: float64

In [19]:
subset.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,holidays_score
0,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1.0
1,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1.0
2,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,1.0
3,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1.0
4,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1.0
