In [None]:
%matplotlib inline
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import zipfile

# Both archives are read straight from the zip files, with the trip `id`
# column used as the row index of each frame.
read_options = dict(compression="zip", index_col="id")

train = pd.read_csv("/kaggle/input/nyc-taxi-trip-duration/train.zip", **read_options)
test = pd.read_csv("/kaggle/input/nyc-taxi-trip-duration/test.zip", **read_options)

train.head()

Let's include some sanity checks for the data.

In [None]:
# Each check prints a confirmation when it holds and 'oops' when it does not.

# 1. Every row of the training frame has its own index value.
ids_are_unique = train.index.nunique() == train.shape[0]
if ids_are_unique:
    print('Id is unique.')
else:
    print('oops')

# 2. No index value appears in both frames.
shared_ids = np.intersect1d(train.index.values, test.index.values)
if len(shared_ids) == 0:
    print('Train and test sets are distinct.')
else:
    print('oops')

# 3. Every column has as many non-null entries as the frame has rows.
train_is_complete = train.count().min() == train.shape[0]
test_is_complete = test.count().min() == test.shape[0]
if train_is_complete and test_is_complete:
    print('We do not need to worry about missing values.')
else:
    print('oops')

# 4. Show the distinct flag values seen across both frames (reported, not verified).
flag_values = set(train.store_and_fwd_flag.unique()) | set(test.store_and_fwd_flag.unique())
print('The store_and_fwd_flag has only two values {}.'.format(str(flag_values)))

In [None]:
# Parse the pickup timestamps of both frames and derive a calendar-date column.
for frame in (train, test):
    frame['pickup_datetime'] = pd.to_datetime(frame['pickup_datetime'])
    frame.loc[:, 'pickup_date'] = frame['pickup_datetime'].dt.date

# Only the training frame's drop-off timestamp is parsed here.
train['dropoff_datetime'] = pd.to_datetime(train['dropoff_datetime'])

In [None]:
# Are train and test sets from the same time period?

# Number of trips per pickup date, one curve for each frame.
train_trips_per_day = train.groupby("pickup_date")[["vendor_id"]].count()
test_trips_per_day = test.groupby("pickup_date")[["vendor_id"]].count()

pyplot.plot(train_trips_per_day)
pyplot.plot(test_trips_per_day)

In [None]:
# Are train and test sets from the same geographic area?

N = 10000  # number of leading rows of each frame to draw
city_long_border = (-74.03, -73.75)
city_lat_border = (40.63, 40.85)

fig, ax = pyplot.subplots(ncols=2, sharex=True, sharey=True)

# One panel per frame: (axes, data, marker colour, legend label).
panels = (
    (ax[0], train, 'blue', 'train'),
    (ax[1], test, 'green', 'test'),
)
for panel, frame, colour, tag in panels:
    panel.scatter(frame['pickup_longitude'].values[:N],
                  frame['pickup_latitude'].values[:N],
                  color=colour, s=1, label=tag, alpha=0.1)
    panel.legend(loc=0)
    panel.set_xlabel('longitude')

ax[0].set_ylabel('latitude')
fig.suptitle('Train and test area complete overlap.')

# The axes are shared, so restricting the view once applies to both panels.
pyplot.ylim(city_lat_border)
pyplot.xlim(city_long_border)

In [None]:
EARTH_RADIUS=6378.137  ## km
def haversine(xy1, xy2):
    """Great-circle distance in kilometres between paired points.

    Parameters
    ----------
    xy1, xy2 : array-like of shape (n, 2)
        Coordinates in **radians**, ordered ``[longitude, latitude]`` on the
        last axis (x = longitude, y = latitude), which is the order the
        callers in this notebook pass.

    Returns
    -------
    numpy.ndarray of shape (n,)
        Distance between ``xy1[i]`` and ``xy2[i]``, in the unit of
        ``EARTH_RADIUS`` (km).
    """
    xy1 = np.asarray(xy1)
    xy2 = np.asarray(xy2)
    lon1, lat1 = xy1[..., 0], xy1[..., 1]
    lon2, lat2 = xy2[..., 0], xy2[..., 1]

    # Haversine of the central angle:
    #   sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
    # The previous version subtracted xy2 from itself in the second term,
    # omitted its square, and read the longitude column as latitude.
    central = (
        np.sin((lat2 - lat1) / 2) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    )
    # Rounding can push the value marginally outside [0, 1], where sqrt or
    # arcsin would return NaN.
    central = np.clip(central, 0.0, 1.0)
    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(central))
# Pickup-to-dropoff distance for every trip in both frames.  Coordinates are
# converted from degrees to radians and handed over as [longitude, latitude].
pickup_columns = ["pickup_longitude", "pickup_latitude"]
dropoff_columns = ["dropoff_longitude", "dropoff_latitude"]

for frame in (train, test):
    pickup_rad = np.radians(frame[pickup_columns].values)
    dropoff_rad = np.radians(frame[dropoff_columns].values)
    frame["distance"] = haversine(pickup_rad, dropoff_rad)

# Log scale for the histogram; the small offset keeps zero distances finite.
log_distance = np.log(train["distance"]+1e-5)
pyplot.hist(log_distance, bins=50)

In [None]:
# Calendar and clock features derived from the pickup timestamp.  The same
# columns are added to both frames; `pickup_dt` counts seconds since the
# earliest pickup in the *training* frame, for train and test alike.
first_train_pickup = train['pickup_datetime'].min()

for frame in (train, test):
    pickup = frame['pickup_datetime']
    frame.loc[:, 'pickup_weekday'] = pickup.dt.weekday
    frame.loc[:, 'pickup_weekofyear'] = pickup.dt.isocalendar().week.astype("int32")
    frame.loc[:, 'pickup_hour'] = pickup.dt.hour
    frame.loc[:, 'pickup_minute'] = pickup.dt.minute
    frame.loc[:, 'pickup_dt'] = (pickup - first_train_pickup).dt.total_seconds()
    # Hour of the week: weekday * 24 + hour of day.
    frame.loc[:, 'pickup_week_hour'] = frame['pickup_weekday'] * 24 + frame['pickup_hour']

In [None]:
# Observed speed of every training trip in metres per second: `distance` is in
# kilometres (hence the factor 1000) and `trip_duration` is in seconds.
train.loc[:, 'avg_speed'] = 1000 * train['distance'] / train['trip_duration']

fig, ax = pyplot.subplots(ncols=3, sharey=True)
# Select the column *before* aggregating.  Averaging the whole frame first
# also tries to average the object columns (e.g. `store_and_fwd_flag`,
# `pickup_date`), which raises TypeError on pandas >= 2.0 and does needless
# work on older versions.
ax[0].plot(train.groupby('pickup_hour')['avg_speed'].mean(), 'b', lw=2, alpha=0.7)
ax[1].plot(train.groupby('pickup_weekday')['avg_speed'].mean(), 'g', lw=2, alpha=0.7)
ax[2].plot(train.groupby('pickup_week_hour')['avg_speed'].mean(), 'r', lw=2, alpha=0.7)
ax[0].set_xlabel('hour')
ax[1].set_xlabel('weekday')
ax[2].set_xlabel('weekhour')
ax[0].set_ylabel('average speed')
fig.suptitle('Rush hour average traffic speed')

There are some clear outliers in the dataset.
Let's get rid of taxicab trips that last longer than 20 hours or less than 1 minute, as well as trips that cover less than 10 metres.

In [None]:
print(f"Shape before dropping outliers: {train.shape}")

# A trip is discarded when any one of these holds.
lasts_too_long = train["trip_duration"] > 20*60*60   # longer than 20h
lasts_too_short = train["trip_duration"] < 60        # shorter than 60s
barely_moves = train["distance"] < 0.01              # shorter than 10m

outlier_ids = train.index[lasts_too_long | lasts_too_short | barely_moves]
train.drop(index=outlier_ids, inplace=True)

print(f"Shape after dropping outliers: {train.shape}")

In [None]:
# Summary statistics of the training frame as it stands after the outlier filter.
train.describe()

In [None]:
# Column dtypes of the training frame, including the engineered columns.
train.dtypes


In [None]:
%%time
# Train a model to predict `avg_speed` given `pickup_week_hour` and `pickup_minute`
# ...or maybe given just the timestamp? worth trying too...

from sklearn import model_selection, neighbors

# `avg_speed` is used downstream as a model feature for both frames.  In
# `train` it has so far held each trip's *own* observed speed, i.e.
# distance / trip_duration -- a direct function of the target -- whereas in
# `test` it can only ever be an estimate.  Leaving the observed value in place
# leaks the target into the training features and makes the column mean
# something different in the two frames.
speed_features = ["pickup_week_hour", "pickup_minute"]
observed_speed = 1000 * train["distance"] / train["trip_duration"]  # m/s

avg_speed = neighbors.KNeighborsRegressor(2, n_jobs=-1)

# Out-of-fold estimates for the training rows: each row's value comes from a
# model that never saw that row's own duration.
train_speed_estimate = model_selection.cross_val_predict(
    avg_speed, train[speed_features], observed_speed, cv=5)

# The estimator used for the test frame is fitted on all observed speeds.
avg_speed.fit(train[speed_features], observed_speed)
test["avg_speed"] = avg_speed.predict(test[speed_features])

# Replace the observed speed last, so both frames now carry the same kind of
# feature.  `observed_speed` is recomputed above, so re-running this cell is safe.
train["avg_speed"] = train_speed_estimate

In [None]:
# Summary statistics of the test frame, including the `avg_speed` column filled in by the previous cell.
test.describe()

# Done preprocessing

By now `train` and `test` both have the same set of features (except for a few that don't occur in `test`). 
It's time to build the model pipeline, pick features to include, and train the model.

In [None]:
# Columns that must never be fed to the model.  Names that do not exist in
# `train` are simply ignored by the filter below.
do_not_use_for_training = [
    'id', 'pickup_datetime', 'dropoff_datetime',
    'trip_duration', 'check_trip_duration',
    'pickup_date', 'vendor_id',
    'pickup_lat_bin', 'pickup_long_bin',
    'center_lat_bin', 'center_long_bin',
    'pickup_dt_bin', 'pickup_datetime_group',
    'store_and_fwd_flag',
]

# Keep the remaining training columns, in their original order.
feature_names = []
for column in train.columns:
    if column not in do_not_use_for_training:
        feature_names.append(column)

X = train[feature_names]
y = train["trip_duration"]
Xtest = test[feature_names]

print(f"Shape of training data: X {X.shape} y {y.shape}")
print(f"Shape of test features: X {Xtest.shape}")

In [None]:
from sklearn import model_selection
# Hold out 20% of the rows for validation.  A fixed seed makes the split (and
# anything evaluated on it) identical after every Restart & Run All.
# NOTE(review): none of the later cells visible here use these four frames -- confirm
# whether the hold-out set is still needed.
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn import linear_model, model_selection, metrics, pipeline
from sklearn import preprocessing, svm, compose, feature_selection, kernel_approximation

def _root_mean_squared_log_error(y_true, y_pred):
    """Return the square root of the mean squared logarithmic error."""
    return np.sqrt(metrics.mean_squared_log_error(y_true, y_pred))


# Lower is better for this metric, which the scorer is told explicitly.
rmsle = metrics.make_scorer(_root_mean_squared_log_error, greater_is_better=False)

In [None]:
# One search grid per approximated kernel; the grids differ only in the kernel name.
params = [
    {
        "regressor__features__k": range(3, X.shape[1]),
        "regressor__kernelapprox__kernel": [kernel_name],
        "regressor__kernelapprox__gamma": np.logspace(-2, 2, 3),
    }
    for kernel_name in ("rbf", "laplacian", "chi2", "sigmoid")
]

# Feature selection -> scaling -> Nystroem kernel approximation -> linear SVR.
kernel_svr = pipeline.Pipeline([
    ("features", feature_selection.SelectKBest(score_func=feature_selection.f_regression)),
    ("scaler", preprocessing.StandardScaler()),
    ("kernelapprox", kernel_approximation.Nystroem()),
    ("svm", svm.LinearSVR()),
])

# The regressor is fitted on log(y); predictions are mapped back with exp.
log_target_regressor = compose.TransformedTargetRegressor(
    regressor=kernel_svr, func=np.log, inverse_func=np.exp)

model = model_selection.RandomizedSearchCV(
    log_target_regressor, params,
    scoring=rmsle, cv=5, n_jobs=-1, n_iter=20, verbose=2)
model.get_params()

In [None]:
from sklearn import ensemble, tree, decomposition
import xgboost
    
# Search space for the gradient-boosted trees; keys are prefixed with the
# pipeline step name `xgb`.
# NOTE(review): "gpu_hist" and "gradient_based" sampling presumably need a GPU
# runtime, and newer xgboost releases may spell the GPU setting differently --
# confirm against the installed version.
params = {
    "xgb__objective": ["reg:squaredlogerror"],
    "xgb__eval_metric": ["rmsle"],
    "xgb__tree_method": ["gpu_hist"],
    "xgb__grow_policy": ["depthwise", "lossguide"],
    "xgb__min_split_loss": [0, 0.1, 0.5, 1, 5],
    "xgb__max_depth": [2,4,6,8,10],
    "xgb__min_child_weight": [5,10,50,100],
    "xgb__subsample": [1.0, 0.75, 0.5, 0.25, 0.1],
    "xgb__sampling_method": ["uniform", "gradient_based"],
    "xgb__n_estimators": [50,100,200]
}

# NOTE(review): TimeSeriesSplit folds rows by position, so it only acts as a
# forward-in-time validation if X is ordered by pickup time -- no visible cell
# sorts it; confirm.
model = model_selection.RandomizedSearchCV(
    pipeline.Pipeline([
        ("scaler", preprocessing.StandardScaler()),
        ("xgb", xgboost.XGBRegressor())]),
    params, scoring=rmsle, cv=model_selection.TimeSeriesSplit(5), n_jobs=-1, verbose=1, n_iter=20,
    # Fixed seed: without it a different set of 20 candidates is drawn on every
    # run, so the reported best score and parameters cannot be reproduced.
    random_state=42
)
model.get_params()

In [None]:
%%time
# Run the randomized hyper-parameter search on the full training features and
# target; the cross-validation folds come from the search object's own `cv` setting.
model.fit(X, y)

In [None]:
# The `rmsle` scorer was created with greater_is_better=False, so the score
# reported here is the sign-flipped RMSLE (negative; closer to zero is better).
print(f"Model score: {model.best_score_}")
print(f"Model chosen parameters: {model.best_params_}")

In [None]:
%%time
# Build the submission frame first: the index of `test` (the trip id) plus the
# predicted duration.  `to_csv` returns None when given a path, so chaining it
# onto the assignment left `submission` bound to None.
submission = test[[]].assign(trip_duration=model.best_estimator_.predict(Xtest))
submission.to_csv("submission.csv")