# New York City Taxi Trip Duration

Kaggle competition link: https://www.kaggle.com/c/nyc-taxi-trip-duration

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os

### Load train and test data

In [None]:
# Read the training and test CSV files from the current working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

#### Haversine formula to compute the great-circle distance between two points on a sphere given their longitudes and latitudes

In [None]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Scalars and array-likes (lists, NumPy arrays, pandas Series) are both
    accepted; array-like arguments are broadcast against each other, so a
    whole column of coordinates can be processed in a single call.

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point, in degrees.
    lon2, lat2 : longitude and latitude of the second point, in degrees.

    Returns
    -------
    The distance in meters, with the broadcast shape of the inputs.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points; clamp it so arcsin stays inside its domain.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    r = 3956 * 1609.34  # Radius of earth in meters (3956 miles * 1609.34 m per mile).
    return c * r

#### Class definition for training data transformation and fit and transform methods

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TrainDataProcessor(BaseEstimator, TransformerMixin):
    """Optional outlier filter for the labelled training frame.

    With ``filtering`` enabled, ``transform`` drops every row whose
    ``trip_duration`` is below 30 or above 10000, or whose ratio of
    haversine distance to ``trip_duration`` is above 50 or below 0.5.
    With ``filtering`` disabled the input is returned unchanged.
    """

    def __init__(self, filtering=True):
        self.filtering = filtering

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer API."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the rows flagged as implausible trips."""
        if not self.filtering:
            return X

        def trip_length(row):
            # Straight-line pickup-to-dropoff distance for a single row.
            return haversine(row['pickup_longitude'], row['pickup_latitude'],
                             row['dropoff_longitude'], row['dropoff_latitude'])

        travelled = X.apply(trip_length, axis=1)
        # Distance per unit of trip_duration (presumably m/s -- confirm the
        # duration column is in seconds).
        mean_speed = travelled / (X['trip_duration'])

        too_short = X['trip_duration'] < 30
        too_long = X['trip_duration'] > 10000
        too_fast = mean_speed > 50
        too_slow = mean_speed < 0.5
        rejected = too_short | too_long | too_fast | too_slow
        return X.drop(X[rejected].index)

#### The training data is split into training and validation sets so that an estimate of the accuracy on the test data can be obtained

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

# Hold out 30% of the labelled rows for validation; the fixed seed makes the
# split reproducible between runs.
split_options = {'test_size': 0.3, 'random_state': 42}
train_data, val_data = train_test_split(train, **split_options)

#### Filtering and transformation: predictors and response are separated

In [None]:
filtering = True

# Only the training portion goes through the outlier filter; the validation
# rows are left exactly as they were split off.
train_data_filt = TrainDataProcessor(filtering).fit_transform(train_data)

# Separate the response column from the predictors for both portions.
target = 'trip_duration'
X_train, y_train = train_data_filt.drop(columns=[target]), train_data_filt[target]
X_val, y_val = val_data.drop(columns=[target]), val_data[target]

#### Class definition for adding features and fit and transform methods

In [None]:
from sklearn.preprocessing import StandardScaler
# Columns to add: pickup_hour, pickup_dayofweek, pickup_month, distance

class FeaturesAdder(BaseEstimator, TransformerMixin):
    """Add the engineered columns used by the rest of the pipeline.

    ``transform`` returns a copy of the input with four extra columns:
    ``pickup_hour``, ``pickup_dayofweek`` and ``pickup_month`` (taken from
    ``pickup_datetime``) and ``distance`` (haversine distance between the
    pickup and dropoff coordinates). The frame passed in is not modified.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the engineered columns appended."""
        # Work on a copy so the caller's DataFrame does not silently gain
        # columns every time the pipeline is run on it.
        X = X.copy()

        # Parse the timestamp column once and reuse it for every date part.
        pickup = pd.to_datetime(X['pickup_datetime'])
        X['pickup_hour'] = pickup.dt.hour
        X['pickup_dayofweek'] = pickup.dt.dayofweek
        X['pickup_month'] = pickup.dt.month

        X['distance'] = X.apply(
            lambda row: haversine(row['pickup_longitude'], row['pickup_latitude'],
                                  row['dropoff_longitude'], row['dropoff_latitude']),
            axis=1)
        return X

#### Class definition for attributes/predictors selection and fit and transform methods

In [None]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the configured DataFrame columns."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer API."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by ``attribute_names``."""
        wanted = self.attribute_names
        return X[wanted]

#### Class definition for category encoding and fit and transform methods

In [None]:
class CatEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode ``cols_to_encode`` with a column layout fixed at fit time.

    ``pd.get_dummies`` derives its output columns from whatever categories
    occur in the frame it is given, so encoding the training, validation
    and test frames independently can produce different column sets.
    ``fit`` therefore records the columns produced for the fitting data and
    ``transform`` aligns every later frame to that layout: categories that
    are absent become all-zero columns and categories never seen during
    ``fit`` are dropped.

    Calling ``transform`` on an instance that has not been fitted returns
    the plain ``pd.get_dummies`` result.
    """

    def __init__(self, cols_to_encode):
        self.cols_to_encode = cols_to_encode

    def fit(self, X, y=None):
        """Record the dummy-column layout (names and dtypes) of ``X``."""
        reference = pd.get_dummies(X, columns=self.cols_to_encode)
        self.columns_ = reference.columns
        self.dtypes_ = reference.dtypes
        return self

    def transform(self, X, y=None):
        """Return ``X`` one-hot encoded and aligned to the fitted columns."""
        encoded = pd.get_dummies(X, columns=self.cols_to_encode)

        fitted_columns = getattr(self, 'columns_', None)
        if fitted_columns is None:
            # Not fitted: fall back to the unaligned encoding.
            return encoded

        absent = [col for col in fitted_columns if col not in encoded.columns]
        aligned = encoded.reindex(columns=fitted_columns, fill_value=0)
        if absent:
            # The zero-filled columns get the dtype the same columns had at
            # fit time, so the frame's dtypes match the fitted layout.
            aligned = aligned.astype({col: self.dtypes_[col] for col in absent})
        return aligned

#### Full pipeline construction for training and validation data transformation. The pipeline processes numerical and categorical features separately and then joins them together. The following operations are performed:
- feature engineering
- attribute selection
- scaling
- category encoding
- features union

#### Training and validation data are ready for ML models after this code cell

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion

# Columns routed to the one-hot branch and to the scaling branch.
cols_to_encode = ['vendor_id','store_and_fwd_flag','pickup_dayofweek', 'pickup_month']
cols_to_scale = ['pickup_hour','distance','passenger_count']

# Numeric branch: pick the numeric columns, then standardise them.
numeric_branch = Pipeline(steps=[
    ('selector', DataFrameSelector(cols_to_scale)),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: pick the categorical columns, then one-hot encode them.
categorical_branch = Pipeline(steps=[
    ('selector', DataFrameSelector(cols_to_encode)),
    ('one_hot', CatEncoder(cols_to_encode)),
])

# Feature engineering first, then both branches side by side.
full_pipeline = Pipeline(steps=[
    ('features_adder', FeaturesAdder()),
    ('num_cat_FU', FeatureUnion(transformer_list=[
        ('num_pipe', numeric_branch),
        ('cat_pipe', categorical_branch),
    ])),
])

# Fit on the training predictors only; validation data is just transformed.
train_prepared = full_pipeline.fit_transform(X_train)

val_prepared = full_pipeline.transform(X_val)

#### LightGBM is fitted on the training data and the Root Mean Squared Logarithmic Error (RMSLE) is computed on the validation data, since RMSLE is the evaluation metric for this dataset, as stated on the Kaggle page

In [None]:
import lightgbm as lgb

# LightGBM regressor with 200 boosting rounds, fitted on the prepared
# training matrix.
lgb_reg = lgb.LGBMRegressor(n_estimators=200)
lgb_reg.fit(train_prepared, y_train)
# mean_squared_log_error raises on negative values, and a regressor fitted on
# the raw durations can predict below zero, so floor the predictions at 0.
y_pred = np.clip(lgb_reg.predict(val_prepared), 0, None)
val_error = np.sqrt(mean_squared_log_error(y_val, y_pred))
print('Validation RMSLE with lgbm:', val_error)

#### XGBoost and Gradient boosting can also be applied

In [None]:
import xgboost

# XGBoost regressor (depth-4 trees, 250 rounds), fitted on the prepared
# training matrix.
xgb_reg = xgboost.XGBRegressor(max_depth=4, n_estimators=250)
xgb_reg.fit(train_prepared, y_train)
# mean_squared_log_error raises on negative values, and a regressor fitted on
# the raw durations can predict below zero, so floor the predictions at 0.
y_pred = np.clip(xgb_reg.predict(val_prepared), 0, None)
val_error = np.sqrt(mean_squared_log_error(y_val, y_pred))
print('Validation RMSLE with xgboost:', val_error)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# scikit-learn gradient boosting (depth-5 trees, 300 rounds, learning rate
# 0.1), fitted on the prepared training matrix.
gbrt = GradientBoostingRegressor(max_depth=5, n_estimators=300, learning_rate=0.1)
gbrt.fit(train_prepared, y_train)
# mean_squared_log_error raises on negative values, and a regressor fitted on
# the raw durations can predict below zero, so floor the predictions at 0.
y_pred = np.clip(gbrt.predict(val_prepared), 0, None)
val_error = np.sqrt(mean_squared_log_error(y_val, y_pred))
print('Validation RMSLE with GradientBoosting:', val_error)

#### Transform the test data through full pipeline

In [None]:
# Reuse the pipeline fitted on X_train above; the test frame is only
# transformed, never fitted on.
test_prepared = full_pipeline.transform(test)

#### Generate the predictions and pair them with the test ids before writing them to a _csv_ file

In [None]:
# Predict with the XGBoost model and round to whole numbers. Predictions are
# floored at 0 first: a negative duration is meaningless and cannot be scored
# with a logarithmic error metric such as RMSLE.
y_test = np.around(np.clip(xgb_reg.predict(test_prepared), 0, None))

In [None]:
# Start from an empty frame; the submission columns are added to it below.
result = pd.DataFrame()

In [None]:
# Ids come straight from the test set, in its row order.
result['id'] = test['id']
# y_test is a plain array in that same row order, so assign it positionally.
# Wrapping it in a DataFrame would give it a fresh 0..n-1 index and make the
# assignment label-aligned, yielding NaN whenever `test` is indexed differently.
result['trip_duration'] = y_test

In [None]:
# Write the id / trip_duration pairs without the DataFrame index column.
result.to_csv("submission.csv", index=False)