In [1]:
import datetime

import numpy as np
import pandas as pd


TRAIN = 'train'
TEST = 'test'

DATA_FILES = {
    TRAIN: 'train.csv',
    TEST: 'test.csv'
}

In [2]:
def get_datetime_features(datetime_string):
    dt_object = datetime.datetime.strptime(datetime_string, '%Y-%m-%d %H:%M:%S')
    return dt_object.weekday(), dt_object.month, dt_object.day, dt_object.hour, dt_object.minute


def get_dataframe(mode=TRAIN):
    df = pd.read_csv(DATA_FILES[mode])
    if mode == TRAIN:
        df.drop(['dropoff_datetime', ], inplace=True, axis=1)
        
    (
        df['pickup_weekday'], df['pickup_month'], df['pickup_day'], 
        df['pickup_hour'], df['pickup_minute']
    ) = zip(*df['pickup_datetime'].map(get_datetime_features))
    
    df.drop(['pickup_datetime',  'id', ], inplace=True, axis=1)
    
    df['store_and_fwd_flag'] = df['store_and_fwd_flag'].map({'N': 0, 'Y': 1})
    return df


def transform_dataframe(df, mode=TRAIN):
    X_columns = list(df.columns)
    if mode == TRAIN:
        X_columns.remove('trip_duration')
        return df[X_columns].as_matrix(), df['trip_duration'].as_matrix()
    else:
        return df[X_columns].as_matrix()

In [3]:
df = get_dataframe(TRAIN)
X, y = transform_dataframe(df, TRAIN)

In [4]:
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_weekday,pickup_month,pickup_day,pickup_hour,pickup_minute
0,2,1,-73.982155,40.767937,-73.96463,40.765602,0,455,0,3,14,17,24
1,1,1,-73.980415,40.738564,-73.999481,40.731152,0,663,6,6,12,0,43
2,2,1,-73.979027,40.763939,-74.005333,40.710087,0,2124,1,1,19,11,35
3,2,1,-74.01004,40.719971,-74.012268,40.706718,0,429,2,4,6,19,32
4,2,1,-73.973053,40.793209,-73.972923,40.78252,0,435,5,3,26,13,30


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

def score(preds, trues):
    return np.mean((np.log(preds + 1) - np.log(trues + 1)) ** 2) ** 0.5

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = RandomForestRegressor()
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [7]:
preds = model.predict(X_test)
trues = y_test
print(score(preds, trues))

0.573711703105
