In [1]:
import pandas as pd
import numpy as np

In [2]:
selected_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')
selected_cols

['fare_amount',
 'pickup_datetime',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count']

In [3]:
dtypes = {
    'fare_amount': 'float32',
    'pickup_datetime': 'float32',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    ',dropoff_latitude': 'float32',
    'passenger_count': 'float32'
}

In [4]:
import random
random.random()

0.29082673152742955

In [5]:
sample_fraction = 0.0005
def skip_row(index):
    if(index == 0):
        return False
    return random.random() > sample_fraction

random.seed(32)

In [6]:
df = pd.read_csv('train.csv',usecols=selected_cols,parse_dates=['pickup_datetime'],dtype=dtypes,skiprows=skip_row)

In [7]:
df.sample(5)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
8294,9.7,2009-12-12 17:54:16+00:00,-73.97007,40.762989,-73.980782,40.780988,1.0
79,11.0,2015-05-11 13:27:59+00:00,-73.981567,40.758839,-73.968849,40.769695,1.0
24344,13.0,2015-03-19 23:06:36+00:00,-73.985138,40.74485,-73.95546,40.786999,1.0
19564,43.299999,2011-01-30 20:37:00+00:00,-73.776756,40.645172,-73.973755,40.68754,1.0
366,10.5,2011-03-17 16:06:12+00:00,-73.989861,40.749725,-73.956696,40.764743,1.0


In [8]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(df,test_size=0.2,random_state = 2)

In [9]:
train = train.dropna()
test = test.dropna()

In [10]:
train['year'] = train['pickup_datetime'].apply(lambda t:t.year)
train['weekdays'] = train['pickup_datetime'].apply(lambda t:t.weekday())
train['hours'] = train['pickup_datetime'].apply(lambda t:t.hour)
train['month'] = train['pickup_datetime'].apply(lambda t:t.month)
train['day'] = train['pickup_datetime'].apply(lambda t:t.day)

In [11]:
# do for the test part

In [12]:
final_test = pd.read_csv('test.csv',parse_dates=['pickup_datetime'])

In [13]:
final_test['year'] = final_test['pickup_datetime'].apply(lambda t:t.year)
final_test['weekdays'] = final_test['pickup_datetime'].apply(lambda t:t.weekday())
final_test['hours'] = final_test['pickup_datetime'].apply(lambda t:t.hour)
final_test['month'] = final_test['pickup_datetime'].apply(lambda t:t.month)
final_test['day'] = final_test['pickup_datetime'].apply(lambda t:t.day)

In [14]:
final_test.sample(5)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,weekdays,hours,month,day
4241,2013-06-04 08:06:57.0000002,2013-06-04 08:06:57+00:00,-74.001433,40.727878,-73.993427,40.741955,1,2013,1,8,6,4
758,2011-06-01 07:37:00.000000163,2011-06-01 07:37:00+00:00,-73.945773,40.778205,-73.968272,40.785965,1,2011,2,7,6,1
9093,2009-06-10 16:55:00.00000011,2009-06-10 16:55:00+00:00,-73.964848,40.806907,-73.981762,40.766703,5,2009,2,16,6,10
4794,2009-04-26 17:29:42.0000005,2009-04-26 17:29:42+00:00,-73.972613,40.744824,-74.0027,40.72356,1,2009,6,17,4,26
8668,2013-09-25 22:00:00.00000066,2013-09-25 22:00:00+00:00,-73.973758,40.763487,-73.977173,40.755297,3,2013,2,22,9,25


In [15]:
test['year'] = test['pickup_datetime'].apply(lambda t:t.year)
test['weekdays'] = test['pickup_datetime'].apply(lambda t:t.weekday())
test['hours'] = test['pickup_datetime'].apply(lambda t:t.hour)
test['month'] = test['pickup_datetime'].apply(lambda t:t.month)
test['day'] = test['pickup_datetime'].apply(lambda t:t.day)

In [16]:
train.shape,test.shape

((22074, 12), (5519, 12))

In [17]:
# distance between pickup and dropoff

In [18]:
def haversine(lon1, lat1, lon2, lat2):

    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

In [19]:
def add_dis(df):
    df['distance_travelled'] = haversine(
        df['pickup_longitude'],
        df['pickup_latitude'],
        df['dropoff_longitude'],
        df['dropoff_latitude']
    )

In [20]:
add_dis(train)

In [21]:
train.sample(5)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,weekdays,hours,month,day,distance_travelled
4495,13.3,2012-07-14 06:00:00+00:00,-73.948936,40.770828,-74.003609,40.753442,1.0,2012,5,6,7,14,4.994184
23738,14.9,2012-04-13 01:57:00+00:00,-73.99218,40.73455,-73.922897,40.741068,2.0,2012,4,1,4,13,5.881722
16111,16.9,2011-10-21 23:17:06+00:00,-74.012108,40.707352,-73.988235,40.774679,1.0,2011,4,23,10,21,7.752034
33,3.7,2011-12-13 23:36:23+00:00,-73.965393,40.766735,-73.968849,40.770001,2.0,2011,1,23,12,13,0.465367
1145,12.0,2013-12-14 12:41:13+00:00,0.0,0.0,0.0,0.0,1.0,2013,5,12,12,14,0.0


In [22]:
add_dis(test)

In [23]:
add_dis(final_test)

In [24]:
# add distance from popular landmarks

In [25]:
jfk_lonlat = -73.7781, 40.6413
lga_lonlat = -73.8740, 40.7769
ewr_lonlat = -74.1745, 40.6895
met_lonlat = -73.9632, 40.7794
wtc_lonlat = -74.0099, 40.7126

In [26]:
def add_landmark_dropoff_distance(df, landmark_name, landmark_lonlat):
    lon, lat = landmark_lonlat
    df[landmark_name + '_drop_distance'] = haversine(lon, lat, df['dropoff_longitude'], df['dropoff_latitude'])

In [27]:
for a_df in [train, test, final_test]:
    for name, lonlat in [('jfk', jfk_lonlat), ('lga', lga_lonlat), ('ewr', ewr_lonlat), ('met', met_lonlat), ('wtc', wtc_lonlat)]:
        add_landmark_dropoff_distance(a_df, name, lonlat)

In [28]:
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,weekdays,hours,month,day,distance_travelled,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance
17184,9.7,2010-03-13 22:29:49+00:00,-73.979774,40.77533,-73.947304,40.781034,2.0,2010,5,22,3,13,2.806749,21.090629,6.189137,21.680523,1.351095,9.258156
26083,36.830002,2013-03-27 14:35:00+00:00,-73.974632,40.761883,-73.871071,40.774187,1.0,2013,2,14,3,27,8.82787,16.725884,0.389722,27.246236,7.779491,13.553254
27064,6.5,2011-08-25 09:02:00+00:00,-73.979439,40.77774,-73.959785,40.776767,1.0,2011,3,9,8,25,1.658149,21.481035,7.223585,20.529575,0.410374,8.29047
23366,7.0,2014-01-01 15:39:00+00:00,-73.972725,40.780846,-73.970612,40.767597,1.0,2014,2,15,1,1,1.483963,21.460984,8.201205,19.250506,1.453241,6.953683
16420,14.0,2013-06-29 18:50:00+00:00,-73.776871,40.645412,-73.780014,40.666477,5.0,2013,5,18,6,29,2.357408,2.804199,14.611673,33.365201,19.899606,20.049995


In [29]:
# remove outliers and invalid values

In [30]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,27593.0,27593.0,27593.0,27593.0,27593.0,27593.0
mean,11.355337,-72.613846,39.876396,-72.594383,39.880884,1.68637
std,10.087166,17.243196,9.600838,16.839939,9.591884,1.307774
min,-52.0,-2256.421143,-1185.391235,-2256.421143,-1185.391185,0.0
25%,6.0,-73.992081,40.73513,-73.991455,40.734178,1.0
50%,8.5,-73.981819,40.752712,-73.980049,40.753201,1.0
75%,12.5,-73.966988,40.767166,-73.96357,40.768167,2.0
max,340.0,40.840694,42.359001,40.802437,42.3323,6.0


In [31]:
def remove_outliers(df):
    return df[(df['fare_amount'] >= 1.) & 
              (df['fare_amount'] <= 500.) &
              (df['pickup_longitude'] >= -75) & 
              (df['pickup_longitude'] <= -72) & 
              (df['dropoff_longitude'] >= -75) & 
              (df['dropoff_longitude'] <= -72) & 
              (df['pickup_latitude'] >= 40) & 
              (df['pickup_latitude'] <= 42) & 
              (df['dropoff_latitude'] >=40) & 
              (df['dropoff_latitude'] <= 42) & 
              (df['passenger_count'] >= 1) & 
              (df['passenger_count'] <= 6)]

In [32]:
train = remove_outliers(train)

In [33]:
test = remove_outliers(test)

In [34]:
train.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year',
       'weekdays', 'hours', 'month', 'day', 'distance_travelled',
       'jfk_drop_distance', 'lga_drop_distance', 'ewr_drop_distance',
       'met_drop_distance', 'wtc_drop_distance'],
      dtype='object')

In [35]:
input_col = [ 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year',
       'weekdays', 'hours', 'month', 'day', 'distance_travelled',
       'jfk_drop_distance', 'lga_drop_distance', 'ewr_drop_distance',
       'met_drop_distance', 'wtc_drop_distance']

In [36]:
target_col = 'fare_amount'

In [37]:
x_train = train[input_col]
x_test = train[target_col]

In [58]:
test_input = final_test[input_col]

In [38]:
y_train = test[input_col]
y_test = test[target_col]

In [39]:
from sklearn.metrics import root_mean_squared_error

In [40]:
def evaluate(model):
    train_preds = model.predict(x_train)
    train_rmse =  root_mean_squared_error(x_test, train_preds)
    test_preds = model.predict(y_train)
    test_rmse = root_mean_squared_error(y_test, test_preds)
    return train_rmse, test_rmse, train_preds, test_preds

In [41]:
from sklearn.linear_model import Ridge

In [42]:
model1 = Ridge(random_state=42)

In [43]:
model1.fit(x_train, x_test)

In [44]:
evaluate(model1)

(5.6828103376112855,
 5.978044619249369,
 array([ 8.29453491, 26.08994161,  6.14834761, ..., 12.06355909,
        11.51812163, 12.43494441]),
 array([ 8.96778194,  7.32822472,  9.58695065, ..., 10.24801579,
        51.19953668,  6.32890654]))

In [45]:
from sklearn.ensemble import RandomForestRegressor

In [46]:
model2 = RandomForestRegressor(max_depth=10, n_jobs=-1, random_state=42, n_estimators=50)

In [47]:
model2.fit(x_train, x_test)

In [48]:
evaluate(model2)

(3.3508082960619614,
 4.98981051435932,
 array([ 8.80010599, 33.73497007,  6.80290577, ..., 10.95426955,
        12.30995995, 12.77772098]),
 array([11.11403405,  5.01046114, 10.54648179, ..., 11.22765081,
        48.65939316,  4.99927509]))

In [68]:
import os
current_directory = os.getcwd()
data_dir = current_directory

In [49]:
import xgboost

In [50]:
from xgboost import XGBRegressor

In [51]:
model3 = XGBRegressor(random_state=42, n_jobs=-1, objective='reg:squarederror')

In [52]:
model3.fit(x_train, x_test)

In [53]:
evaluate(model3)

(1.7977133,
 4.6625257,
 array([ 8.760898, 34.312187,  7.560793, ...,  9.432157, 10.798134,
        14.030467], dtype=float32),
 array([ 9.732045 ,  5.1069846,  9.23608  , ...,  9.839843 , 48.372684 ,
         6.0207357], dtype=float32))

In [66]:
def predict_and_submit(model, fname):
    test_preds = model.predict(test_input)
    sub_df = pd.read_csv(data_dir+'/sample_submission.csv')
    sub_df['fare_amount'] = test_preds
    sub_df.to_csv(fname, index=None)
    return sub_df

In [67]:
predict_and_submit(model1,"ridge_sub.csv")

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.008460
1,2015-01-27 13:08:24.0000003,11.115557
2,2011-10-08 11:53:44.0000002,5.509371
3,2012-12-01 21:12:12.0000002,8.799904
4,2012-12-01 21:12:12.0000003,14.738205
...,...,...
9909,2015-05-10 12:37:51.0000002,9.286850
9910,2015-01-12 17:05:51.0000001,11.507479
9911,2015-04-19 20:44:15.0000001,47.570126
9912,2015-01-31 01:05:19.0000005,22.647526


In [69]:
predict_and_submit(model2,"Random_forest.csv")

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.032569
1,2015-01-27 13:08:24.0000003,10.405475
2,2011-10-08 11:53:44.0000002,4.923554
3,2012-12-01 21:12:12.0000002,8.070972
4,2012-12-01 21:12:12.0000003,15.218567
...,...,...
9909,2015-05-10 12:37:51.0000002,9.477440
9910,2015-01-12 17:05:51.0000001,12.307505
9911,2015-04-19 20:44:15.0000001,52.874224
9912,2015-01-31 01:05:19.0000005,20.597853


In [70]:
predict_and_submit(model3,"xgBoost.csv")

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,11.360831
1,2015-01-27 13:08:24.0000003,9.791626
2,2011-10-08 11:53:44.0000002,4.591375
3,2012-12-01 21:12:12.0000002,7.901974
4,2012-12-01 21:12:12.0000003,16.719931
...,...,...
9909,2015-05-10 12:37:51.0000002,9.103668
9910,2015-01-12 17:05:51.0000001,13.166449
9911,2015-04-19 20:44:15.0000001,56.141384
9912,2015-01-31 01:05:19.0000005,20.754114
