In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from tensorflow.keras.models import Sequential, load_model, model_from_json
from tensorflow.keras.layers import Dense, Activation, BatchNormalization
from tensorflow.keras import regularizers
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, LearningRateScheduler
import tensorflow.keras.backend as K

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Data Cleaning

In [2]:
def clean(df):
    """Return the rows of ``df`` that pass all sanity filters.

    A row is kept when:
      * pickup and dropoff longitudes are within [-76, -72],
      * pickup and dropoff latitudes are within [38, 42],
      * ``fare_amount`` is within (0, 250],
      * dropoff longitude differs from pickup longitude, and
      * dropoff latitude differs from pickup latitude.

    The input frame is not modified; a filtered frame is returned.
    """
    # Bounding box around the New York area (bounds are inclusive).
    keep = df['pickup_longitude'].between(-76, -72)
    keep &= df['dropoff_longitude'].between(-76, -72)
    keep &= df['pickup_latitude'].between(38, 42)
    keep &= df['dropoff_latitude'].between(38, 42)

    # Fare outliers: strictly positive, capped at 250.
    keep &= (df['fare_amount'] > 0) & (df['fare_amount'] <= 250)

    # Inconsistent rides: identical pickup and dropoff coordinate.
    keep &= df['dropoff_longitude'] != df['pickup_longitude']
    keep &= df['dropoff_latitude'] != df['pickup_latitude']

    return df[keep]

def late_night(row):
    """Return 1 when ``row['hour']`` is <= 6 or >= 20, otherwise 0."""
    is_late = (row['hour'] <= 6) or (row['hour'] >= 20)
    return 1 if is_late else 0


def night(row):
    """Return 1 when ``row['hour']`` is in [16, 20] and ``row['weekday']`` < 5, else 0."""
    in_window = (row['hour'] <= 20) and (row['hour'] >= 16)
    return 1 if (in_window and row['weekday'] < 5) else 0

# Feature Engineering

In [3]:
def add_time_features(df):
    """Derive calendar and time-of-day features from ``pickup_datetime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``pickup_datetime`` column of strings in the format
        ``'%Y-%m-%d %H:%M:%S %Z'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``year``, ``month``, ``day``, ``hour``, ``weekday``,
        ``night`` and ``late_night`` columns appended and ``pickup_datetime``
        removed. The input frame is left untouched, so the function can be
        called repeatedly on the same raw frame.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    out = df.copy()
    pickup = pd.to_datetime(out['pickup_datetime'], format='%Y-%m-%d %H:%M:%S %Z')

    # The .dt accessor is vectorised; no per-row Python lambda is needed.
    out['year'] = pickup.dt.year
    out['month'] = pickup.dt.month
    out['day'] = pickup.dt.day
    out['hour'] = pickup.dt.hour
    out['weekday'] = pickup.dt.weekday

    hour = out['hour']
    weekday = out['weekday']
    # Same conditions as the row-wise helpers night() and late_night(),
    # expressed as boolean masks over whole columns.
    out['night'] = ((hour >= 16) & (hour <= 20) & (weekday < 5)).astype('int64')
    out['late_night'] = ((hour <= 6) | (hour >= 20)).astype('int64')

    # The raw timestamp is not needed once the features are extracted.
    return out.drop('pickup_datetime', axis=1)

def add_distance_features(df):
    """Add coordinate-difference and landmark-distance columns to ``df``.

    Columns are added to ``df`` in place and the same frame is returned.

    Added columns
    -------------
    horizontal_dist, vertical_dist
        Pickup minus dropoff longitude / latitude.
    euclidian
        ``horizontal_dist ** 2 + vertical_dist ** 2``. Note that no square
        root is taken, so this is the squared straight-line distance.
    {jfk,ewr,lga,ny}_{pickup,dropoff}
        Straight-line distance (in coordinate degrees) from the pickup or
        dropoff point to the given landmark.

    The city-centre distances are stored under ``ny_pickup`` /
    ``ny_dropoff``. They were previously written to ``lga_pickup`` /
    ``lga_dropoff``, which silently replaced the LaGuardia distances.
    """
    df['horizontal_dist'] = df["pickup_longitude"] - df["dropoff_longitude"]
    df['vertical_dist'] = df["pickup_latitude"] - df["dropoff_latitude"]
    df['euclidian'] = df['horizontal_dist'] ** 2 + df['vertical_dist'] ** 2

    # (latitude, longitude) of each landmark.
    landmarks = (
        ('jfk', (40.639722, -73.778889)),
        ('ewr', (40.6925, -74.168611)),
        ('lga', (40.77725, -73.872611)),
        ('ny', (40.7141667, -74.0063889)),
    )

    for name, (lat, lon) in landmarks:
        for end in ('pickup', 'dropoff'):
            d_lon = df[end + '_longitude'] - lon
            d_lat = df[end + '_latitude'] - lat
            df['{}_{}'.format(name, end)] = (d_lon ** 2 + d_lat ** 2) ** 0.5

    return df

# Helpful Functions

In [4]:
def plot_loss_accuracy(history):
    """Plot training and validation curves for the loss and for the RMSE metric.

    Parameters
    ----------
    history : object exposing a ``history`` dict
        The dict must contain the per-epoch series 'loss', 'val_loss',
        'root_mean_squared_error' and 'val_root_mean_squared_error'.

    Two figures are shown, one per metric. The second figure plots the
    root-mean-squared-error series, so it is labelled RMSE (it was previously
    mislabelled MSE), and the ``val_*`` series are labelled as validation.
    """
    # (history key, figure title, y-axis label)
    panels = (
        ('loss', 'model loss', 'loss'),
        ('root_mean_squared_error', 'model RMSE', 'RMSE'),
    )
    for metric, title, ylabel in panels:
        plt.figure(figsize=(20,10))
        plt.plot(history.history[metric])
        plt.plot(history.history['val_' + metric])
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('epoch')
        plt.legend(['train', 'validation'], loc='upper right')
        plt.show()

In [5]:
def output_submission(raw_test, prediction, id_column, prediction_column, file_name):
    """Write a two-column submission CSV (id, prediction) to ``file_name``.

    Parameters
    ----------
    raw_test : pandas.DataFrame
        Frame holding the identifier column ``id_column``.
    prediction : array-like
        One prediction per row of ``raw_test``, in the same row order
        (either a 1-D sequence or an ``(n, 1)`` array).
    id_column, prediction_column : str
        Names of the id and prediction columns in the output file.
    file_name : str
        Destination path of the CSV.

    Raises
    ------
    ValueError
        If the number of ids does not match the number of predictions.
    """
    df = pd.DataFrame(prediction, columns=[prediction_column])
    # Assign ids positionally. Assigning the Series directly would align on
    # index labels and produce NaN ids whenever raw_test does not carry the
    # default 0..n-1 index (e.g. after filtering or sampling).
    df[id_column] = raw_test[id_column].values
    df[[id_column, prediction_column]].to_csv(file_name, index=False)
    print('Output complete')

In [6]:
def root_mean_squared_error(y_true, y_pred):
    """Keras-backend RMSE: square root of the mean squared difference."""
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

# Setting up Data

In [7]:
# Column name -> dtype mapping passed as ``dtype=`` when reading train.csv.
datatypes = dict(
    key='str',
    fare_amount='float32',
    pickup_datetime='str',
    pickup_longitude='float32',
    pickup_latitude='float32',
    dropoff_longitude='float32',
    dropoff_latitude='float32',
    passenger_count='uint8',
)

In [47]:
# Sampling configuration.
# DATA_SIZE is only referenced by the commented-out read of train.csv below.
# perc is the sampling fraction; it is also used by the next cell when it
# samples the cached train_clean.csv.
DATA_SIZE = 5000000
perc = 0.5


# The commented-out lines in this cell are the preprocessing pipeline that
# builds train_clean / test_clean from the raw CSVs (clean -> time features ->
# distance features -> drop raw coordinate columns). They are disabled because
# the next cell loads the cached train_clean.csv / test_clean.csv instead.
# Re-enable them (together with the to_csv lines in the next cell) to
# regenerate the cache.
# train_full = pd.read_csv('train.csv', nrows=DATA_SIZE, dtype=datatypes, usecols=[1,2,3,4,5,6,7])
# The raw test set is still loaded: its 'key' column is needed when the
# submission files are written.
test = pd.read_csv('test.csv')

# train = train_full.sample(frac=perc)

# train = clean(train)

# train_clean = add_time_features(train)
# test_clean = add_time_features(test)

# train_clean = add_distance_features(train_clean)
# test_clean = add_distance_features(test_clean)

# dropped_columns = ['pickup_longitude', 'pickup_latitude', 
#                    'dropoff_longitude', 'dropoff_latitude',
#                    'horizontal_dist', 'vertical_dist']
# train_clean = train_clean.drop(dropped_columns, axis=1)
# test_clean = test_clean.drop(dropped_columns + ['key'], axis=1)

In [48]:
# One-off cache writers, kept for reference (run once after preprocessing):
# train_clean.to_csv('train_clean.csv')
# test_clean.to_csv('test_clean.csv')

# NOTE(review): the cache was written without index=False and is read back
# without index_col, so the frames below carry 'Unnamed: 0'-style index
# columns. Nothing downstream drops them, so they are scaled and fed to the
# model as features. Removing them changes the model's input width and would
# invalidate previously saved weights - decide deliberately.
test_clean = pd.read_csv('test_clean.csv')

# Load the cached training features and keep a random `perc` fraction of rows.
# NOTE(review): sample() is unseeded, so the subset differs between runs.
train_clean = pd.read_csv('train_clean.csv').sample(frac=perc)

In [49]:
# Preview the first rows of the sampled training features.
train_clean.head(5)

Unnamed: 0.1,Unnamed: 0,fare_amount,passenger_count,year,month,day,hour,weekday,night,late_night,euclidian,jfk_pickup,jfk_dropoff,ewr_pickup,ewr_dropoff,lga_pickup,lga_dropoff
1165548,206120,5.3,1,2009,11,24,20,1,1,1,0.000333,0.227444,0.225258,0.215074,0.231573,0.066285,0.084417
4778287,2085120,23.7,1,2012,6,18,8,0,0,0,0.010464,0.226376,0.240935,0.202674,0.289992,0.04963,0.151811
3584278,622618,10.1,4,2011,9,9,10,4,0,0,0.000141,0.237349,0.238439,0.201543,0.19171,0.056964,0.045242
2820325,705415,6.9,1,2011,3,22,23,1,0,1,0.00034,0.250153,0.239601,0.167781,0.16759,0.026375,0.008761
3829002,4893779,27.47,1,2009,8,25,11,1,0,0,0.0157,0.154118,0.231541,0.315344,0.190262,0.153815,0.036587


In [50]:
# Preview the first rows of the test features (no fare_amount column here).
test_clean.head(5)

Unnamed: 0.1,Unnamed: 0,passenger_count,year,month,day,hour,weekday,night,late_night,euclidian,jfk_pickup,jfk_dropoff,ewr_pickup,ewr_dropoff,lga_pickup,lga_dropoff
0,0,1,2015,1,27,13,1,0,0,0.000465,0.230651,0.227733,0.207901,0.194093,0.059645,0.038771
1,1,1,2015,1,27,13,1,0,0,0.000537,0.222708,0.241443,0.183726,0.176033,0.020212,0.026134
2,2,1,2011,10,8,11,5,0,0,3.4e-05,0.232181,0.227225,0.195144,0.196423,0.044107,0.041677
3,3,1,2012,12,1,21,5,0,1,0.000348,0.239415,0.239336,0.202012,0.187721,0.059277,0.040718
4,4,1,2012,12,1,21,5,0,1,0.002564,0.239883,0.234365,0.224711,0.187385,0.085698,0.035119


In [51]:
# Hold out 20% of the sampled training rows for validation (fixed seed).
train_df, validation_df = train_test_split(train_clean, test_size=0.2, random_state=1)

# Pull the target out as plain arrays ...
train_labels, validation_labels = (
    frame['fare_amount'].values for frame in (train_df, validation_df)
)
# ... and leave only the feature columns in the frames.
train_df, validation_df = (
    frame.drop(['fare_amount'], axis=1) for frame in (train_df, validation_df)
)

In [52]:
# Fit the min-max scaler on the training features only, then apply the same
# fitted transform to the training, validation and test features.
scaler = preprocessing.MinMaxScaler().fit(train_df)
train_df_scaled, validation_df_scaled, test_scaled = (
    scaler.transform(frame) for frame in (train_df, validation_df, test_clean)
)

  return self.partial_fit(X, y)


# Training Neural Networks

In [53]:
def learning_scheduale(epoch, learning_rate):
    """Step schedule: scale the incoming rate at two fixed epochs.

    Returns ``learning_rate * 1/4`` at epoch 20, ``learning_rate * 1/2`` at
    epoch 35, and ``learning_rate`` unchanged at every other epoch.
    """
    # epoch -> multiplier applied to the rate passed in for that epoch
    step_factors = {20: 1/4, 35: 1/2}

    factor = step_factors.get(epoch)
    if factor is None:
        return learning_rate
    return learning_rate * factor
    

In [54]:
def make_model(LEARNING_RATE, REGULARIZATION_RATE, layers, opt_choice=0, input_dim=None):
    """Build and compile a fully connected regression network.

    Parameters
    ----------
    LEARNING_RATE : float
        Learning rate handed to the optimizer.
    REGULARIZATION_RATE : float
        Used as both the L1 and the L2 factor of the activity regularizer on
        the first hidden layer (the other layers are not regularized).
    layers : sequence of int
        Width of each hidden layer, in order. Every hidden layer is a ReLU
        ``Dense`` layer followed by ``BatchNormalization``.
    opt_choice : int, optional
        Optimizer selector: 0 = Adam (default), 1 = RMSprop, 2 = Adagrad,
        anything else = SGD. The default keeps three-argument calls working.
    input_dim : int, optional
        Number of input features. When omitted, it falls back to the column
        count of the notebook-level ``train_df_scaled`` array, as before.

    Returns
    -------
    A compiled ``Sequential`` model with a single linear output unit, MSE
    loss and ``root_mean_squared_error`` as the reported metric.
    """
    model = Sequential()
    for position, units in enumerate(layers):
        if position == 0:
            # Only the first layer declares the input width; resolve the
            # global fallback lazily so an explicit input_dim never needs it.
            first_input_dim = train_df_scaled.shape[1] if input_dim is None else input_dim
            model.add(Dense(units, activation='relu', input_dim=first_input_dim,
                            activity_regularizer=regularizers.l1_l2(REGULARIZATION_RATE, REGULARIZATION_RATE)))
        else:
            model.add(Dense(units, activation='relu'))
        model.add(BatchNormalization())
    model.add(Dense(1))

    optimizer_by_choice = {
        0: optimizers.Adam,
        1: optimizers.RMSprop,
        2: optimizers.Adagrad,
    }
    optimizer_cls = optimizer_by_choice.get(opt_choice, optimizers.SGD)
    # NOTE(review): `lr` is the keyword this notebook was written against;
    # newer Keras releases appear to expect `learning_rate` - confirm against
    # the installed version before upgrading.
    opt = optimizer_cls(lr=LEARNING_RATE)
    model.compile(optimizer=opt, loss='mse', metrics=[root_mean_squared_error])

    return model

In [None]:
# Main training run: 6 hidden layers of 256 units, Adam (opt_choice 0).
BATCH_SIZE = 512
EPOCHS = 50
LEARNING_RATE = 0.0001
REGULARIZATION_RATE = 0.01
make_new_model = True

# Active callbacks: the step learning-rate schedule, plus a checkpoint written
# to 'model' whenever val_loss improves.
callback = [#EarlyStopping(patience=60, monitor='val_loss'),
            #ReduceLROnPlateau(patience=20, monitor='val_loss', factor=0.5, min_lr=0.00001, verbose=1),
            LearningRateScheduler(learning_scheduale),
            ModelCheckpoint('model', monitor='val_loss', save_best_only=True)]

if make_new_model:
    model = make_model(LEARNING_RATE, REGULARIZATION_RATE, [256] * 6, 0)
    # NOTE(review): the freshly built model is immediately warm-started from
    # the existing 'model' checkpoint, so this cell requires that checkpoint
    # to exist already (it will fail on a clean checkout) and the run then
    # overwrites the same path. Confirm this is intended.
    model.load_weights('model')

history = model.fit(x=train_df_scaled, y=train_labels, batch_size=BATCH_SIZE, epochs=EPOCHS, 
                    verbose=1, validation_data=(validation_df_scaled, validation_labels), 
                    shuffle=True, callbacks = callback)

#model.save_weights('model_full')

Train on 1935368 samples, validate on 483842 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
 438784/1935368 [=====>........................] - ETA: 34s - loss: 12.9422 - root_mean_squared_error: 3.430

In [None]:
# model.summary()

In [None]:
# plot_loss_accuracy(history)

In [45]:
# Lowest validation loss (MSE) reached during the most recent fit.
min(history.history['val_loss'])

12.462903876566608

In [None]:
# Predict with the weights currently held by `model` (no checkpoint is
# reloaded here) and write them to submission_full.csv. The raw `test` frame
# supplies the 'key' column.
prediction = model.predict(test_scaled, batch_size=512, verbose=1)
output_submission(test, prediction, 'key', 'fare_amount', 'submission_full.csv')

In [46]:
# Restore the weights from the 'model' checkpoint path (written by
# ModelCheckpoint with save_best_only=True on val_loss) before predicting,
# then write the result to submission.csv.
model.load_weights('model')

prediction = model.predict(test_scaled, batch_size=512, verbose=1)
output_submission(test, prediction, 'key', 'fare_amount', 'submission.csv')

Output complete


In [34]:
# Learning-rate sweep: train the same 6x256 architecture once per entry of
# learn_list and record the best validation loss of each run in best_loss.
best_loss = [0] * 4
learn_list = [0.001, 0.00025, 0.00015, 0.0001]

for i in [0, 1, 2, 3]:
    BATCH_SIZE = 512
    EPOCHS = 50
    LEARNING_RATE = learn_list[i]
    REGULARIZATION_RATE = 0.01
    make_new_model = True

    print('\nLearning Rate: {}\n'.format(LEARNING_RATE))
    
    # Each run checkpoints its best-val_loss model to its own path 'model<i>'.
    # The learning-rate schedule is disabled for the sweep.
    callback = [#EarlyStopping(patience=60, monitor='val_loss'),
                #ReduceLROnPlateau(patience=20, monitor='val_loss', factor=0.5, min_lr=0.00001, verbose=1),
                #LearningRateScheduler(learning_scheduale),
                ModelCheckpoint('model{}'.format(i), monitor='val_loss', save_best_only=True)]

    if make_new_model:
        model = make_model(LEARNING_RATE, REGULARIZATION_RATE, [256] * 6, 0)
        
        # NOTE(review): every run after the first starts from the previous
        # run's best checkpoint, so the runs are not independent - the
        # recorded losses reflect staged fine-tuning rather than a
        # like-for-like comparison of learning rates. Confirm this is intended.
        if i > 0:
            model.load_weights('model{}'.format(i-1))

    history = model.fit(x=train_df_scaled, y=train_labels, batch_size=BATCH_SIZE, epochs=EPOCHS, 
                        verbose=1, validation_data=(validation_df_scaled, validation_labels), 
                        shuffle=True, callbacks = callback)
    
    best_loss[i] = min(history.history['val_loss'])


Learning Rate: 0.001

Train on 193536 samples, validate on 48385 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50


Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

Learning Rate: 0.00025

Train on 193536 samples, validate on 48385 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50


Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

Learning Rate: 0.00015

Train on 193536 samples, validate on 48385 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50


Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

Learning Rate: 0.0001

Train on 193536 samples, validate on 48385 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50


Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [35]:
# Best validation loss per learning rate, in the order of learn_list.
for loss in best_loss:
    print(loss)

14.825783317453965
13.55271577426098
13.534215311724699
13.495046053921202


In [None]:
# Rebuild the tapered 256-256-128-128-64-64 architecture and restore the
# weights stored under 'model320'. No training happens in this cell; the
# hyperparameters and callbacks are only defined here.
BATCH_SIZE = 512
EPOCHS = 40
LEARNING_RATE = 0.0001
REGULARIZATION_RATE = 0.001
make_new_model = True

callback = [#EarlyStopping(patience=60, monitor='val_loss'),
            #ReduceLROnPlateau(patience=20, monitor='val_loss', factor=0.5, min_lr=0.00001, verbose=1),
            LearningRateScheduler(learning_scheduale),
            ModelCheckpoint('model', monitor='val_loss', save_best_only=True)]

# 256, 256, 128, 64 is best at 3.34

if make_new_model:
    # make_model takes the optimizer selector as its fourth argument; the call
    # used to omit it, which raises TypeError. 0 selects Adam, matching every
    # other make_model call in this notebook.
    model = make_model(LEARNING_RATE, REGULARIZATION_RATE, [256,256,128,128,64,64], 0)

model.load_weights('model320')

# HyperParameter Tuning

In [None]:
# One slot per sweep run; each placeholder 0 is replaced by that run's fit history.
history_list = [0] * 5

In [None]:
# Regularization sweep: train a fresh 6x256 model for each regularization
# rate and store the fit history of run number `ind` in history_list[ind].
ind = 0

for i in [1,0.1,0.01,0.001,0.0001]:
    BATCH_SIZE = 512
    EPOCHS = 50
    LEARNING_RATE = 0.001
    REGULARIZATION_RATE = i
    make_new_model = True

    # NOTE(review): all runs checkpoint to the same path 'model', and each run
    # gets a new ModelCheckpoint instance, so later runs can overwrite the
    # file written by earlier ones.
    callback = [#EarlyStopping(patience=60, monitor='val_loss'),
                #ReduceLROnPlateau(patience=20, monitor='val_loss', factor=0.5, min_lr=0.00001, verbose=1),
                LearningRateScheduler(learning_scheduale),
                ModelCheckpoint('model', monitor='val_loss', save_best_only=True)]

    # 256, 256, 128, 64 is best at 3.34

    if make_new_model:
        model = make_model(LEARNING_RATE, REGULARIZATION_RATE, [256] * 6, 0)

    history = model.fit(x=train_df_scaled, y=train_labels, batch_size=BATCH_SIZE, epochs=EPOCHS, 
                        verbose=1, validation_data=(validation_df_scaled, validation_labels), 
                        shuffle=True, callbacks = callback)

    history_list[ind] = history
    
    ind += 1
    
# Runs once after the loop, so only the model of the last sweep value is saved.
model.save_weights('model_full')

In [None]:
def plot_history_list(history_list, labels=None, title='Differing Regularization Rate'):
    """Overlay the validation curves of several training runs.

    Parameters
    ----------
    history_list : iterable of objects exposing a ``history`` dict
        Each dict must contain 'val_loss' and 'val_root_mean_squared_error'.
    labels : sequence of str, optional
        Legend entry for each run, in the order of ``history_list``. Defaults
        to the regularization rates swept in this notebook
        (1.0, 0.1, 0.01, 0.001, 0.0001).
    title : str, optional
        Title of the validation-loss figure; the RMSE figure uses the same
        title with ' RMSE' appended.

    Two figures are shown: validation loss and validation RMSE. Both use the
    same legend. Previously the first figure was titled and labelled for a
    hidden-layer-size experiment while the second was labelled with
    regularization rates.
    """
    if labels is None:
        labels = ['1.0', '0.1', '0.01', '0.001', '0.0001']

    # (history key, y-axis label, figure title)
    panels = (
        ('val_loss', 'validation loss', title),
        ('val_root_mean_squared_error', 'RMSE', title + ' RMSE'),
    )
    for metric, ylabel, panel_title in panels:
        plt.figure(figsize=(20,10))
        for history in history_list:
            plt.plot(history.history[metric])
        plt.title(panel_title, fontsize=24)
        plt.ylabel(ylabel, fontsize=24)
        plt.xlabel('epoch', fontsize=24)
        plt.legend(list(labels), loc='upper right', fontsize=24)
        plt.show()

In [None]:
# Compare the validation curves of the sweep runs collected in history_list.
plot_history_list(history_list)

In [None]:
# Best (lowest) validation loss of each sweep run, in sweep order.
for history in history_list:
    print(min(history.history['val_loss']))

In [None]:
# Best (lowest) validation RMSE of each sweep run, in sweep order.
for history in history_list:
    print(min(history.history['val_root_mean_squared_error']))

In [None]:
# Read the first 5,000,000 rows of the raw training file, skipping column 0
# (usecols selects columns 1-7), with the dtypes declared in `datatypes`.
temp = pd.read_csv('train.csv', nrows=5000000, dtype=datatypes, usecols=[1,2,3,4,5,6,7])

In [None]:
# Keep a random 10% of the rows loaded above and export them.
# NOTE(review): sample() is unseeded, so the exported subset differs per run.
# to_csv also writes the row index as an extra first column by default.
temp = temp.sample(frac=0.1)

temp.to_csv('500k_data.csv')