<a href="https://colab.research.google.com/github/PierreM31/Kaggle_Challenge_NYC/blob/master/model_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import lib

### Import TensorFlow GPU

In [3]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


### Other import


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import floor, ceil
import seaborn as sns
% matplotlib inline
plt.style.use('seaborn-whitegrid')

## Import data from BigQuerry 

In [0]:
from google.cloud import bigquery

client = bigquery.Client(project='bigquery-public-data')

## Import data directly from Colab


In [5]:
df_train = pd.read_csv('train.csv', nrows=500_000, parse_dates=["pickup_datetime"])
df_train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2012-01-18 10:15:28.0000001,5.3,2012-01-18 10:15:28,-73.999906,40.727304,-74.001324,40.736911,1
1,2012-09-06 14:48:00.00000030,21.5,2012-09-06 14:48:00,-73.974438,40.754583,-73.979488,40.764328,5
2,2009-05-02 22:19:14.0000002,10.5,2009-05-02 22:19:14,0.0,0.0,0.0,0.0,1
3,2009-10-31 12:10:59.0000001,5.7,2009-10-31 12:10:59,0.0,0.0,0.0,0.0,1
4,2012-10-30 12:33:41.0000001,4.0,2012-10-30 12:33:41,-73.989309,40.763072,-73.988742,40.758697,1


## Preprocessing

In [6]:
print('Sample size before preprocessing: %d' % len(df_train))

Sample size before preprocessing: 174752


In [0]:
# drop-out negative fare
df_train = df_train[df_train.fare_amount>=0]

# drop-out non full line
df_train = df_train.dropna(how = 'any', axis = 'rows')

In [0]:
# minimum and maximum longitude test set
min(df_train.pickup_longitude.min(), df_train.dropoff_longitude.min()), \
max(df_train.pickup_longitude.max(), df_train.dropoff_longitude.max())

# minimum and maximum latitude test set
min(df_train.pickup_latitude.min(), df_train.dropoff_latitude.min()), \
max(df_train.pickup_latitude.max(), df_train.dropoff_latitude.max())

def select_within_boundingbox(df, BB):
    return (df.pickup_longitude >= BB[0]) & (df.pickup_longitude <= BB[1]) & \
           (df.pickup_latitude >= BB[2]) & (df.pickup_latitude <= BB[3]) & \
           (df.dropoff_longitude >= BB[0]) & (df.dropoff_longitude <= BB[1]) & \
           (df.dropoff_latitude >= BB[2]) & (df.dropoff_latitude <= BB[3])
  
# load image of NYC map
BB = (-74.5, -72.8, 40.5, 41.8)
nyc_map = plt.imread('https://aiblog.nl/download/nyc_-74.5_-72.8_40.5_41.8.png')

# load extra image to zoom in on NYC
BB_zoom = (-74.3, -73.7, 40.5, 40.9)
nyc_map_zoom = plt.imread('https://aiblog.nl/download/nyc_-74.3_-73.7_40.5_40.9.png')

df_train = df_train[select_within_boundingbox(df_train, BB)]

In [9]:
# Eead nyc mask and turn into boolean map with
# land = True, water = False
nyc_mask = plt.imread('https://aiblog.nl/download/nyc_mask-74.5_-72.8_40.5_41.8.png')[:,:,0] > 0.9

# Translate longitude/latitude coordinate into image xy coordinate
def lonlat_to_xy(longitude, latitude, dx, dy, BB):
    return (dx*(longitude - BB[0])/(BB[1]-BB[0])).astype('int'), \
           (dy - dy*(latitude - BB[2])/(BB[3]-BB[2])).astype('int')
  
pickup_x, pickup_y = lonlat_to_xy(df_train.pickup_longitude, df_train.pickup_latitude, 
                                  nyc_mask.shape[1], nyc_mask.shape[0], BB)
dropoff_x, dropoff_y = lonlat_to_xy(df_train.dropoff_longitude, df_train.dropoff_latitude, 
                                  nyc_mask.shape[1], nyc_mask.shape[0], BB)


idx = (nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x])
print("Number of trips in water: {}".format(np.sum(~idx)))


def remove_datapoints_from_water(df):
    def lonlat_to_xy(longitude, latitude, dx, dy, BB):
        return (dx*(longitude - BB[0])/(BB[1]-BB[0])).astype('int'), \
               (dy - dy*(latitude - BB[2])/(BB[3]-BB[2])).astype('int')
      
    # define bounding box
    BB = (-74.5, -72.8, 40.5, 41.8)
    
    # read nyc mask and turn into boolean map with
    # land = True, water = False
    nyc_mask = plt.imread('https://aiblog.nl/download/nyc_mask-74.5_-72.8_40.5_41.8.png')[:,:,0] > 0.9
    
    # calculate for each lon,lat coordinate the xy coordinate in the mask map
    pickup_x, pickup_y = lonlat_to_xy(df.pickup_longitude, df.pickup_latitude, 
                                      nyc_mask.shape[1], nyc_mask.shape[0], BB)
    dropoff_x, dropoff_y = lonlat_to_xy(df.dropoff_longitude, df.dropoff_latitude, 
                                      nyc_mask.shape[1], nyc_mask.shape[0], BB)    
    # calculate boolean index
    idx = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]
    
    # return only datapoints on land
    return df[idx]
  
  

df_train = remove_datapoints_from_water(df_train)

Number of trips in water: 32


In [10]:
print('Sample size after preprocessing: %d' % len(df_train))

Sample size after preprocessing: 171088


## Feature engineering

### Manhattan distance

In [11]:
# Given a dataframe, add two new features 'abs_diff_longitude' and 'abs_diff_latitude' reprensenting the vector from
# the pickup location to the dropoff location

def add_travel_vector_features(df):
    df['abs_diff_longitude'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_diff_latitude'] = (df.dropoff_latitude - df.pickup_latitude).abs()

add_travel_vector_features(df_train)


def convert_kilometers(train_df):
    train_df['abs_diff_longitude'] = train_df.abs_diff_longitude * 79.1
    train_df['abs_diff_latitude'] = train_df.abs_diff_latitude * 111
    
convert_kilometers(df_train)


### Angle difference between north, and manhattan roadways
meas_ang = 0.506 # 29 degrees = 0.506 radians
import math

## adding extra features
def add_extra_manh_features(df):
    df['euclidean_distance'] = (df.abs_diff_latitude**2 + df.abs_diff_longitude**2)**0.5  
    df['delta_manh_long'] = (df.euclidean_distance*np.sin(np.arctan(df.abs_diff_longitude / df.abs_diff_latitude)-meas_ang)).abs()
    df['delta_manh_lat'] = (df.euclidean_distance*np.cos(np.arctan(df.abs_diff_longitude / df.abs_diff_latitude)-meas_ang)).abs()
    df['manh_distance'] = df.delta_manh_long + df.delta_manh_lat
    df['euclidean_error'] = (df.manh_distance - df.euclidean_distance)*100 /  df.euclidean_distance

    
add_extra_manh_features(df_train)


df_train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_diff_longitude,abs_diff_latitude,euclidean_distance,delta_manh_long,delta_manh_lat,manh_distance,euclidean_error
0,2012-01-18 10:15:28.0000001,5.3,2012-01-18 10:15:28,-73.999906,40.727304,-74.001324,40.736911,1,0.112164,1.066377,1.07226,0.418746,0.987113,1.405859,31.111806
1,2012-09-06 14:48:00.00000030,21.5,2012-09-06 14:48:00,-73.974438,40.754583,-73.979488,40.764328,5,0.399455,1.081695,1.153095,0.174879,1.139757,1.314636,14.009324
4,2012-10-30 12:33:41.0000001,4.0,2012-10-30 12:33:41,-73.989309,40.763072,-73.988742,40.758697,1,0.04485,0.485625,0.487692,0.196144,0.446509,0.642654,31.774575
5,2014-07-21 22:15:00.000000159,30.0,2014-07-21 22:15:00,-73.8752,40.773922,-73.971135,40.753697,1,7.588458,2.244975,7.913572,5.549451,5.64165,11.1911,41.416557
6,2011-02-25 23:35:00.000000154,13.3,2011-02-25 23:35:00,-74.008035,40.705807,-73.987173,40.729433,5,1.650184,2.622486,3.098474,0.172327,3.093678,3.266005,5.4069


## Build the network

In [0]:
def get_model(n_x, n_h1, n_h2):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(n_h1, input_dim=n_x, activation='relu'))
    model.add(tf.keras.layers.Dense(n_h2, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(4, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    return model

In [59]:
import keras.backend as K

def rmse (y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred -y_true), axis=-1))

Using TensorFlow backend.


In [0]:
train_x = pd.DataFrame()
#train_x['pickup_longitude'] = df_train.pickup_longitude
#train_x['pickup_latitude'] = df_train.pickup_latitude
#train_x['dropoff_longitude'] = df_train.dropoff_longitude
#train_x['dropoff_latitude'] = df_train.dropoff_latitude
train_x['passenger_count'] = df_train.passenger_count
train_x['manh_distance'] = df_train.manh_distance

train_y = df_train.fare_amount

train_size = 0.99996
train_cnt = floor(train_x.shape[0] * train_size)
x_train = train_x.iloc[0:train_cnt].values
y_train = train_y.iloc[0:train_cnt].values
x_test = train_x.iloc[train_cnt:].values
y_test = train_y.iloc[train_cnt:].values

In [214]:
inputs = tf.keras.Input(shape=(2,), name='input_layer')

x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
predictions = tf.keras.layers.Dense(1, activation='relu')(x)

model = tf.keras.Model(inputs=inputs, outputs=predictions)
model.compile(optimizer='rmsprop',
              loss='mse',
              metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=32, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fb6269e2b38>

In [219]:
#other network formalism

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(64,tf.keras.activations.relu,input_dim=2))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(64,tf.keras.activations.relu))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1,tf.keras.activations.relu))
model.compile(tf.keras.optimizers.Adam(lr=0.001), loss='mse',metrics=['accuracy'])
# This builds the model for the first time:
model.fit(x_train, y_train, batch_size=32, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fb6260f69b0>

In [209]:
model.evaluate(x_train, y_train)
#print (model.layers[1].get_weights()[0])



[92.50763735758389, 0.0]

In [217]:
#print (model.predict([x_test[0]]))

print(x_test)
print (y_test)
print (model.predict(x_test)[0:10])

[[1.         2.99053816]
 [1.         2.43948396]]
[9.7 6.5]
[[10.760851]
 [10.760851]]
