<a href="https://colab.research.google.com/github/PierreM31/Kaggle_Challenge_NYC/blob/master/model_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import lib

### Import TensorFlow GPU

In [104]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


### Other import


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import floor, ceil
import seaborn as sns
import random
from keras.callbacks import LambdaCallback
% matplotlib inline
plt.style.use('seaborn-whitegrid')

## Import data from BigQuerry 

In [0]:
#TODO
"""from google.cloud import bigquery

client = bigquery.Client(project='bigquery-public-data')"""

## Import data directly from Colab


In [106]:
df_train = pd.read_csv('train.csv', nrows=500_000, parse_dates=["pickup_datetime"])
df_train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2012-01-18 10:15:28.0000001,5.3,2012-01-18 10:15:28,-73.999906,40.727304,-74.001324,40.736911,1
1,2012-09-06 14:48:00.00000030,21.5,2012-09-06 14:48:00,-73.974438,40.754583,-73.979488,40.764328,5
2,2009-05-02 22:19:14.0000002,10.5,2009-05-02 22:19:14,0.0,0.0,0.0,0.0,1
3,2009-10-31 12:10:59.0000001,5.7,2009-10-31 12:10:59,0.0,0.0,0.0,0.0,1
4,2012-10-30 12:33:41.0000001,4.0,2012-10-30 12:33:41,-73.989309,40.763072,-73.988742,40.758697,1


## Preprocessing

In [107]:
print('Sample size before preprocessing: %d' % len(df_train))

Sample size before preprocessing: 174752


In [0]:
# drop-out negative fare
df_train = df_train[df_train.fare_amount>=0]

# drop-out non full line
df_train = df_train.dropna(how = 'any', axis = 'rows')

In [0]:
# minimum and maximum longitude test set
min(df_train.pickup_longitude.min(), df_train.dropoff_longitude.min()), \
max(df_train.pickup_longitude.max(), df_train.dropoff_longitude.max())

# minimum and maximum latitude test set
min(df_train.pickup_latitude.min(), df_train.dropoff_latitude.min()), \
max(df_train.pickup_latitude.max(), df_train.dropoff_latitude.max())

def select_within_boundingbox(df, BB):
    return (df.pickup_longitude >= BB[0]) & (df.pickup_longitude <= BB[1]) & \
           (df.pickup_latitude >= BB[2]) & (df.pickup_latitude <= BB[3]) & \
           (df.dropoff_longitude >= BB[0]) & (df.dropoff_longitude <= BB[1]) & \
           (df.dropoff_latitude >= BB[2]) & (df.dropoff_latitude <= BB[3])
  
# load image of NYC map
BB = (-74.5, -72.8, 40.5, 41.8)
nyc_map = plt.imread('https://aiblog.nl/download/nyc_-74.5_-72.8_40.5_41.8.png')

# load extra image to zoom in on NYC
BB_zoom = (-74.3, -73.7, 40.5, 40.9)
nyc_map_zoom = plt.imread('https://aiblog.nl/download/nyc_-74.3_-73.7_40.5_40.9.png')

df_train = df_train[select_within_boundingbox(df_train, BB)]

In [111]:
# Eead nyc mask and turn into boolean map with
# land = True, water = False
nyc_mask = plt.imread('https://aiblog.nl/download/nyc_mask-74.5_-72.8_40.5_41.8.png')[:,:,0] > 0.9

# Translate longitude/latitude coordinate into image xy coordinate
def lonlat_to_xy(longitude, latitude, dx, dy, BB):
    return (dx*(longitude - BB[0])/(BB[1]-BB[0])).astype('int'), \
           (dy - dy*(latitude - BB[2])/(BB[3]-BB[2])).astype('int')
  
pickup_x, pickup_y = lonlat_to_xy(df_train.pickup_longitude, df_train.pickup_latitude, 
                                  nyc_mask.shape[1], nyc_mask.shape[0], BB)
dropoff_x, dropoff_y = lonlat_to_xy(df_train.dropoff_longitude, df_train.dropoff_latitude, 
                                  nyc_mask.shape[1], nyc_mask.shape[0], BB)


idx = (nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x])
print("Number of trips in water: {}".format(np.sum(~idx)))


def remove_datapoints_from_water(df):
    def lonlat_to_xy(longitude, latitude, dx, dy, BB):
        return (dx*(longitude - BB[0])/(BB[1]-BB[0])).astype('int'), \
               (dy - dy*(latitude - BB[2])/(BB[3]-BB[2])).astype('int')
      
    # define bounding box
    BB = (-74.5, -72.8, 40.5, 41.8)
    
    # read nyc mask and turn into boolean map with
    # land = True, water = False
    nyc_mask = plt.imread('https://aiblog.nl/download/nyc_mask-74.5_-72.8_40.5_41.8.png')[:,:,0] > 0.9
    
    # calculate for each lon,lat coordinate the xy coordinate in the mask map
    pickup_x, pickup_y = lonlat_to_xy(df.pickup_longitude, df.pickup_latitude, 
                                      nyc_mask.shape[1], nyc_mask.shape[0], BB)
    dropoff_x, dropoff_y = lonlat_to_xy(df.dropoff_longitude, df.dropoff_latitude, 
                                      nyc_mask.shape[1], nyc_mask.shape[0], BB)    
    # calculate boolean index
    idx = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]
    
    # return only datapoints on land
    return df[idx]
  
  

df_train = remove_datapoints_from_water(df_train)

Number of trips in water: 32


In [112]:
print('Sample size after preprocessing: %d' % len(df_train))

Sample size after preprocessing: 171088


## Feature engineering

### Manhattan distance

In [119]:
# Given a dataframe, add two new features 'abs_diff_longitude' and 'abs_diff_latitude' reprensenting the vector from
# the pickup location to the dropoff location

def add_travel_vector_features(df):
    df['abs_diff_longitude'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_diff_latitude'] = (df.dropoff_latitude - df.pickup_latitude).abs()

add_travel_vector_features(df_train)


def convert_kilometers(train_df):
    train_df['abs_diff_longitude'] = train_df.abs_diff_longitude * 79.1
    train_df['abs_diff_latitude'] = train_df.abs_diff_latitude * 111
    
convert_kilometers(df_train)


### Angle difference between north, and manhattan roadways
meas_ang = 0.506 # 29 degrees = 0.506 radians
import math

## adding extra features
def add_extra_manh_features(df):
    df['euclidean_distance'] = (df.abs_diff_latitude**2 + df.abs_diff_longitude**2)**0.5  
    df['delta_manh_long'] = (df.euclidean_distance*np.sin(np.arctan(df.abs_diff_longitude / df.abs_diff_latitude)-meas_ang)).abs()
    df['delta_manh_lat'] = (df.euclidean_distance*np.cos(np.arctan(df.abs_diff_longitude / df.abs_diff_latitude)-meas_ang)).abs()
    df['manh_distance'] = df.delta_manh_long + df.delta_manh_lat
    df['euclidean_error'] = (df.manh_distance - df.euclidean_distance)*100 /  df.euclidean_distance

    
add_extra_manh_features(df_train)

tmp = len(df_train)
df_train = df_train.dropna(how = 'any', axis = 'rows')
print('Raws where distante = NaN: %d' % (tmp-len(df_train)))

df_train.head()

Raws where distante = NaN: 0


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_diff_longitude,abs_diff_latitude,euclidean_distance,delta_manh_long,delta_manh_lat,manh_distance,euclidean_error
0,2012-01-18 10:15:28.0000001,5.3,2012-01-18 10:15:28,-73.999906,40.727304,-74.001324,40.736911,1,0.112164,1.066377,1.07226,0.418746,0.987113,1.405859,31.111806
1,2012-09-06 14:48:00.00000030,21.5,2012-09-06 14:48:00,-73.974438,40.754583,-73.979488,40.764328,5,0.399455,1.081695,1.153095,0.174879,1.139757,1.314636,14.009324
4,2012-10-30 12:33:41.0000001,4.0,2012-10-30 12:33:41,-73.989309,40.763072,-73.988742,40.758697,1,0.04485,0.485625,0.487692,0.196144,0.446509,0.642654,31.774575
5,2014-07-21 22:15:00.000000159,30.0,2014-07-21 22:15:00,-73.8752,40.773922,-73.971135,40.753697,1,7.588458,2.244975,7.913572,5.549451,5.64165,11.1911,41.416557
6,2011-02-25 23:35:00.000000154,13.3,2011-02-25 23:35:00,-74.008035,40.705807,-73.987173,40.729433,5,1.650184,2.622486,3.098474,0.172327,3.093678,3.266005,5.4069


## Build the network

In [114]:
def getYear(x):
    return x.year%2000
  
def getMonth(x):
  return x.month

def getHour(x):
  return x.hour  

def getHourCos(x):
  return math.cos(2* math.pi * x.hour/24)  

def getHourSin(x):
  return math.sin(2* math.pi * x.hour/24)  

def getDayOfWeek(x,day):
  #tmp = tf.keras.backend.one_hot(x.dayofweek, 7)
  if x.dayofweek == day:
    return 1
  else :
    return 0

train_x = pd.DataFrame()
train_x['pickup_longitude'] = df_train.pickup_longitude
train_x['pickup_latitude'] = df_train.pickup_latitude
train_x['dropoff_longitude'] = df_train.dropoff_longitude
train_x['dropoff_latitude'] = df_train.dropoff_latitude
train_x['passenger_count'] = df_train.passenger_count
train_x['manh_distance'] = df_train.manh_distance
train_x['pickup_year'] = df_train.pickup_datetime.apply(getYear)
train_x['pickup_month'] = df_train.pickup_datetime.apply(getMonth)
#train_x['pickup_hour'] = df_train.pickup_datetime.apply(getHour)

#train_x['pickup_hour_cos'] = df_train.pickup_datetime.apply(getHourCos)
#train_x['pickup_hour_sin'] = df_train.pickup_datetime.apply(getHourSin)

"""train_x['pickup_day_of_week_mon'] = df_train.pickup_datetime.apply(lambda x: getDayOfWeek(x,0))
train_x['pickup_day_of_week_tue'] = df_train.pickup_datetime.apply(lambda x: getDayOfWeek(x,1))
train_x['pickup_day_of_week_wed'] = df_train.pickup_datetime.apply(lambda x: getDayOfWeek(x,2))
train_x['pickup_day_of_week_thu'] = df_train.pickup_datetime.apply(lambda x: getDayOfWeek(x,3))
train_x['pickup_day_of_week_fri'] = df_train.pickup_datetime.apply(lambda x: getDayOfWeek(x,4))
train_x['pickup_day_of_week_sat'] = df_train.pickup_datetime.apply(lambda x: getDayOfWeek(x,5))
train_x['pickup_day_of_week_sun'] = df_train.pickup_datetime.apply(lambda x: getDayOfWeek(x,6))"""


train_y = df_train.fare_amount

train_size = 0.999 #0.00009 #0.99996
train_cnt = floor(train_x.shape[0] * train_size)
x_train = train_x.iloc[0:train_cnt].values
y_train = train_y.iloc[0:train_cnt].values
print (x_train)
x_test = train_x.iloc[train_cnt:].values
y_test = train_y.iloc[train_cnt:].values

[[-73.999906    40.727304   -74.001324   ...   1.40585891  12.
    1.        ]
 [-73.974438    40.754583   -73.979488   ...   1.31463596  12.
    9.        ]
 [-73.989309    40.763072   -73.988742   ...   0.64265358  12.
   10.        ]
 ...
 [-73.980763    40.753068   -73.96202    ...   2.52987882  10.
   11.        ]
 [-73.99845886  40.74055099 -74.00517273 ...   1.09018349  15.
    6.        ]
 [-74.001535    40.741085   -73.984148   ...   1.93787404  13.
    2.        ]]


In [98]:
inputs = tf.keras.Input(shape=(8,), name='input_layer')

x = tf.keras.layers.Dense(512, activation='relu')(inputs)
#x = tf.keras.layers.Dense(512, activation='relu')(x)
x = tf.keras.layers.Dense(512, activation='relu')(x)
#x = tf.keras.layers.Dense(128, activation='relu')(x)
predictions = tf.keras.layers.Dense(1, activation='relu')(x)

model = tf.keras.Model(inputs=inputs, outputs=predictions)
model.compile(optimizer='rmsprop',
              loss='mse',
              metrics=['accuracy'])
print_weights = LambdaCallback(on_epoch_end=lambda batch, logs: print(model.layers[5].get_weights()))
#model.fit(x_train, y_train, batch_size=32, epochs=20,callbacks = [print_weights])
model.fit(x_train, y_train, batch_size=32, epochs=6)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7fdd73393be0>

## Local evaluation

In [115]:
model.evaluate(x_train, y_train)




[18.80202504744918, 0.0]

In [116]:

print (y_test[0:100])

predictionTmp = model.predict(x_test)

print (predictionTmp)

erreurTotale = 0
for i in range(100):
  erreurTotale += math.pow((y_test[i] - predictionTmp[i][0]),2)
  
print('Erreur moyenne : ' + str(math.sqrt(erreurTotale/100)))
    

[36.83  5.    9.3  39.    8.1   6.9   4.5   5.7  16.    8.    8.   10.5
  8.1   5.7   7.3   4.9   8.1  10.5   8.    5.3   5.3   4.1   3.5   6.5
  8.5   4.9   4.9   4.5   7.   17.3   4.5   5.3   6.5  10.1   7.5  14.1
 25.5  12.   12.5   7.3  11.5   8.9   6.5   6.9  12.5   8.    7.7   8.1
  6.5   8.5   9.   13.3   5.3   7.7  57.33  7.5  10.9  13.    6.5   5.5
 15.   12.5  10.9   4.5   7.    5.3   6.9   7.3   4.5   9.5   6.    8.1
 10.   21.   12.   11.7  11.   10.5   7.    8.5  10.5   4.9   4.5   7.5
 18.5   7.    8.5  14.9   6.5  18.    3.7  14.5   5.7   4.1   4.9   7.3
  4.9   8.5  17.4   5.7 ]
[[31.104424 ]
 [ 7.3753877]
 [11.450322 ]
 [36.387566 ]
 [ 8.374105 ]
 [ 8.235859 ]
 [ 5.799046 ]
 [ 6.8173637]
 [14.791421 ]
 [ 6.694931 ]
 [ 8.114597 ]
 [13.444854 ]
 [11.211635 ]
 [ 5.752664 ]
 [ 9.239293 ]
 [ 5.5992084]
 [ 6.682624 ]
 [11.071662 ]
 [ 7.7659774]
 [ 4.5735855]
 [ 6.3736076]
 [ 5.592484 ]
 [ 5.859862 ]
 [17.052721 ]
 [10.275203 ]
 [ 5.5675735]
 [ 5.045608 ]
 [ 5.5642757]
 [ 7.4

## Kaggle evaluation

In [125]:
df_test = pd.read_csv('test.csv', nrows=9915, parse_dates=["pickup_datetime"])
df_test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12,-73.966046,40.789775,-73.988565,40.744427,1


### Manhattan distance

In [130]:
# Given a dataframe, add two new features 'abs_diff_longitude' and 'abs_diff_latitude' reprensenting the vector from
# the pickup location to the dropoff location

def add_travel_vector_features(df):
    df['abs_diff_longitude'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_diff_latitude'] = (df.dropoff_latitude - df.pickup_latitude).abs()

add_travel_vector_features(df_test)


def convert_kilometers(train_df):
    train_df['abs_diff_longitude'] = train_df.abs_diff_longitude * 79.1
    train_df['abs_diff_latitude'] = train_df.abs_diff_latitude * 111
    
convert_kilometers(df_test)


### Angle difference between north, and manhattan roadways
meas_ang = 0.506 # 29 degrees = 0.506 radians
import math

## adding extra features
def add_extra_manh_features(df):
    df['euclidean_distance'] = (df.abs_diff_latitude**2 + df.abs_diff_longitude**2)**0.5  
    df['delta_manh_long'] = (df.euclidean_distance*np.sin(np.arctan(df.abs_diff_longitude / df.abs_diff_latitude)-meas_ang)).abs()
    df['delta_manh_lat'] = (df.euclidean_distance*np.cos(np.arctan(df.abs_diff_longitude / df.abs_diff_latitude)-meas_ang)).abs()
    df['manh_distance'] = df.delta_manh_long + df.delta_manh_lat
    df['euclidean_error'] = (df.manh_distance - df.euclidean_distance)*100 /  df.euclidean_distance

    
add_extra_manh_features(df_test)

tmp = len(df_test)
#df_test = df_test.dropna(how = 'any', axis = 'rows')
print('Raws where distante = NaN: %d' % (tmp-len(df_test)))


Raws where distante = NaN: 0


### Create input

In [0]:
def getYear(x):
    return x.year%2000
  
def getMonth(x):
  return x.month

def getHour(x):
  return x.hour  

def getHourCos(x):
  return math.cos(2* math.pi * x.hour/24)  

def getHourSin(x):
  return math.sin(2* math.pi * x.hour/24)  

def getDayOfWeek(x,day):
  #tmp = tf.keras.backend.one_hot(x.dayofweek, 7)
  if x.dayofweek == day:
    return 1
  else :
    return 0

test_x = pd.DataFrame()
test_x['pickup_longitude'] = df_test.pickup_longitude
test_x['pickup_latitude'] = df_test.pickup_latitude
test_x['dropoff_longitude'] = df_test.dropoff_longitude
test_x['dropoff_latitude'] = df_test.dropoff_latitude
test_x['passenger_count'] = df_test.passenger_count
test_x['manh_distance'] = df_test.manh_distance
test_x['pickup_year'] = df_test.pickup_datetime.apply(getYear)
test_x['pickup_month'] = df_test.pickup_datetime.apply(getMonth)

### Compute output

In [146]:
import csv

print(test_x.shape)
predictionTmp = model.predict(test_x)
print(predictionTmp.shape)
prediction = []
for i in range(predictionTmp.size):
  prediction.append(predictionTmp[i][0])

with open('4evaluation.csv', 'w') as f:
    writer = csv.writer(f)

    writer.writerows(zip(['key'],['fare_amount']))
    #f.write('key,fare_amount')
    writer.writerows(zip(df_test.key, prediction))

(9914, 8)
(9914, 1)
