In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow.parquet as pq

In [3]:
data = pd.read_parquet("../fhvhv_tripdata_2021-01.parquet", engine="pyarrow")
print(data.head())

  hvfhs_license_num dispatching_base_num originating_base_num  \
0            HV0003               B02682               B02682   
1            HV0003               B02682               B02682   
2            HV0003               B02764               B02764   
3            HV0003               B02764               B02764   
4            HV0003               B02764               B02764   

     request_datetime   on_scene_datetime     pickup_datetime  \
0 2021-01-01 00:28:09 2021-01-01 00:31:42 2021-01-01 00:33:44   
1 2021-01-01 00:45:56 2021-01-01 00:55:19 2021-01-01 00:55:19   
2 2021-01-01 00:21:15 2021-01-01 00:22:41 2021-01-01 00:23:56   
3 2021-01-01 00:39:12 2021-01-01 00:42:37 2021-01-01 00:42:51   
4 2021-01-01 00:46:11 2021-01-01 00:47:17 2021-01-01 00:48:14   

     dropoff_datetime  PULocationID  DOLocationID  trip_miles  ...  sales_tax  \
0 2021-01-01 00:49:07           230           166        5.26  ...       1.98   
1 2021-01-01 01:18:21           152           167       

In [4]:
data['driver_pay_per_mile'] = data['driver_pay'] / data['trip_miles']
data['trip_duration'] = (data['dropoff_datetime'] - data['pickup_datetime']).dt.total_seconds()/60
data['driver_pay_per_minute'] = data['driver_pay'] / data['trip_duration']

print(data.head())

  hvfhs_license_num dispatching_base_num originating_base_num  \
0            HV0003               B02682               B02682   
1            HV0003               B02682               B02682   
2            HV0003               B02764               B02764   
3            HV0003               B02764               B02764   
4            HV0003               B02764               B02764   

     request_datetime   on_scene_datetime     pickup_datetime  \
0 2021-01-01 00:28:09 2021-01-01 00:31:42 2021-01-01 00:33:44   
1 2021-01-01 00:45:56 2021-01-01 00:55:19 2021-01-01 00:55:19   
2 2021-01-01 00:21:15 2021-01-01 00:22:41 2021-01-01 00:23:56   
3 2021-01-01 00:39:12 2021-01-01 00:42:37 2021-01-01 00:42:51   
4 2021-01-01 00:46:11 2021-01-01 00:47:17 2021-01-01 00:48:14   

     dropoff_datetime  PULocationID  DOLocationID  trip_miles  ...  tips  \
0 2021-01-01 00:49:07           230           166        5.26  ...  0.00   
1 2021-01-01 01:18:21           152           167        3.65  ...

In [5]:
import tensorflow as tf
from tensorflow import keras

In [11]:
# create a neural network model for predicting driver pay per mile based on all trip data. Create a model with 3 hidden layers, each with 64 nodes and ReLU activation functions. Use the Adam optimizer and mean squared error loss function. Train the model for 10 epochs.

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(2,)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1, activation='linear')
])

# training data
training_data = data[1:data.shape[0]//2]
target_data = data[data.shape[0]//2:]
model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(training_data[['trip_miles', 'trip_duration']], training_data['driver_pay'], epochs=5)

# Use the model to predict driver pay per mile for the first 5 trips in the dataset. Compare the predicted values to the actual values.

predictions = model.predict(target_data[['trip_miles', 'trip_duration']] )
print(predictions[:5])
print(target_data['driver_pay'][:5])



  super().__init__(**kwargs)


Epoch 1/5
[1m186070/186070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 1ms/step - loss: 10.3158
Epoch 2/5
[1m186070/186070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 1ms/step - loss: 9.3862
Epoch 3/5
[1m186070/186070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 1ms/step - loss: 9.6645
Epoch 4/5
[1m186070/186070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 1ms/step - loss: 9.7176
Epoch 5/5
[1m186070/186070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 1ms/step - loss: 9.6589
[1m186070/186070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 967us/step
[[11.340802]
 [ 6.036688]
 [10.085955]
 [ 5.905987]
 [ 6.07553 ]]
5954234    10.72
5954235     5.40
5954236     9.47
5954237     5.40
5954238     6.32
Name: driver_pay, dtype: float64


In [12]:
print(model.evaluate(target_data[['trip_miles', 'trip_duration']], target_data['driver_pay']))

[1m186070/186070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 1ms/step - loss: 8.5704
9.355286598205566


In [13]:
model.save('my_model.keras')

In [23]:
new_model = tf.keras.models.load_model('my_model.keras')

print(new_model.predict(target_data[1:50][['trip_miles', 'trip_duration']]))
print(target_data[1:50]['driver_pay'])

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[[ 6.036688 ]
 [10.085955 ]
 [ 5.905987 ]
 [ 6.07553  ]
 [11.788852 ]
 [ 6.6538496]
 [ 6.6538496]
 [12.050205 ]
 [14.300811 ]
 [ 6.1049414]
 [ 5.953967 ]
 [ 6.961256 ]
 [49.10934  ]
 [27.036638 ]
 [ 9.482059 ]
 [ 9.052041 ]
 [ 6.0113697]
 [26.31416  ]
 [ 8.627387 ]
 [13.940819 ]
 [ 8.22101  ]
 [10.112573 ]
 [ 6.2367926]
 [28.552801 ]
 [12.233469 ]
 [ 7.2398434]
 [23.060116 ]
 [ 6.000741 ]
 [ 6.0315437]
 [12.854951 ]
 [ 9.121206 ]
 [19.923515 ]
 [ 9.716449 ]
 [ 8.563349 ]
 [ 5.9578676]
 [13.688239 ]
 [ 6.8834724]
 [ 9.445472 ]
 [ 8.453664 ]
 [16.718786 ]
 [19.244373 ]
 [ 5.8845997]
 [17.932804 ]
 [18.198654 ]
 [31.99241  ]
 [ 5.976926 ]
 [ 6.145316 ]
 [ 5.9407225]
 [19.79589  ]]
5954235     5.40
5954236     9.47
5954237     5.40
5954238     6.32
5954239    14.50
5954240     6.13
5954241     6.13
5954242    11.41
5954243    13.62
5954244     5.47
5954245     5.39
5954246     6.43
5954247    53.79
5954248    26.02
595