In [16]:
# download June 2020 TLC Yellow Taxi Trip records
#!wget -nc https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/yellow_tripdata_2019-06.csv 
# download June 2020 TLC Yellow Taxi Trip records
# Uncomment the next line, if working locally
print(100)
!curl https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/yellow_tripdata_2019-06.csv 
print(101)
# Snap ML is available on PyPI. To install it simply run the pip command below.
!pip install snapml==1.8.2
print(102)
# Import the libraries we need to use in this lab
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, StandardScaler, MinMaxScaler
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import mean_squared_error
import time
import warnings
import gc, sys
warnings.filterwarnings('ignore')
# Load the trip records from the CSV file in the working directory.
# print(103) / print(104) are progress markers around the (slow) read.
print(103)
raw_data = pd.read_csv('yellow_tripdata_2019-06.csv')
print(104)

# Report the table dimensions: rows are observations, columns are variables.
n_observations, n_variables = raw_data.shape
print("There are " + str(n_observations) + " observations in the dataset.")
print("There are " + str(n_variables) + " variables in the dataset.")

# Preview the first rows of the dataset.
raw_data.head()
# Keep only the trips that satisfy every condition below:
#   * tip_amount > 0            -- a reported tip of 0 is assumed to have been
#                                  paid in cash, so those rows are dropped
#   * tip_amount <= fare_amount -- a tip larger than the fare is treated as
#                                  an outlier
#   * 2 <= fare_amount < 200    -- discard very small and very large fares
# The mask is built inline (no named intermediates) so that rebinding
# raw_data leaves no extra reference to the unfiltered frame.
raw_data = raw_data[
    (raw_data['tip_amount'] > 0)
    & (raw_data['tip_amount'] <= raw_data['fare_amount'])
    & (raw_data['fare_amount'] >= 2)
    & (raw_data['fare_amount'] < 200)
]

# total_amount already contains the tip (the target variable), so it must
# not be available as a feature.
clean_data = raw_data.drop(columns=['total_amount'])

# The dataset is large: release raw_data now that it is no longer needed,
# to lower the risk of running out of memory.
del raw_data
gc.collect()

# Report how many trips and variables remain after cleaning.
n_trips_left, n_columns_left = clean_data.shape
print("There are " + str(n_trips_left) + " observations in the dataset.")
print("There are " + str(n_columns_left) + " variables in the dataset.")

# Distribution of the target variable: 16-bin histogram of the tip amounts.
tip_values = clean_data['tip_amount'].values
plt.hist(tip_values, bins=16, histtype='bar', facecolor='g')
plt.show()

# Range of the tips and their 90th percentile.
print("Minimum amount value is ", np.min(tip_values))
print("Maximum amount value is ", np.max(tip_values))
print("90% of the trips have a tip amount less or equal than ", np.percentile(tip_values, 90))

# Drop the extra reference: the array may share memory with clean_data, and
# keeping it alive could hold on to that memory after clean_data is released.
del tip_values

# Preview the first rows of the cleaned dataset.
clean_data.head()
# Ideally use the full dataset for this exercise.
# However, if you run into out-of-memory issues due to the data size, reduce it.
# For instance, in this example, we use only the first 200,000 samples.
first_n_rows = 200000

# Truncate BEFORE the feature engineering. Every step below works row by row,
# so the resulting frame is the same as truncating afterwards, but the
# datetime parsing and the new columns are only computed for the rows that
# are actually kept. .copy() makes the subset an independent frame, so the
# column assignments below do not write into a slice of the larger frame.
clean_data = clean_data.head(first_n_rows).copy()

# Parse the pickup / dropoff timestamps into datetime values.
pickup_time = pd.to_datetime(clean_data['tpep_pickup_datetime'])
dropoff_time = pd.to_datetime(clean_data['tpep_dropoff_datetime'])

# Extract pickup and dropoff hour
clean_data['pickup_hour'] = pickup_time.dt.hour
clean_data['dropoff_hour'] = dropoff_time.dt.hour

# Extract pickup and dropoff day of the week (0 = Monday, 6 = Sunday)
clean_data['pickup_day'] = pickup_time.dt.weekday
clean_data['dropoff_day'] = dropoff_time.dt.weekday

# Calculate trip time in seconds
clean_data['trip_time'] = (dropoff_time - pickup_time).dt.total_seconds()

# drop the pickup and dropoff datetimes; the derived columns above replace them
clean_data = clean_data.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)

# Columns holding categorical values; each one is expanded into one-hot
# indicator columns with pandas.get_dummies.
get_dummy_col = [
    "VendorID",
    "RatecodeID",
    "store_and_fwd_flag",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "pickup_hour",
    "dropoff_hour",
    "pickup_day",
    "dropoff_day",
]
proc_data = pd.get_dummies(data=clean_data, columns=get_dummy_col)

# The dataset is large: release clean_data now that the encoded copy exists,
# to lower the risk of running out of memory.
del clean_data
gc.collect()
# Labels: the tip amounts as float32. Selecting with a list of one column
# keeps the result two-dimensional (one column).
y = proc_data[['tip_amount']].values.astype('float32')

# Features: everything except the target column.
proc_data = proc_data.drop(columns=['tip_amount'])

# Scale each sample (row) to unit L1 norm; copy=False asks for in-place
# normalization where possible.
X = normalize(proc_data.values, norm='l1', axis=1, copy=False)

# Shapes of the feature matrix and the label array.
print('X.shape=', X.shape, 'y.shape=', y.shape)

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print('X_train.shape=', X_train.shape, 'Y_train.shape=', y_train.shape)
print('X_test.shape=', X_test.shape, 'Y_test.shape=', y_test.shape)
# Decision Tree regression model from scikit-learn.
from sklearn.tree import DecisionTreeRegressor

# A fixed random_state gives reproducible output across multiple calls.
sklearn_dt = DecisionTreeRegressor(random_state=35, max_depth=8)

# Fit the scikit-learn tree and measure the wall-clock training time.
sklearn_start = time.time()
sklearn_dt.fit(X_train, y_train)
sklearn_time = time.time() - sklearn_start
print("[Scikit-Learn] Training time (s):  {0:.5f}".format(sklearn_time))
# Decision Tree regression model from Snap ML.
# NOTE: this import rebinds the name DecisionTreeRegressor, which referred to
# the scikit-learn class until now.
from snapml import DecisionTreeRegressor

# In contrast to sklearn's Decision Tree, Snap ML offers multi-threaded
# CPU/GPU training. To train on the GPU, set use_gpu=True instead:
# snapml_dt = DecisionTreeRegressor(max_depth=4, random_state=45, use_gpu=True)

# n_jobs sets the number of CPU threads used at training time; a fixed
# random_state gives reproducible output across multiple calls.
snapml_dt = DecisionTreeRegressor(n_jobs=4, random_state=45, max_depth=8)

# Fit the Snap ML tree and measure the wall-clock training time.
snapml_start = time.time()
snapml_dt.fit(X_train, y_train)
snapml_time = time.time() - snapml_start
print("[Snap ML] Training time (s):  {0:.5f}".format(snapml_time))

# Training speedup: how many times faster Snap ML was than scikit-learn.
training_speedup = sklearn_time / snapml_time
print('[Decision Tree Regressor] Snap ML vs. Scikit-Learn speedup : {0:.2f}x '.format(training_speedup))

# scikit-learn model: predict on the held-out set, then report the
# mean squared error against the true tips.
sklearn_pred = sklearn_dt.predict(X_test)
sklearn_mse = mean_squared_error(y_true=y_test, y_pred=sklearn_pred)
print('[Scikit-Learn] MSE score : {0:.3f}'.format(sklearn_mse))

# Snap ML model: same evaluation on the same held-out set.
snapml_pred = snapml_dt.predict(X_test)
snapml_mse = mean_squared_error(y_true=y_test, y_pred=snapml_pred)
print('[Snap ML] MSE score : {0:.3f}'.format(snapml_mse))

# Train a deeper tree (max_depth=12 instead of 8) and report its test MSE.
# NOTE(review): DecisionTreeRegressor is whichever class was imported last;
# the n_jobs argument suggests the Snap ML one is intended -- confirm.
tree = DecisionTreeRegressor(n_jobs=4, random_state=45, max_depth=12)
tree.fit(X_train, y_train)

pred = tree.predict(X_test)
print("MSE: ", mean_squared_error(y_true=y_test, y_pred=pred))

100
101VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
1,2019-06-01 00:55:13,2019-06-01 00:56:17,1,.00,1,N,145,145,2,3,0.5,0.5,0,0,0.3,4.3,0
1,2019-06-01 00:06:31,2019-06-01 00:06:52,1,.00,1,N,262,263,2,2.5,3,0.5,0,0,0.3,6.3,2.5
1,2019-06-01 00:17:05,2019-06-01 00:36:38,1,4.40,1,N,74,7,2,17.5,0.5,0.5,0,0,0.3,18.8,0
1,2019-06-01 00:59:02,2019-06-01 00:59:12,0,.80,1,N,145,145,2,2.5,1,0.5,0,0,0.3,4.3,0
1,2019-06-01 00:03:25,2019-06-01 00:15:42,1,1.70,1,N,113,148,1,9.5,3,0.5,2.65,0,0.3,15.95,2.5
1,2019-06-01 00:28:31,2019-06-01 00:39:23,2,1.60,1,N,79,125,1,9.5,3,0.5,1,0,0.3,14.3,2.5
1,2019-06-01 00:46:46,2019-06-01 00:50:55,4,.60,1,N,211,148,2,4.5,3,0.5,0,0,0.3,8.3,2.5
1,2019-06-01 00:54:49,2019-06-01 01:02:57,2,1.20,1,N,79,249,1,7.5,3,0.5,1,0,0.3,12.3,2.5
1,2019-06-01 00:09:57,2019-0

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0  348M    0  189k    0     0   117k      0  0:50:34  0:00:01  0:50:33  117k
  1  348M    1 5038k    0     0  1929k      0  0:03:04  0:00:02  0:03:02 1930k
 13  348M   13 47.9M    0     0  13.2M      0  0:00:26  0:00:03  0:00:23 13.2M
 13  348M   13 48.5M    0     0  10.2M      0  0:00:34  0:00:04  0:00:30 10.2M
 14  348M   14 49.2M    0     0  8983k      0  0:00:39  0:00:05  0:00:34 10.6M
 15  348M   15 53.9M    0     0  8356k      0  0:00:42  0:00:06  0:00:36 10.7M
 27  348M   27 94.4M    0     0  12.4M      0  0:00:28  0:00:07  0:00:21 17.9M
 28  348M   28 99.0M    0     0  11.0M      0  0:00:31  0:00:08  0:00:23 9806k
 28  348M   28 99.2M    0     0  10.3M      0  0:00

102
103
2,2019-06-03 14:55:52,2019-06-03 15:03:42,1,.94,1,N,234,68,1,7,1,0.5,1.7,0,0.3,13,2.5
1,2019-06-03 18:26:41,2019-06-03 18:40:06,1,2.20,1,N,48,239,2,11,3.5,0.5,0,0,0.3,15.3,2.5
1,2019-06-03 18:50:02,2019-06-03 18:54:32,1,.60,1,N,143,142,1,5,3.5,0.5,1.4,0,0.3,10.7,2.5
1,2019-06-03 18:56:03,2019-06-03 19:04:42,1,1.20,1,N,163,237,1,7.5,3.5,0.5,2.35,0,0.3,14.15,2.5
2,2019-06-03 18:21:46,2019-06-03 18:27:13,1,.87,1,N,246,234,1,5.5,1,0.5,1,0,0.3,10.8,2.5
2,2019-06-03 18:34:04,2019-06-03 18:40:42,1,1.02,1,N,113,234,1,6.5,1,0.5,1.62,0,0.3,12.42,2.5
2,2019-06-03 18:42:29,2019-06-03 18:54:56,1,2.45,1,N,234,237,1,10.5,1,0.5,2.96,0,0.3,17.76,2.5
2,2019-06-03 18:09:54,2019-06-03 18:54:40,1,17.23,2,N,132,164,2,52,4.5,0.5,0,6.12,0.3,65.92,2.5
2,2019-06-03 18:58:34,2019-06-03 19:08:02,1,1.16,1,N,100,246,1,8,1,0.5,0,0,0.3,12.3,2.5
1,2019-06-03 18:27:43,2019-06-03 18:37:33,1,.60,1,N,211,231,1,7.5,3.5,0.5,2.35,0,0.3,14.15,2.5
1,2019-06-03 18:51:45,2019-06-03 19:04:56,1,4.90,1,N,209,233,2,16,3.5,0.

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

ERROR: Could not find a version that satisfies the requirement snapml==1.8.2 (from versions: none)
ERROR: No matching distribution found for snapml==1.8.2


FileNotFoundError: [Errno 2] No such file or directory: 'yellow_tripdata_2019-06.csv'

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



2,2019-06-04 19:40:07,2019-06-04 19:54:19,1,3.45,1,N,246,238,1,13,1,0.5,3,0,0.3,20.3,2.5
2,2019-06-04 19:09:46,2019-06-04 19:15:05,2,1.15,1,N,43,236,1,6,1,0.5,2.06,0,0.3,12.36,2.5
2,2019-06-04 19:16:11,2019-06-04 19:18:10,2,.62,1,N,236,236,1,4,1,0.5,1.66,0,0.3,9.96,2.5
2,2019-06-04 19:26:43,2019-06-04 19:31:47,2,1.18,1,N,162,141,1,6,1,0.5,2.06,0,0.3,12.36,2.5
2,2019-06-04 19:35:32,2019-06-04 19:46:50,2,3.12,1,N,237,224,2,12,1,0.5,0,0,0.3,16.3,2.5
2,2019-06-04 19:53:44,2019-06-04 19:59:09,2,1.97,1,N,233,140,1,7.5,1,0.5,2,0,0.3,13.8,2.5
1,2019-06-04 19:11:43,2019-06-04 19:21:25,1,1.10,1,N,48,142,1,8,3.5,0.5,2.45,0,0.3,14.75,2.5
1,2019-06-04 19:24:54,2019-06-04 19:32:08,2,1.90,1,N,142,151,2,8,3.5,0.5,0,0,0.3,12.3,2.5
1,2019-06-04 19:50:07,2019-06-04 19:53:39,2,.60,1,N,238,239,2,4.5,3,0.5,0,0,0.3,8.3,2.5
1,2019-06-04 19:58:11,2019-06-04 20:03:42,2,1.10,1,N,43,238,1,6,3,0.5,1.95,0,0.3,11.75,2.5
1,2019-06-04 19:37:43,2019-06-04 19:52:05,1,1.70,1,N,233,234,1,10.5,3.5,0.5,2.22,0,0.3,17.02,2.5


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



2,2019-06-05 12:15:56,2019-06-05 12:36:38,1,2.97,1,N,262,161,1,15.5,0,0.5,4.7,0,0.3,23.5,2.5
2,2019-06-05 12:38:19,2019-06-05 13:13:39,1,5.66,1,N,161,45,2,26.5,0,0.5,0,0,0.3,29.8,2.5
2,2019-06-05 12:12:28,2019-06-05 12:20:57,1,2.22,1,N,9,121,1,9,0,0.5,0,0,0.3,9.8,0
1,2019-06-05 12:06:23,2019-06-05 12:26:06,1,1.60,1,N,163,186,1,13,2.5,0.5,2.5,0,0.3,18.8,2.5
1,2019-06-05 12:27:45,2019-06-05 12:57:30,1,3.80,1,N,186,239,1,20.5,2.5,0.5,4.75,0,0.3,28.55,2.5
1,2019-06-05 12:27:34,2019-06-05 12:36:27,1,1.50,1,N,233,141,1,8,2.5,0.5,1,0,0.3,12.3,2.5
1,2019-06-05 12:41:32,2019-06-05 12:51:51,1,2.30,1,N,262,162,1,10,2.5,0.5,1,0,0.3,14.3,2.5
1,2019-06-05 12:14:06,2019-06-05 12:30:22,1,4.90,1,N,87,161,1,17,2.5,0.5,4.05,0,0.3,24.35,2.5
1,2019-06-05 12:58:28,2019-06-05 13:06:11,1,.90,1,N,143,163,1,6.5,2.5,0.5,1.2,0,0.3,11,2.5
1,2019-06-05 12:06:13,2019-06-05 12:41:46,2,2.70,1,Y,263,100,2,21.5,2.5,0.5,0,0,0.3,24.8,2.5
1,2019-06-05 12:44:45,2019-06-05 12:48:26,1,.50,1,N,161,233,1,4.5,2.5,0.5,1.55,0,0.3,