In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


# Build a regression tree using sklearn library

In [3]:
data = pd.read_csv("real_estate_data.csv")

In [4]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511 entries, 0 to 510
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     511 non-null    float64
 1   ZN       511 non-null    float64
 2   INDUS    511 non-null    float64
 3   CHAS     511 non-null    int64  
 4   NOX      511 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      511 non-null    float64
 7   DIS      511 non-null    float64
 8   RAD      511 non-null    int64  
 9   TAX      511 non-null    int64  
 10  PTRATIO  511 non-null    float64
 11  B        511 non-null    float64
 12  LSTAT    511 non-null    float64
 13  MEDV     511 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 56.0 KB


In [6]:
data.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         5
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [7]:
data.dropna(inplace=True)

In [8]:
data.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(["MEDV"], axis=1), data["MEDV"], test_size=0.3, random_state=41)

In [10]:
X_train.shape

(354, 13)

In [11]:
X_test.shape

(152, 13)

In [12]:
rt = DecisionTreeRegressor(criterion="squared_error")

In [13]:
rt.fit(X_train, y_train)

In [14]:
yhat = rt.predict(X_test)

In [15]:
pd.DataFrame({"True Labels": y_test, "Predicted Labels": yhat})

Unnamed: 0,True Labels,Predicted Labels
162,50.0,50.0
363,16.8,17.7
500,16.8,21.2
304,36.1,33.2
198,34.6,32.7
...,...,...
271,25.2,24.8
38,24.7,20.3
229,31.5,29.0
399,6.3,12.8


In [16]:
## It indicated the coefficient of determination
rt.score(X_test, y_test)

0.7171717900120664

# Build a regression tree Using snapml library

In [17]:
pip install snapml

Note: you may need to restart the kernel to use updated packages.


In [18]:
import snapml

In [19]:
trip_data = pd.read_csv("yellow_tripdata_2016-01.csv")

In [20]:
trip_data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2016-01-01 00:00:00,2016-01-01 00:00:00,2,1.1,-73.990372,40.734695,1,N,-73.981842,40.732407,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8
1,2,2016-01-01 00:00:00,2016-01-01 00:00:00,5,4.9,-73.980782,40.729912,1,N,-73.944473,40.716679,1,18.0,0.5,0.5,0.0,0.0,0.3,19.3
2,2,2016-01-01 00:00:00,2016-01-01 00:00:00,1,10.54,-73.98455,40.679565,1,N,-73.950272,40.788925,1,33.0,0.5,0.5,0.0,0.0,0.3,34.3
3,2,2016-01-01 00:00:00,2016-01-01 00:00:00,1,4.75,-73.993469,40.71899,1,N,-73.962242,40.657333,2,16.5,0.0,0.5,0.0,0.0,0.3,17.3
4,2,2016-01-01 00:00:00,2016-01-01 00:00:00,3,1.76,-73.960625,40.78133,1,N,-73.977264,40.758514,2,8.0,0.0,0.5,0.0,0.0,0.3,8.8


In [21]:
#trip_data.info

In [22]:
trip_data = trip_data[trip_data["tip_amount"]>0]

In [23]:
#trip_data.info

In [24]:
trip_data = trip_data[trip_data["tip_amount"] <= trip_data["fare_amount"]]

In [25]:
trip_data = trip_data[(trip_data["fare_amount"] >=1) & (trip_data["fare_amount"] <200)]

In [26]:
trip_data.shape

(6920930, 19)

In [27]:
trip_data.drop(["total_amount"], inplace=True, axis = 1)

In [28]:
trip_data.shape

(6920930, 18)

In [29]:
trip_data["tpep_pickup_datetime"] = pd.to_datetime(trip_data["tpep_pickup_datetime"])

In [30]:
trip_data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge
11,2,2016-01-01 00:00:03,2016-01-01 00:15:49,6,2.43,-73.96933,40.763538,1,N,-73.995689,40.744251,1,12.0,0.5,0.5,3.99,0.0,0.3
13,1,2016-01-01 00:00:04,2016-01-01 00:14:32,1,3.7,-74.004303,40.742241,1,N,-74.007362,40.706936,1,14.0,0.5,0.5,3.05,0.0,0.3
14,1,2016-01-01 00:00:05,2016-01-01 00:14:27,2,2.2,-73.991997,40.718578,1,N,-74.005135,40.739944,1,11.0,0.5,0.5,1.5,0.0,0.3
17,1,2016-01-01 00:00:06,2016-01-01 00:04:44,1,1.7,-73.982101,40.774696,1,Y,-73.97094,40.796707,1,7.0,0.5,0.5,1.65,0.0,0.3
18,2,2016-01-01 00:00:06,2016-01-01 00:07:14,1,1.38,-73.994843,40.718498,1,N,-73.989807,40.73423,1,7.0,0.5,0.5,1.66,0.0,0.3


In [31]:
trip_data.reset_index(inplace=True)

In [32]:
trip_data.drop(["index"], axis=1, inplace=True)

In [33]:
trip_data

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge
0,2,2016-01-01 00:00:03,2016-01-01 00:15:49,6,2.43,-73.969330,40.763538,1,N,-73.995689,40.744251,1,12.0,0.5,0.5,3.99,0.0,0.3
1,1,2016-01-01 00:00:04,2016-01-01 00:14:32,1,3.70,-74.004303,40.742241,1,N,-74.007362,40.706936,1,14.0,0.5,0.5,3.05,0.0,0.3
2,1,2016-01-01 00:00:05,2016-01-01 00:14:27,2,2.20,-73.991997,40.718578,1,N,-74.005135,40.739944,1,11.0,0.5,0.5,1.50,0.0,0.3
3,1,2016-01-01 00:00:06,2016-01-01 00:04:44,1,1.70,-73.982101,40.774696,1,Y,-73.970940,40.796707,1,7.0,0.5,0.5,1.65,0.0,0.3
4,2,2016-01-01 00:00:06,2016-01-01 00:07:14,1,1.38,-73.994843,40.718498,1,N,-73.989807,40.734230,1,7.0,0.5,0.5,1.66,0.0,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6920925,2,2016-01-31 21:28:59,2016-01-31 22:01:58,1,7.83,-74.002953,40.750481,1,N,-73.958153,40.656689,1,29.0,0.5,0.5,5.00,0.0,0.3
6920926,2,2016-01-31 22:36:41,2016-01-31 22:45:04,1,2.50,-74.009277,40.717049,1,N,-73.994637,40.750488,1,9.5,0.5,0.5,2.16,0.0,0.3
6920927,2,2016-01-31 22:53:00,2016-01-31 22:59:37,1,1.68,-74.003578,40.750751,1,N,-74.002159,40.734909,1,7.0,0.5,0.5,1.00,0.0,0.3
6920928,2,2016-01-31 23:00:11,2016-01-31 23:12:08,1,2.65,-74.002159,40.734852,1,N,-73.999680,40.761669,1,11.0,0.5,0.5,1.00,0.0,0.3


In [34]:
trip_data["tpep_pickup_datetime"] = pd.to_datetime(trip_data["tpep_pickup_datetime"])
trip_data["tpep_dropoff_datetime"] = pd.to_datetime(trip_data["tpep_dropoff_datetime"])

In [35]:
trip_data.drop(["pickup_longitude", "dropoff_longitude", "pickup_latitude", "dropoff_latitude"], axis=1, inplace=True)

In [36]:
trip_data

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge
0,2,2016-01-01 00:00:03,2016-01-01 00:15:49,6,2.43,1,N,1,12.0,0.5,0.5,3.99,0.0,0.3
1,1,2016-01-01 00:00:04,2016-01-01 00:14:32,1,3.70,1,N,1,14.0,0.5,0.5,3.05,0.0,0.3
2,1,2016-01-01 00:00:05,2016-01-01 00:14:27,2,2.20,1,N,1,11.0,0.5,0.5,1.50,0.0,0.3
3,1,2016-01-01 00:00:06,2016-01-01 00:04:44,1,1.70,1,Y,1,7.0,0.5,0.5,1.65,0.0,0.3
4,2,2016-01-01 00:00:06,2016-01-01 00:07:14,1,1.38,1,N,1,7.0,0.5,0.5,1.66,0.0,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6920925,2,2016-01-31 21:28:59,2016-01-31 22:01:58,1,7.83,1,N,1,29.0,0.5,0.5,5.00,0.0,0.3
6920926,2,2016-01-31 22:36:41,2016-01-31 22:45:04,1,2.50,1,N,1,9.5,0.5,0.5,2.16,0.0,0.3
6920927,2,2016-01-31 22:53:00,2016-01-31 22:59:37,1,1.68,1,N,1,7.0,0.5,0.5,1.00,0.0,0.3
6920928,2,2016-01-31 23:00:11,2016-01-31 23:12:08,1,2.65,1,N,1,11.0,0.5,0.5,1.00,0.0,0.3


In [37]:
trip_data = pd.get_dummies(trip_data, columns=["store_and_fwd_flag"])

In [38]:
trip_data

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,store_and_fwd_flag_N,store_and_fwd_flag_Y
0,2,2016-01-01 00:00:03,2016-01-01 00:15:49,6,2.43,1,1,12.0,0.5,0.5,3.99,0.0,0.3,1,0
1,1,2016-01-01 00:00:04,2016-01-01 00:14:32,1,3.70,1,1,14.0,0.5,0.5,3.05,0.0,0.3,1,0
2,1,2016-01-01 00:00:05,2016-01-01 00:14:27,2,2.20,1,1,11.0,0.5,0.5,1.50,0.0,0.3,1,0
3,1,2016-01-01 00:00:06,2016-01-01 00:04:44,1,1.70,1,1,7.0,0.5,0.5,1.65,0.0,0.3,0,1
4,2,2016-01-01 00:00:06,2016-01-01 00:07:14,1,1.38,1,1,7.0,0.5,0.5,1.66,0.0,0.3,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6920925,2,2016-01-31 21:28:59,2016-01-31 22:01:58,1,7.83,1,1,29.0,0.5,0.5,5.00,0.0,0.3,1,0
6920926,2,2016-01-31 22:36:41,2016-01-31 22:45:04,1,2.50,1,1,9.5,0.5,0.5,2.16,0.0,0.3,1,0
6920927,2,2016-01-31 22:53:00,2016-01-31 22:59:37,1,1.68,1,1,7.0,0.5,0.5,1.00,0.0,0.3,1,0
6920928,2,2016-01-31 23:00:11,2016-01-31 23:12:08,1,2.65,1,1,11.0,0.5,0.5,1.00,0.0,0.3,1,0


In [39]:
X = trip_data.drop(["tip_amount"], axis=1)

In [40]:
X = trip_data.drop(["tpep_pickup_datetime", "tpep_dropoff_datetime"], axis=1)

In [41]:
X.shape

(6920930, 13)

In [42]:
X

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,store_and_fwd_flag_N,store_and_fwd_flag_Y
0,2,6,2.43,1,1,12.0,0.5,0.5,3.99,0.0,0.3,1,0
1,1,1,3.70,1,1,14.0,0.5,0.5,3.05,0.0,0.3,1,0
2,1,2,2.20,1,1,11.0,0.5,0.5,1.50,0.0,0.3,1,0
3,1,1,1.70,1,1,7.0,0.5,0.5,1.65,0.0,0.3,0,1
4,2,1,1.38,1,1,7.0,0.5,0.5,1.66,0.0,0.3,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6920925,2,1,7.83,1,1,29.0,0.5,0.5,5.00,0.0,0.3,1,0
6920926,2,1,2.50,1,1,9.5,0.5,0.5,2.16,0.0,0.3,1,0
6920927,2,1,1.68,1,1,7.0,0.5,0.5,1.00,0.0,0.3,1,0
6920928,2,1,2.65,1,1,11.0,0.5,0.5,1.00,0.0,0.3,1,0


In [43]:
X = X.head(1000000)

In [44]:
X.shape

(1000000, 13)

In [45]:
y = trip_data["tip_amount"].head(1000000)

In [46]:
y.shape

(1000000,)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [48]:
sklearn_rt = DecisionTreeRegressor(max_depth=8, random_state=42)

In [49]:
import time
t0 = time.time()
sklearn_rt.fit(X_train, y_train)
sklearn_time = time.time()-t0
print(sklearn_time)

0.8940703868865967


In [50]:
from snapml import DecisionTreeRegressor

In [51]:
snapml_rt = DecisionTreeRegressor(max_depth=8, random_state=45, n_jobs=4)

In [52]:
t0 = time.time()
snapml_rt = snapml_rt.fit(X_train.to_numpy(), y_train.to_numpy())
snapml_time = time.time()-t0
print(snapml_time)

0.14701437950134277
