In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().appName('Rideshare').getOrCreate()
sc = spark.sparkContext

In [2]:
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','4g')])

In [3]:
spark.sparkContext.getConf().getAll()

[('spark.eventLog.enabled', 'true'),
 ('spark.yarn.jars',
  'local:/opt/cloudera/parcels/CDH-6.3.0-1.cdh6.3.0.p0.1279813/lib/spark/jars/*,local:/opt/cloudera/parcels/CDH-6.3.0-1.cdh6.3.0.p0.1279813/lib/spark/hive/*'),
 ('spark.yarn.appMasterEnv.MKL_NUM_THREADS', '1'),
 ('spark.sql.queryExecutionListeners',
  'com.cloudera.spark.lineage.NavigatorQueryListener'),
 ('spark.lineage.log.dir', '/var/log/spark/lineage'),
 ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',
  'md01.rcc.local,md02.rcc.local'),
 ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),
 ('spark.executorEnv.PYTHONPATH',
  '/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.10.7-src.zip:/opt/cloudera/parcels/CDH/lib/spark/python/lib/pyspark.zip<CPS>/opt/cloudera/parcels/CDH-6.3.0-1.cdh6.3.0.p0.1279813/lib/spark/python/lib/py4j-0.10.7-src.zip<CPS>/opt/cloudera/parcels/CDH-6.3.0-1.cdh6.3.0.p0.1279813/lib/spark/python/lib/pyspark.zip'),
 ('spark.yarn.historyServer.addre

In [4]:
spark.sql('use trasley').show()

++
||
++
++



In [5]:
rideshares = spark.read.csv("/user/trasley/data/rideshare.csv", inferSchema=True, header=True)

In [6]:
rideshares = rideshares.withColumnRenamed("Trip ID", "trip_id")\
    .withColumnRenamed("Trip Start Timestamp","start_time")\
    .withColumnRenamed("Trip End Timestamp","end_time")\
    .withColumnRenamed("Trip Seconds","seconds")\
    .withColumnRenamed("Trip Miles","miles")\
    .withColumnRenamed("Pickup Census Tract","pickup_tract")\
    .withColumnRenamed("Dropoff Census Tract","dropoff_tract")\
    .withColumnRenamed("Pickup Community Area","pickup_comm_area")\
    .withColumnRenamed("Dropoff Community Area","dropoff_comm_area")\
    .withColumnRenamed("Fare","fare")\
    .withColumnRenamed("Tip","tip")\
    .withColumnRenamed("Additional Charges","add_charges")\
    .withColumnRenamed("Trip Total","total")\
    .withColumnRenamed("Shared Trip Authorized","shared_auth")\
    .withColumnRenamed("Trips Pooled","trips_pooled")\
    .withColumnRenamed("Pickup Centroid Latitude","pickup_lat_centroid")\
    .withColumnRenamed("Pickup Centroid Longitude","pickup_long_centroid")\
    .withColumnRenamed("Pickup Centroid Location","pickup_loc_centroid")\
    .withColumnRenamed("Dropoff Centroid Latitude","dropoff_lat_centroid")\
    .withColumnRenamed("Dropoff Centroid Longitude","dropoff_long_centroid")\
    .withColumnRenamed("Dropoff Centroid Location","dropoff_loc_centroid")
rideshares.show(10)

+--------------------+--------------------+--------------------+-------+-----+------------+-------------+----------------+-----------------+----+---+-----------+-----+-----------+------------+-------------------+--------------------+--------------------+--------------------+---------------------+--------------------+
|             trip_id|          start_time|            end_time|seconds|miles|pickup_tract|dropoff_tract|pickup_comm_area|dropoff_comm_area|fare|tip|add_charges|total|shared_auth|trips_pooled|pickup_lat_centroid|pickup_long_centroid| pickup_loc_centroid|dropoff_lat_centroid|dropoff_long_centroid|dropoff_loc_centroid|
+--------------------+--------------------+--------------------+-------+-----+------------+-------------+----------------+-----------------+----+---+-----------+-----+-----------+------------+-------------------+--------------------+--------------------+--------------------+---------------------+--------------------+
|6e22fde8d26ed1363...|05/27/2019 11:30:...|

In [7]:
from pyspark.sql.types import StringType,IntegerType,DateType,TimestampType
from pyspark.sql.functions import to_timestamp,from_unixtime,unix_timestamp

rideshares=rideshares.withColumn('start_time',from_unixtime(unix_timestamp('start_time', 'MM/dd/yyyy hh:mm:ss a')).cast(TimestampType())).withColumn('end_time',from_unixtime(unix_timestamp('start_time', 'MM/dd/yyyy hh:mm:ss a')).cast(TimestampType()))

In [8]:
#Limit date range for consistency across datasets
date_from='2018-11-01'
date_to='2020-06-30'
rideshares=rideshares.filter((rideshares.start_time>=date_from) & (rideshares.start_time<=date_to))
rideshares.count()

158532812

In [9]:
#Only keep rides that are not pooled rides - no similar concept for taxis
rideshares=rideshares.filter(rideshares.shared_auth==False)

In [10]:
from pyspark.sql.functions import when, count, col

#Show fields with null values
rideshares.select([count(when(col(i).isNull(), i)).alias(i) for i in rideshares.columns]).show()

+-------+----------+--------+-------+-----+------------+-------------+----------------+-----------------+----+---+-----------+-----+-----------+------------+-------------------+--------------------+-------------------+--------------------+---------------------+--------------------+
|trip_id|start_time|end_time|seconds|miles|pickup_tract|dropoff_tract|pickup_comm_area|dropoff_comm_area|fare|tip|add_charges|total|shared_auth|trips_pooled|pickup_lat_centroid|pickup_long_centroid|pickup_loc_centroid|dropoff_lat_centroid|dropoff_long_centroid|dropoff_loc_centroid|
+-------+----------+--------+-------+-----+------------+-------------+----------------+-----------------+----+---+-----------+-----+-----------+------------+-------------------+--------------------+-------------------+--------------------+---------------------+--------------------+
|      0|         0|       0| 171431| 4048|    35656511|     36176349|         8098147|          9021455| 101| 23|        101|  101|          0|       

In [11]:
#Drop rows where required fields are null - need them to accurately capture ride
rideshares=rideshares.dropna(subset=['pickup_comm_area', 'dropoff_comm_area','seconds','end_time','miles','fare'])

In [12]:
from pyspark.sql.functions import year,month,hour,minute,second

#rideshares.filter(col('seconds').isNull() & (hour(col("end_time"))-hour(col("start_time"))>=1)).count()
rideshares=rideshares.filter(~(col('seconds').isNull() & (hour(col("end_time"))-hour(col("start_time"))>=1)))

In [13]:
rideshares.select([count(when(col(i).isNull(), i)).alias(i) for i in rideshares.columns]).show()

+-------+----------+--------+-------+-----+------------+-------------+----------------+-----------------+----+---+-----------+-----+-----------+------------+-------------------+--------------------+-------------------+--------------------+---------------------+--------------------+
|trip_id|start_time|end_time|seconds|miles|pickup_tract|dropoff_tract|pickup_comm_area|dropoff_comm_area|fare|tip|add_charges|total|shared_auth|trips_pooled|pickup_lat_centroid|pickup_long_centroid|pickup_loc_centroid|dropoff_lat_centroid|dropoff_long_centroid|dropoff_loc_centroid|
+-------+----------+--------+-------+-----+------------+-------------+----------------+-----------------+----+---+-----------+-----+-----------+------------+-------------------+--------------------+-------------------+--------------------+---------------------+--------------------+
|      0|         0|       0|      0|    0|    24596026|     24596026|               0|                0|   0|  0|          0|    0|          0|       

In [14]:
from pyspark.sql.functions import date_format
from pyspark.sql.functions import col

#break out year, month, day of week, and hour that ride starts to be used as categorical features

rideshares=rideshares.withColumn('year',year("start_time")).withColumn('month',month("start_time")) \
            .withColumn('hour',hour("start_time")).withColumn("week_day", date_format(col("start_time"), "EEEE"))

In [15]:
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer


#bin hours in to 2 hour increments to be used as categories

rideshares=rideshares.withColumn('hourbin',F.floor((F.col('hour')/2)).cast('string'))

from pyspark.ml.feature import QuantileDiscretizer

discretizer = QuantileDiscretizer(numBuckets=12, inputCol="hour", outputCol="hour_bucket")

result = discretizer.fit(rideshares).transform(rideshares)
result.show()

droplist=['trip_id',\ #not meaningful
          'start_time'\ #broken out into categorical features
          ,'end_time'\ #using start time
          ,'tip'\ #not dependent on ride but rider
          ,'add_charges'\ #not dependent on ride but rider
          ,'total'\ #includes tip, additional charges
          ,'shared_auth'\ #no comparable process for taxi
          ,'trips_pooled'\ #no comparable process for taxi
          ,'pickup_lat_centroid','pickup_long_centroid', 'dropoff_lat_centroid'\ #community area is proxy
          ,'dropoff_long_centroid','pickup_loc_centroid','dropoff_loc_centroid'\ ##community area is proxy
          ,'pickup_tract','dropoff_tract'\ #not as meaningful as community area
          ,'hour'\ #binned
          ,'seconds' #highly correlated to fare - proxy for output
         ]
rideshares=rideshares.drop(*droplist)


In [16]:
droplist=['trip_id','start_time','end_time','tip','add_charges','total','shared_auth','trips_pooled','pickup_lat_centroid','pickup_long_centroid', 'dropoff_lat_centroid','dropoff_long_centroid','pickup_loc_centroid','dropoff_loc_centroid','pickup_tract','dropoff_tract','hour','seconds']
rideshares=rideshares.drop(*droplist)


In [17]:
rideshares.show()

+-----+----------------+-----------------+----+----+-----+--------+-------+
|miles|pickup_comm_area|dropoff_comm_area|fare|year|month|week_day|hourbin|
+-----+----------------+-----------------+----+----+-----+--------+-------+
|  3.7|              24|                5| 7.5|2019|    5|  Monday|     11|
| 12.0|              24|               41|20.0|2019|    6| Tuesday|     11|
|  2.7|              28|                8| 7.5|2019|    4|Saturday|      8|
|  2.0|               8|                8| 7.5|2019|    4|Thursday|      9|
| 10.2|               4|               32|17.5|2019|    4|Thursday|      4|
|  0.5|              32|               32| 7.5|2019|    6|Thursday|      8|
|  6.0|               6|               32|35.0|2019|    6|Saturday|      7|
|  1.0|              28|               28| 2.5|2019|    4|  Friday|      3|
|  1.3|               8|                8| 5.0|2019|    6|  Friday|     10|
|  2.6|               6|                7|12.5|2019|    4|Saturday|      8|
|  4.1|     

In [18]:
from pyspark.ml.feature import VectorAssembler

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator

#convert relevant categorical into one hot encoded
indexer1 = StringIndexer(inputCol="pickup_comm_area", outputCol="pickup_comm_Idx").setHandleInvalid("skip")
indexer2 = StringIndexer(inputCol="dropoff_comm_area", outputCol="dropoff_comm_Idx").setHandleInvalid("skip")
indexer3 = StringIndexer(inputCol="hourbin", outputCol="hour_Idx").setHandleInvalid("skip")
indexer4 = StringIndexer(inputCol="week_day", outputCol="day_Idx").setHandleInvalid("skip")
indexer5 = StringIndexer(inputCol="year", outputCol="year_Idx").setHandleInvalid("skip")
indexer6 = StringIndexer(inputCol="month", outputCol="month_Idx").setHandleInvalid("skip")

#gather all indexers as inputs to the One Hot Encoder
inputs = [indexer1.getOutputCol(), indexer2.getOutputCol(),indexer3.getOutputCol(),\
          indexer4.getOutputCol(),indexer5.getOutputCol(),indexer6.getOutputCol()]

#create the one hot encoder
encoder = OneHotEncoderEstimator(inputCols=inputs,  \
                                 outputCols=["pickup_comm_Vec", "dropoff_comm_Vec","hour_Vec","day_Vec"\
                                            ,"year_Vec","month_Vec"])

#run it through a pipeline
pipeline = Pipeline(stages=[indexer1, indexer2, indexer3, indexer4, indexer5, indexer6, encoder])
encodedRideshare = pipeline.fit(rideshares).transform(rideshares)

encodedRideshare.show(5)

+-----+----------------+-----------------+----+----+-----+--------+-------+---------------+----------------+--------+-------+--------+---------+--------------+----------------+-------------+---------------+--------------+-------------+
|miles|pickup_comm_area|dropoff_comm_area|fare|year|month|week_day|hourbin|pickup_comm_Idx|dropoff_comm_Idx|hour_Idx|day_Idx|year_Idx|month_Idx|     month_Vec|dropoff_comm_Vec|     year_Vec|pickup_comm_Vec|      hour_Vec|      day_Vec|
+-----+----------------+-----------------+----+----+-----+--------+-------+---------------+----------------+--------+-------+--------+---------+--------------+----------------+-------------+---------------+--------------+-------------+
|  3.7|              24|                5| 7.5|2019|    5|  Monday|     11|            4.0|            12.0|     3.0|    6.0|     0.0|      6.0|(11,[6],[1.0])| (76,[12],[1.0])|(2,[0],[1.0])| (76,[4],[1.0])|(11,[3],[1.0])|    (6,[],[])|
| 12.0|              24|               41|20.0|2019|    

In [19]:
assembler = VectorAssembler(inputCols = ['pickup_comm_Vec', 'dropoff_comm_Vec', \
                                         'hour_Vec', 'day_Vec', 'year_Vec','month_Vec','miles'], \
                            outputCol = 'features')

encodedRideshare = assembler.transform(encodedRideshare)
encodedRideshare.show()

+-----+----------------+-----------------+----+----+-----+--------+-------+---------------+----------------+--------+-------+--------+---------+--------------+----------------+-------------+---------------+---------------+-------------+--------------------+
|miles|pickup_comm_area|dropoff_comm_area|fare|year|month|week_day|hourbin|pickup_comm_Idx|dropoff_comm_Idx|hour_Idx|day_Idx|year_Idx|month_Idx|     month_Vec|dropoff_comm_Vec|     year_Vec|pickup_comm_Vec|       hour_Vec|      day_Vec|            features|
+-----+----------------+-----------------+----+----+-----+--------+-------+---------------+----------------+--------+-------+--------+---------+--------------+----------------+-------------+---------------+---------------+-------------+--------------------+
|  3.7|              24|                5| 7.5|2019|    5|  Monday|     11|            4.0|            12.0|     3.0|    6.0|     0.0|      6.0|(11,[6],[1.0])| (76,[12],[1.0])|(2,[0],[1.0])| (76,[4],[1.0])| (11,[3],[1.0])|    

In [20]:
encodedRideshare.select('features').show(5,False)

+----------------------------------------------------------------+
|features                                                        |
+----------------------------------------------------------------+
|(183,[4,88,155,169,177,182],[1.0,1.0,1.0,1.0,1.0,3.7])          |
|(183,[4,87,155,168,169,176,182],[1.0,1.0,1.0,1.0,1.0,1.0,12.0]) |
|(183,[2,76,153,163,169,178,182],[1.0,1.0,1.0,1.0,1.0,1.0,2.7])  |
|(183,[0,76,152,165,169,178,182],[1.0,1.0,1.0,1.0,1.0,1.0,2.0])  |
|(183,[15,77,156,165,169,178,182],[1.0,1.0,1.0,1.0,1.0,1.0,10.2])|
+----------------------------------------------------------------+
only showing top 5 rows



In [21]:
#split data into train and test
rideshare_train, rideshare_test = encodedRideshare.randomSplit([.6,.4],seed=1234)
rideshare_test, rideshare_holdout = encodedRideshare.randomSplit([.7,.3],seed=1234)

rideshare_train.show(1)

+-----+----------------+-----------------+----+----+-----+--------+-------+---------------+----------------+--------+-------+--------+---------+--------------+----------------+-------------+---------------+--------------+-------------+--------------------+
|miles|pickup_comm_area|dropoff_comm_area|fare|year|month|week_day|hourbin|pickup_comm_Idx|dropoff_comm_Idx|hour_Idx|day_Idx|year_Idx|month_Idx|     month_Vec|dropoff_comm_Vec|     year_Vec|pickup_comm_Vec|      hour_Vec|      day_Vec|            features|
+-----+----------------+-----------------+----+----+-----+--------+-------+---------------+----------------+--------+-------+--------+---------+--------------+----------------+-------------+---------------+--------------+-------------+--------------------+
|  0.0|               2|               14| 5.0|2019|    4|Thursday|      7|           22.0|            23.0|     5.0|    2.0|     0.0|      7.0|(11,[7],[1.0])| (76,[23],[1.0])|(2,[0],[1.0])|(76,[22],[1.0])|(11,[5],[1.0])|(6,[2],[

In [22]:
from pyspark.ml.regression import LinearRegression

#Elastic Net
lr = LinearRegression(featuresCol = 'features', labelCol='fare', regParam=0.3, elasticNetParam=0.8, maxIter=10)
lrm = lr.fit(rideshare_train)

#coefficients
print("Coefficients: " + str(lrm.coefficients))
print("Intercept: " + str(lrm.intercept))

#model summary
print("RMSE: %f" % lrm.summary.rootMeanSquaredError)
print("r2: %f" % lrm.summary.r2)


Coefficients: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09882202468715107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.46090726291808104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25143020439742125,0.0,0.0,0.0,0.0,0.0,0.15725424964248338,0.0,0.0,0.0,0.0,0.0,1.3467124665424965]
Intercept: 4.15834397063148
RMSE: 3.642618
r2: 0.768659


In [23]:
sigfeature=[]
[sigfeature.append(i) for i,j in enumerate(lrm.coefficients) if lrm.coefficients[i]!=0]
sigfeature

[83, 153, 170, 176, 182]

In [25]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

lr2 = LinearRegression(featuresCol = 'features', labelCol='fare')

lr_paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.maxIter, [1, 5, 10])\
    .build()
    
lrevaluator = RegressionEvaluator(predictionCol="prediction", labelCol="fare", metricName="rmse")

lrcv = CrossValidator(estimator = lr,
                    estimatorParamMaps = lr_paramGrid,
                    evaluator = lrevaluator,
                    numFolds = 5)

lrcvModel = lrcv.fit(rideshare_train)
print(lrcvModel)

CrossValidatorModel_ed0e43f3b261


In [26]:
lrcvSummary = lrcvModel.bestModel.summary
print("Coefficient Standard Errors: " + str(lrcvSummary.coefficientStandardErrors))
print("P Values: " + str(lrcvSummary.pValues)) # Last element is the intercept

Coefficient Standard Errors: [0.011610783760727892, 0.01164662801462127, 0.011651587290544921, 0.011674864143286954, 0.01168286180398206, 0.011696155257345249, 0.011768095743187306, 0.011988551935415469, 0.011912401834199506, 0.011907952980855459, 0.01210262307381252, 0.012166302861714294, 0.012146881606747099, 0.012240167707184012, 0.012246628780899058, 0.012280419785085702, 0.012328836962764513, 0.012354224961121506, 0.012407090138776606, 0.012652986143587569, 0.012610143569803391, 0.012740043464934327, 0.012833261898563274, 0.012873732616771971, 0.012852150665629369, 0.012909667577090937, 0.012962579203950839, 0.013123074218935559, 0.013031796351540382, 0.013163343717346813, 0.013134690072611652, 0.013215863058003463, 0.013169990786975217, 0.013198613280287801, 0.013169130691836462, 0.013337419168982083, 0.013408208491484792, 0.013569360660518042, 0.013718361742106347, 0.013654729995024957, 0.013765449496471708, 0.013706780741535363, 0.013855096574490794, 0.014158307338931133, 0.014

In [27]:
lrpredictions = lrcvModel.transform(rideshare_test)
print('RMSE:', lrevaluator.evaluate(lrpredictions))

RMSE: 3.5431101472037945


In [28]:
#0-75
encodedRideshare.select('pickup_comm_area','pickup_comm_Vec').distinct().sort('pickup_comm_Vec').show(100)

+----------------+---------------+
|pickup_comm_area|pickup_comm_Vec|
+----------------+---------------+
|              55|     (76,[],[])|
|               8| (76,[0],[1.0])|
|              32| (76,[1],[1.0])|
|              28| (76,[2],[1.0])|
|               6| (76,[3],[1.0])|
|              24| (76,[4],[1.0])|
|               7| (76,[5],[1.0])|
|              22| (76,[6],[1.0])|
|              76| (76,[7],[1.0])|
|               3| (76,[8],[1.0])|
|              33| (76,[9],[1.0])|
|              77|(76,[10],[1.0])|
|              41|(76,[11],[1.0])|
|               5|(76,[12],[1.0])|
|              31|(76,[13],[1.0])|
|              21|(76,[14],[1.0])|
|               4|(76,[15],[1.0])|
|              56|(76,[16],[1.0])|
|              16|(76,[17],[1.0])|
|               1|(76,[18],[1.0])|
|              25|(76,[19],[1.0])|
|              23|(76,[20],[1.0])|
|              43|(76,[21],[1.0])|
|               2|(76,[22],[1.0])|
|              19|(76,[23],[1.0])|
|              14|(7

In [29]:
#76-151, look at 8 (83rd position)
encodedRideshare.select('dropoff_comm_area','dropoff_comm_Vec').distinct().sort('dropoff_comm_Vec').show(100)

+-----------------+----------------+
|dropoff_comm_area|dropoff_comm_Vec|
+-----------------+----------------+
|               55|      (76,[],[])|
|                8|  (76,[0],[1.0])|
|               32|  (76,[1],[1.0])|
|               28|  (76,[2],[1.0])|
|                6|  (76,[3],[1.0])|
|               24|  (76,[4],[1.0])|
|                7|  (76,[5],[1.0])|
|               22|  (76,[6],[1.0])|
|               76|  (76,[7],[1.0])|
|               33|  (76,[8],[1.0])|
|                3|  (76,[9],[1.0])|
|               77| (76,[10],[1.0])|
|               41| (76,[11],[1.0])|
|                5| (76,[12],[1.0])|
|               56| (76,[13],[1.0])|
|                4| (76,[14],[1.0])|
|               21| (76,[15],[1.0])|
|               31| (76,[16],[1.0])|
|               16| (76,[17],[1.0])|
|                1| (76,[18],[1.0])|
|               25| (76,[19],[1.0])|
|               23| (76,[20],[1.0])|
|               43| (76,[21],[1.0])|
|                2| (76,[22],[1.0])|
|

In [30]:
#152-162, look at 2nd (153rd overall)
encodedRideshare.select('hourbin','hour_Vec').distinct().sort('hour_Vec').show(100)

+-------+---------------+
|hourbin|       hour_Vec|
+-------+---------------+
|      2|     (11,[],[])|
|      9| (11,[0],[1.0])|
|      8| (11,[1],[1.0])|
|     10| (11,[2],[1.0])|
|     11| (11,[3],[1.0])|
|      4| (11,[4],[1.0])|
|      7| (11,[5],[1.0])|
|      6| (11,[6],[1.0])|
|      5| (11,[7],[1.0])|
|      3| (11,[8],[1.0])|
|      0| (11,[9],[1.0])|
|      1|(11,[10],[1.0])|
+-------+---------------+



In [31]:
#163-168
encodedRideshare.select('week_day','day_Vec').distinct().sort('day_Vec').show(100)

+---------+-------------+
| week_day|      day_Vec|
+---------+-------------+
|   Monday|    (6,[],[])|
| Saturday|(6,[0],[1.0])|
|   Friday|(6,[1],[1.0])|
| Thursday|(6,[2],[1.0])|
|   Sunday|(6,[3],[1.0])|
|Wednesday|(6,[4],[1.0])|
|  Tuesday|(6,[5],[1.0])|
+---------+-------------+



In [32]:
#171-181, 176- (#182=miles)
encodedRideshare.select('month','month_Vec').distinct().sort('month_Vec').show(100)

+-----+---------------+
|month|      month_Vec|
+-----+---------------+
|    9|     (11,[],[])|
|   11| (11,[0],[1.0])|
|   12| (11,[1],[1.0])|
|    2| (11,[2],[1.0])|
|    1| (11,[3],[1.0])|
|    3| (11,[4],[1.0])|
|    6| (11,[5],[1.0])|
|    5| (11,[6],[1.0])|
|    4| (11,[7],[1.0])|
|   10| (11,[8],[1.0])|
|    8| (11,[9],[1.0])|
|    7|(11,[10],[1.0])|
+-----+---------------+



In [33]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor


gbt = GBTRegressor(maxDepth=5, labelCol="fare", predictionCol="prediction")

gbt_rideshare = gbt.fit(rideshare_train)

gbt_rideshare_predictions = gbt_rideshare.transform(rideshare_test)


In [34]:
evaluator = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="rmse")

rmse = evaluator.evaluate(gbt_rideshare_predictions)

In [35]:
rmse

3.521888418342016

In [36]:
chicago_taxi = spark.read.csv('/user/trasley/data/chicagotaxi.csv', inferSchema=True, header=True)

In [37]:
chicago_taxi = chicago_taxi.withColumnRenamed("Trip ID", "trip_id")\
    .withColumnRenamed("Taxi ID", "taxi_id")\
    .withColumnRenamed("Trip Start Timestamp", "start_time")\
    .withColumnRenamed("Trip End Timestamp", "end_time")\
    .withColumnRenamed("Trip Seconds", "seconds")\
    .withColumnRenamed("Trip Miles", "miles")\
    .withColumnRenamed("Pickup Census Tract", "pickup_tract")\
    .withColumnRenamed("Dropoff Census Tract", "dropoff_tract")\
    .withColumnRenamed("Pickup Community Area", "pickup_comm_area")\
    .withColumnRenamed("Dropoff Community Area", "dropoff_comm_area")\
    .withColumnRenamed("Fare", "fare")\
    .withColumnRenamed("Tips", "tip")\
    .withColumnRenamed("Tolls", "tolls")\
    .withColumnRenamed("Extras", "extras")\
    .withColumnRenamed("Trip Total","total")\
    .withColumnRenamed("Payment Type", "payment_type")\
    .withColumnRenamed("Company", "company")\
    .withColumnRenamed("Pickup Centroid Latitude","pickup_lat_centroid")\
    .withColumnRenamed("Pickup Centroid Longitude","pickup_long_centroid")\
    .withColumnRenamed("Pickup Centroid Location","pickup_loc_centroid")\
    .withColumnRenamed("Dropoff Centroid Latitude","dropoff_lat_centroid")\
    .withColumnRenamed("Dropoff Centroid Longitude","dropoff_long_centroid")\
    .withColumnRenamed("Dropoff Centroid  Location","dropoff_loc_centroid")
chicago_taxi.show(10)

+--------------------+--------------------+--------------------+--------------------+-------+-----+------------+-------------+----------------+-----------------+-----+---+-----+------+-----+------------+---------+-------------------+--------------------+--------------------+--------------------+---------------------+--------------------+
|             trip_id|             taxi_id|          start_time|            end_time|seconds|miles|pickup_tract|dropoff_tract|pickup_comm_area|dropoff_comm_area| fare|tip|tolls|extras|total|payment_type|  company|pickup_lat_centroid|pickup_long_centroid| pickup_loc_centroid|dropoff_lat_centroid|dropoff_long_centroid|dropoff_loc_centroid|
+--------------------+--------------------+--------------------+--------------------+-------+-----+------------+-------------+----------------+-----------------+-----+---+-----+------+-----+------------+---------+-------------------+--------------------+--------------------+--------------------+---------------------+--

In [38]:
chicago_taxi=chicago_taxi.withColumn('start_time',from_unixtime(unix_timestamp('start_time', 'MM/dd/yyyy hh:mm:ss a')).cast(TimestampType())).withColumn('end_time',from_unixtime(unix_timestamp('start_time', 'MM/dd/yyyy hh:mm:ss a')).cast(TimestampType()))

In [39]:
#Limit date range for consistency across datasets
date_from='2018-11-01'
date_to='2020-06-30'
chicago_taxi=chicago_taxi.filter((chicago_taxi.start_time>=date_from) & (chicago_taxi.start_time<=date_to))
chicago_taxi.count()

22476958

In [40]:
#Drop rows where required fields are null - need them to accurately capture ride
chicago_taxi=chicago_taxi.dropna(subset=['pickup_comm_area', 'dropoff_comm_area','seconds','end_time','miles','fare'])

In [41]:
from pyspark.sql.functions import col


#rideshares.filter(col('seconds').isNull() & (hour(col("end_time"))-hour(col("start_time"))>=1)).count()
chicago_taxi=chicago_taxi.filter(~(col('seconds').isNull() & (hour(col("end_time"))-hour(col("start_time"))>=1)))

In [42]:
from pyspark.sql.functions import when, count

chicago_taxi.select([count(when(col(i).isNull(), i)).alias(i) for i in chicago_taxi.columns]).show()

+-------+-------+----------+--------+-------+-----+------------+-------------+----------------+-----------------+----+---+------+------+-----+------------+-------+-------------------+--------------------+-------------------+--------------------+---------------------+--------------------+
|trip_id|taxi_id|start_time|end_time|seconds|miles|pickup_tract|dropoff_tract|pickup_comm_area|dropoff_comm_area|fare|tip| tolls|extras|total|payment_type|company|pickup_lat_centroid|pickup_long_centroid|pickup_loc_centroid|dropoff_lat_centroid|dropoff_long_centroid|dropoff_loc_centroid|
+-------+-------+----------+--------+-------+-----+------------+-------------+----------------+-----------------+----+---+------+------+-----+------------+-------+-------------------+--------------------+-------------------+--------------------+---------------------+--------------------+
|      0|      0|         0|       0|      0|    0|     5739078|      5739078|               0|                0|   0|  0|562951|    

In [43]:
from pyspark.sql.functions import year,month,hour,minute,second,date_format,col

#break out year, month, day of week, and hour that ride starts to be used as categorical features

chicago_taxi=chicago_taxi.withColumn('year',year("start_time")).withColumn('month',month("start_time")) \
            .withColumn('hour',hour("start_time")).withColumn("week_day", date_format(col("start_time"), "EEEE"))

In [44]:
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer


#bin hours in to 2 hour increments to be used as categories

chicago_taxi=chicago_taxi.withColumn('hourbin',F.floor((F.col('hour')/2)).cast('string'))

In [45]:
droplist=['trip_id','taxi_id','start_time','end_time','tip','extras','total','Tolls','payment_type','company','pickup_lat_centroid','pickup_long_centroid', 'dropoff_lat_centroid','dropoff_long_centroid','pickup_loc_centroid','dropoff_loc_centroid','pickup_tract','dropoff_tract','hour','seconds','dropoff_loc_centroid']
chicago_taxi=chicago_taxi.drop(*droplist)

In [46]:
chicago_taxi.show()

+-----+----------------+-----------------+-----+----+-----+---------+-------+
|miles|pickup_comm_area|dropoff_comm_area| fare|year|month| week_day|hourbin|
+-----+----------------+-----------------+-----+----+-----+---------+-------+
| 2.53|              22|               22| 11.0|2019|    2| Saturday|      1|
| 0.35|               3|                3|  4.0|2019|    2|  Tuesday|      8|
|  1.0|              77|                3|  6.0|2019|    2| Thursday|      4|
| 0.67|               6|                3|  5.0|2019|    2|  Tuesday|      3|
| 0.63|              77|               77|  5.0|2019|    3| Saturday|      6|
| 3.87|              77|               77| 13.0|2019|    3|Wednesday|      3|
| 19.8|              33|               76|48.75|2018|   11|   Monday|      6|
|  1.3|              32|                8|  7.5|2018|   11|   Friday|      6|
| 0.81|              28|               32|  6.0|2018|   11|   Friday|      8|
|17.27|              76|                8| 50.0|2018|   11| Satu

In [47]:
from pyspark.ml.feature import VectorAssembler

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator

#convert relevant categorical into one hot encoded
indexer1 = StringIndexer(inputCol="pickup_comm_area", outputCol="pickup_comm_Idx").setHandleInvalid("skip")
indexer2 = StringIndexer(inputCol="dropoff_comm_area", outputCol="dropoff_comm_Idx").setHandleInvalid("skip")
indexer3 = StringIndexer(inputCol="hourbin", outputCol="hour_Idx").setHandleInvalid("skip")
indexer4 = StringIndexer(inputCol="week_day", outputCol="day_Idx").setHandleInvalid("skip")
indexer5 = StringIndexer(inputCol="year", outputCol="year_Idx").setHandleInvalid("skip")
indexer6 = StringIndexer(inputCol="month", outputCol="month_Idx").setHandleInvalid("skip")

#gather all indexers as inputs to the One Hot Encoder
inputs = [indexer1.getOutputCol(), indexer2.getOutputCol(),indexer3.getOutputCol(),\
          indexer4.getOutputCol(),indexer5.getOutputCol(),indexer6.getOutputCol()]

#create the one hot encoder
encoder = OneHotEncoderEstimator(inputCols=inputs,  \
                                 outputCols=["pickup_comm_Vec", "dropoff_comm_Vec","hour_Vec","day_Vec"\
                                            ,"year_Vec","month_Vec"])

#run it through a pipeline
pipeline = Pipeline(stages=[indexer1, indexer2, indexer3, indexer4, indexer5, indexer6, encoder])
encodedTaxi = pipeline.fit(chicago_taxi).transform(chicago_taxi)

encodedTaxi.show(5)

+-----+----------------+-----------------+----+----+-----+--------+-------+---------------+----------------+--------+-------+--------+---------+--------------+----------------+-------------+---------------+---------------+-------------+
|miles|pickup_comm_area|dropoff_comm_area|fare|year|month|week_day|hourbin|pickup_comm_Idx|dropoff_comm_Idx|hour_Idx|day_Idx|year_Idx|month_Idx|     month_Vec|dropoff_comm_Vec|     year_Vec|pickup_comm_Vec|       hour_Vec|      day_Vec|
+-----+----------------+-----------------+----+----+-----+--------+-------+---------------+----------------+--------+-------+--------+---------+--------------+----------------+-------------+---------------+---------------+-------------+
| 2.53|              22|               22|11.0|2019|    2|Saturday|      1|           13.0|             9.0|    10.0|    5.0|     0.0|      2.0|(11,[2],[1.0])|  (76,[9],[1.0])|(2,[0],[1.0])|(76,[13],[1.0])|(11,[10],[1.0])|(6,[5],[1.0])|
| 0.35|               3|                3| 4.0|2019|

In [48]:
assembler = VectorAssembler(inputCols = ['pickup_comm_Vec', 'dropoff_comm_Vec', \
                                         'hour_Vec', 'day_Vec', 'year_Vec','month_Vec','miles'], \
                            outputCol = 'features')

encodedTaxi = assembler.transform(encodedTaxi)
encodedTaxi.show()

+-----+----------------+-----------------+-----+----+-----+---------+-------+---------------+----------------+--------+-------+--------+---------+--------------+----------------+-------------+---------------+---------------+-------------+--------------------+
|miles|pickup_comm_area|dropoff_comm_area| fare|year|month| week_day|hourbin|pickup_comm_Idx|dropoff_comm_Idx|hour_Idx|day_Idx|year_Idx|month_Idx|     month_Vec|dropoff_comm_Vec|     year_Vec|pickup_comm_Vec|       hour_Vec|      day_Vec|            features|
+-----+----------------+-----------------+-----+----+-----+---------+-------+---------------+----------------+--------+-------+--------+---------+--------------+----------------+-------------+---------------+---------------+-------------+--------------------+
| 2.53|              22|               22| 11.0|2019|    2| Saturday|      1|           13.0|             9.0|    10.0|    5.0|     0.0|      2.0|(11,[2],[1.0])|  (76,[9],[1.0])|(2,[0],[1.0])|(76,[13],[1.0])|(11,[10],[1.

In [49]:
#split data into train and test
taxi_train, taxi_test = encodedTaxi.randomSplit([.6,.4],seed=1234)
taxi_test, taxi_holdout = taxi_test.randomSplit([.7,.3],seed=1234)

taxi_train.show(1)

+-----+----------------+-----------------+----+----+-----+--------+-------+---------------+----------------+--------+-------+--------+---------+--------------+----------------+-------------+---------------+--------------+-------------+--------------------+
|miles|pickup_comm_area|dropoff_comm_area|fare|year|month|week_day|hourbin|pickup_comm_Idx|dropoff_comm_Idx|hour_Idx|day_Idx|year_Idx|month_Idx|     month_Vec|dropoff_comm_Vec|     year_Vec|pickup_comm_Vec|      hour_Vec|      day_Vec|            features|
+-----+----------------+-----------------+----+----+-----+--------+-------+---------------+----------------+--------+-------+--------+---------+--------------+----------------+-------------+---------------+--------------+-------------+--------------------+
| 0.35|               3|                3| 4.0|2019|    2| Tuesday|      8|            8.0|             8.0|     0.0|    3.0|     0.0|      2.0|(11,[2],[1.0])|  (76,[8],[1.0])|(2,[0],[1.0])| (76,[8],[1.0])|(11,[0],[1.0])|(6,[3],[

In [50]:
from pyspark.ml.regression import LinearRegression

#Elastic Net
lr = LinearRegression(featuresCol = 'features', labelCol='fare', regParam=0.3, elasticNetParam=0.8, maxIter=10)
lr_taxi = lr.fit(taxi_train)

#coefficients
print("Coefficients: " + str(lr_taxi.coefficients))
print("Intercept: " + str(lr_taxi.intercept))

#model summary
print("RMSE: %f" % lr_taxi.summary.rootMeanSquaredError)
print("r2: %f" % lr_taxi.summary.r2)


Coefficients: [0.0,0.0,0.0,10.567166273016595,0.0,0.0,0.0,9.823950741467195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.449024797548444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.3946823464526221,-1.3588740130096368,-1.0648677870277303,0.0,0.0,7.546241779951226,0.0,0.0,0.0,0.0,0.0,5.067586651767389,0.0,0.0,0.0,0.10996299549589296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5208161801136515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.80706728070709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5086079511809427]
Intercept: 8.445981483184378
RMSE: 61.793379
r2: 0.033665


In [51]:
sigfeature=[]
[sigfeature.append(i) for i,j in enumerate(lr_taxi.coefficients) if lr_taxi.coefficients[i]!=0]
sigfeature

[3, 7, 15, 76, 77, 78, 81, 87, 91, 101, 134, 182]

lr2 = LinearRegression(featuresCol = 'features', labelCol='fare')

lr_paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.maxIter, [1, 5, 10]) \
    .build()
    
lrevaluater = RegressionEvaluator(predictionCol="prediction", labelCol="fare", metricName="rmse")

lrcv = CrossValidator(estimator = lr,
                    estimatorParamMaps = lrparamGrid,
                    evaluator = lrevaluator,
                    numFolds = 5)

lrcvModel = lrcv.fit(taxi_train)
print(lrcvModel)

lrcvSummary = lrcvModel.bestModel.summary
print("Coefficient Standard Errors: " + str(lrcvSummary.coefficientStandardErrors))
print("P Values: " + str(lrcvSummary.pValues)) # Last element is the intercept

lrpredictions = lrcvModel.transform(taxi_test)
print('RMSE:', lrevaluator.evaluate(lrpredictions))

In [53]:
#0-75 - 3=Lakeview, 7=O'Hare, 15=Lincoln Square
encodedTaxi.select('pickup_comm_area','pickup_comm_Vec').distinct().sort('pickup_comm_Vec').show(100)

+----------------+---------------+
|pickup_comm_area|pickup_comm_Vec|
+----------------+---------------+
|              55|     (76,[],[])|
|               8| (76,[0],[1.0])|
|              32| (76,[1],[1.0])|
|              28| (76,[2],[1.0])|
|              76| (76,[3],[1.0])|
|               6| (76,[4],[1.0])|
|              33| (76,[5],[1.0])|
|               7| (76,[6],[1.0])|
|              56| (76,[7],[1.0])|
|               3| (76,[8],[1.0])|
|              24| (76,[9],[1.0])|
|              77|(76,[10],[1.0])|
|               2|(76,[11],[1.0])|
|               1|(76,[12],[1.0])|
|              22|(76,[13],[1.0])|
|              16|(76,[14],[1.0])|
|              41|(76,[15],[1.0])|
|               4|(76,[16],[1.0])|
|              35|(76,[17],[1.0])|
|              21|(76,[18],[1.0])|
|              14|(76,[19],[1.0])|
|              11|(76,[20],[1.0])|
|               5|(76,[21],[1.0])|
|              15|(76,[22],[1.0])|
|              39|(76,[23],[1.0])|
|              59|(7

In [54]:
#76-151, 77=The Loop, 78=Near West Side, 81=O'Hare, 87=Garfield Ridge, 91=Irving park, 101=Norwood Park, 134=Brighton Park
encodedTaxi.select('dropoff_comm_area','dropoff_comm_Vec').distinct().sort('dropoff_comm_Vec').show(100)

+-----------------+----------------+
|dropoff_comm_area|dropoff_comm_Vec|
+-----------------+----------------+
|               55|      (76,[],[])|
|                8|  (76,[0],[1.0])|
|               32|  (76,[1],[1.0])|
|               28|  (76,[2],[1.0])|
|                7|  (76,[3],[1.0])|
|                6|  (76,[4],[1.0])|
|               76|  (76,[5],[1.0])|
|               24|  (76,[6],[1.0])|
|               33|  (76,[7],[1.0])|
|                3|  (76,[8],[1.0])|
|               22|  (76,[9],[1.0])|
|               77| (76,[10],[1.0])|
|               56| (76,[11],[1.0])|
|                5| (76,[12],[1.0])|
|                4| (76,[13],[1.0])|
|                2| (76,[14],[1.0])|
|               16| (76,[15],[1.0])|
|               41| (76,[16],[1.0])|
|                1| (76,[17],[1.0])|
|               21| (76,[18],[1.0])|
|               35| (76,[19],[1.0])|
|               14| (76,[20],[1.0])|
|               31| (76,[21],[1.0])|
|               34| (76,[22],[1.0])|
|

In [55]:
#152-162 - 
encodedTaxi.select('hourbin','hour_Vec').distinct().sort('hour_Vec').show(100)

+-------+---------------+
|hourbin|       hour_Vec|
+-------+---------------+
|      2|     (11,[],[])|
|      8| (11,[0],[1.0])|
|      9| (11,[1],[1.0])|
|      7| (11,[2],[1.0])|
|      6| (11,[3],[1.0])|
|      5| (11,[4],[1.0])|
|      4| (11,[5],[1.0])|
|     10| (11,[6],[1.0])|
|     11| (11,[7],[1.0])|
|      3| (11,[8],[1.0])|
|      0| (11,[9],[1.0])|
|      1|(11,[10],[1.0])|
+-------+---------------+



In [56]:
#163-168 
encodedTaxi.select('week_day','day_Vec').distinct().sort('day_Vec').show(100)

+---------+-------------+
| week_day|      day_Vec|
+---------+-------------+
|   Sunday|    (6,[],[])|
| Thursday|(6,[0],[1.0])|
|   Friday|(6,[1],[1.0])|
|Wednesday|(6,[2],[1.0])|
|  Tuesday|(6,[3],[1.0])|
|   Monday|(6,[4],[1.0])|
| Saturday|(6,[5],[1.0])|
+---------+-------------+



In [57]:
#169,170
encodedTaxi.select('year','year_Vec').distinct().sort('year_Vec').show(100)

+----+-------------+
|year|     year_Vec|
+----+-------------+
|2020|    (2,[],[])|
|2019|(2,[0],[1.0])|
|2018|(2,[1],[1.0])|
+----+-------------+



In [58]:
#171-181
encodedTaxi.select('month','month_Vec').distinct().sort('month_Vec').show(100)

+-----+---------------+
|month|      month_Vec|
+-----+---------------+
|    7|     (11,[],[])|
|   11| (11,[0],[1.0])|
|   12| (11,[1],[1.0])|
|    2| (11,[2],[1.0])|
|    1| (11,[3],[1.0])|
|    3| (11,[4],[1.0])|
|    5| (11,[5],[1.0])|
|    6| (11,[6],[1.0])|
|   10| (11,[7],[1.0])|
|    8| (11,[8],[1.0])|
|    9| (11,[9],[1.0])|
|    4|(11,[10],[1.0])|
+-----+---------------+



In [59]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor


gbt = GBTRegressor(maxDepth=5, labelCol="fare", predictionCol="prediction")

gbt_taxi = gbt.fit(taxi_train)

gbt_taxi_predictions = gbt_taxi.transform(taxi_test)


In [60]:
evaluator = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="rmse")

rmse = evaluator.evaluate(gbt_taxi_predictions)
rmse

58.49310794570708

In [62]:
all_holdout=rideshare_holdout.union(taxi_holdout)
holdout_fares=all_holdout.select('fare')
holdout_x=all_holdout.drop('fare')
taxi_lm_holdout_predictions=lr_taxi.transform(holdout_x)
rideshare_lm_holdout_predictions=lrm.transform(holdout_x)

In [64]:
rideshare_lm_holdout_predictions.select('prediction').show()

+-----------------+
|       prediction|
+-----------------+
| 4.15834397063148|
| 4.15834397063148|
| 4.15834397063148|
|4.619251233549561|
|4.315598220273964|
|4.315598220273964|
| 4.15834397063148|
| 4.15834397063148|
| 4.15834397063148|
| 4.15834397063148|
|4.315598220273964|
| 4.15834397063148|
| 4.15834397063148|
| 4.15834397063148|
|4.315598220273964|
| 4.15834397063148|
| 4.15834397063148|
| 4.15834397063148|
| 4.15834397063148|
| 4.15834397063148|
+-----------------+
only showing top 20 rows



In [65]:
taxi_lm_holdout_predictions.select('prediction').show()

+------------------+
|        prediction|
+------------------+
| 8.445981483184378|
| 8.445981483184378|
| 8.445981483184378|
|19.013147756200972|
|15.992223263135603|
|15.992223263135603|
|7.0871074701747405|
| 7.051299136731755|
| 7.051299136731755|
| 7.051299136731755|
| 7.051299136731755|
| 7.051299136731755|
| 7.051299136731755|
| 7.051299136731755|
| 7.051299136731755|
| 7.051299136731755|
| 8.445981483184378|
| 7.381113696156647|
| 8.445981483184378|
| 8.445981483184378|
+------------------+
only showing top 20 rows

