In [1]:
from pyspark.sql import SparkSession

In [2]:
MAX_MEMORY="5g"
spark = SparkSession.builder.appName("taxi-fare-prediciton")\
                .config("spark.executor.memory", MAX_MEMORY)\
                .config("spark.driver.memory", MAX_MEMORY)\
                .getOrCreate()

In [3]:
trip_files = "/home/jovyan/trips/*"

In [4]:
trips_df = spark.read.csv(f"file:///{trip_files}", inferSchema=True, header=True)

In [5]:
trips_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- airport_fee: string (nullable = true)



In [6]:
trips_df.createOrReplaceTempView("trips")   # sql에서 쓸 수 있게 변환

In [18]:
query = """
SELECT 
    CAST(trip_distance AS double),
    CAST(total_amount AS double)
FROM
    trips
WHERE
    total_amount < 5000
    AND total_amount > 0
    AND trip_distance > 0
    AND trip_distance < 500
    AND passenger_count < 4
    AND TO_DATE(tpep_pickup_datetime) >= '2021-01-01'
    AND TO_DATE(tpep_pickup_datetime) < '2021-04-01'
"""

In [19]:
data_df = spark.sql(query)
data_df.createOrReplaceTempView("data")

In [20]:
data_df.show()

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|          2.1|        11.8|
|         14.7|       51.95|
|         10.6|       36.35|
|         4.94|       24.36|
|          1.6|       14.15|
|          4.1|        17.3|
|          5.7|        21.8|
|          9.1|        28.8|
|          2.7|       18.95|
|         6.11|        24.3|
|         1.21|       10.79|
|          7.4|       33.92|
|         1.01|        10.3|
|         1.17|       12.36|
|         1.66|        12.3|
|         1.16|       11.84|
|          2.2|        12.8|
|          3.6|        30.8|
|         19.1|       59.42|
|          5.4|        23.8|
+-------------+------------+
only showing top 20 rows



In [21]:
data_df.describe().show()

+-------+------------------+------------------+
|summary|     trip_distance|      total_amount|
+-------+------------------+------------------+
|  count|          10296896|          10296896|
|   mean|3.4849777583463717| 20.15858721223262|
| stddev|    4.110858401658|13.683625256252133|
|    min|               1.0|               1.0|
|    max|             475.5|            4973.3|
+-------+------------------+------------------+



In [22]:
data_df.describe()

DataFrame[summary: string, trip_distance: string, total_amount: string]

In [23]:
train_df, test_df = data_df.randomSplit([0.8, 0.2], seed=1)

In [24]:
print(train_df.count())
print(test_df.count())

8236773
2060123


In [25]:
from pyspark.ml.feature import VectorAssembler

In [26]:
vassembler = VectorAssembler(inputCols=["trip_distance"], outputCol="features")

In [27]:
vtrain_df = vassembler.transform(train_df)

In [28]:
vtrain_df.show()

+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|          1.0|         3.3|   [1.0]|
|          1.0|         3.3|   [1.0]|
|          1.0|         3.3|   [1.0]|
|          1.0|         3.3|   [1.0]|
|          1.0|         3.8|   [1.0]|
|          1.0|         3.8|   [1.0]|
|          1.0|         3.8|   [1.0]|
|          1.0|         3.8|   [1.0]|
|          1.0|         5.3|   [1.0]|
|          1.0|         5.3|   [1.0]|
|          1.0|         5.8|   [1.0]|
|          1.0|         5.8|   [1.0]|
|          1.0|         5.8|   [1.0]|
|          1.0|         5.8|   [1.0]|
|          1.0|         5.8|   [1.0]|
|          1.0|         5.8|   [1.0]|
|          1.0|         5.8|   [1.0]|
|          1.0|         5.8|   [1.0]|
|          1.0|         5.8|   [1.0]|
|          1.0|         5.8|   [1.0]|
+-------------+------------+--------+
only showing top 20 rows



In [29]:
from pyspark.ml.regression import LinearRegression

In [30]:
lr = LinearRegression(
    maxIter=50,
    labelCol="total_amount",
    featuresCol="features"
)

In [31]:
model = lr.fit(vtrain_df)

In [32]:
vtest_df = vassembler.transform(test_df)

In [33]:
prediction = model.transform(vtest_df)

In [34]:
prediction.show()

+-------------+------------+--------+------------------+
|trip_distance|total_amount|features|        prediction|
+-------------+------------+--------+------------------+
|          1.0|         3.3|   [1.0]|12.903940581666353|
|          1.0|         5.8|   [1.0]|12.903940581666353|
|          1.0|         5.8|   [1.0]|12.903940581666353|
|          1.0|         5.8|   [1.0]|12.903940581666353|
|          1.0|         5.8|   [1.0]|12.903940581666353|
|          1.0|         5.8|   [1.0]|12.903940581666353|
|          1.0|         5.8|   [1.0]|12.903940581666353|
|          1.0|         5.8|   [1.0]|12.903940581666353|
|          1.0|         5.8|   [1.0]|12.903940581666353|
|          1.0|         5.8|   [1.0]|12.903940581666353|
|          1.0|         5.8|   [1.0]|12.903940581666353|
|          1.0|         5.8|   [1.0]|12.903940581666353|
|          1.0|         6.3|   [1.0]|12.903940581666353|
|          1.0|         6.3|   [1.0]|12.903940581666353|
|          1.0|         6.3|   

In [37]:
model.summary.rootMeanSquaredError

6.625252872942492

In [38]:
model.summary.r2   # total amount의 76%가 trip_distance로 설명이 가능하다는 말과 같음.

0.7663287605327782

In [43]:
from pyspark.sql.types import DoubleType
distance_list = [1.1, 5.5, 10.5, 30.0]
distance_df = spark.createDataFrame(distance_list, DoubleType()).toDF("trip_distance")

In [44]:
distance_df.show()

+-------------+
|trip_distance|
+-------------+
|          1.1|
|          5.5|
|         10.5|
|         30.0|
+-------------+



In [45]:
vdistance_df = vassembler.transform(distance_df)

In [46]:
vdistance_df.show()

+-------------+--------+
|trip_distance|features|
+-------------+--------+
|          1.1|   [1.1]|
|          5.5|   [5.5]|
|         10.5|  [10.5]|
|         30.0|  [30.0]|
+-------------+--------+



In [47]:
model.transform(vdistance_df).show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|          1.1|   [1.1]|13.195867906301784|
|          5.5|   [5.5]|26.040670190260713|
|         10.5|  [10.5]| 40.63703642203222|
|         30.0|  [30.0]|  97.5628647259411|
+-------------+--------+------------------+



1.1 - 13.19 => 1.7km당 17000원 정도