뉴욕 택시 데이터 활용
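- `fhvhv_tripdata_2020-03.csv` : 2020년 3월 뉴욕 HVFHV(High Volume For-Hire Vehicle, 우버·리프트 등 차량 호출 서비스) 운행 데이터
    - `hvfhs_license_num` : 차량 호출 서비스 회사(HVFHS 사업자)의 라이선스 번호 (예: `HV0003` = 우버, `HV0005` = 리프트)
    - `dispatching_base_num` : 해당 운행을 배차한 베이스(base)의 TLC 라이선스 번호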
    - `pickup_datetime` : 승객을 태운 시간
    - `dropoff_datetime` : 승객이 하차한 시간
    - `PULocationID` : `PickUp LocationID`. 승객이 승차한 지역 ID
    - `DOLocationID` : `DropOff LocationID`. 승객이 하차한 지역 ID

In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an existing one is reused if
# present), registered under the application name "trip_count_sql".
spark = (
    SparkSession.builder
    .appName("trip_count_sql")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/13 06:04:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the March 2020 trip records from the local filesystem. The first CSV
# row supplies the column names and Spark infers each column's type from the
# data.
filepath = "/home/ubuntu/working/spark-examples/data/fhvhv_tripdata_2020-03.csv"
taxi_df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv(f"file://{filepath}")
)

# Preview the first five rows.
taxi_df.show(5)

                                                                                

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|         159|   null|
|           HV0005|              B02510|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|         119|   null|
|           HV0003|              B02764|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|         209|      1|
|           HV0003|              B02764|2020-03-01 00:18:42|2020-03-01 00:38:42|         209|          80|   null|
|           HV0003|              B02764|2020-03-01 00:44:24|2020-03-01 00:58:44|         256|         226|   null|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+

In [3]:
# Register the trip DataFrame as the temporary view "mobility_data" so that it
# can be referenced by name from spark.sql() queries.
taxi_df.createOrReplaceTempView("mobility_data")

In [4]:
# Count trips per pickup calendar date (year-month-day).
#
# to_date() extracts the date part of pickup_datetime directly, so the
# grouping key does not depend on how the value is rendered as text (as it
# would when splitting its string form on a space), and no subquery is needed.
#
# NOTE: the query has no ORDER BY, so the order of the rows printed by show()
# is not guaranteed, and show() prints only the first 20 rows it receives.
query = """
SELECT to_date(pickup_datetime) AS pickup_date, count(*) AS trips
FROM mobility_data
GROUP BY to_date(pickup_datetime)
"""
spark.sql(query).show()



+-----------+------+
|pickup_date| trips|
+-----------+------+
| 2020-03-03|697880|
| 2020-03-02|648986|
| 2020-03-01|784246|
| 2020-03-06|872012|
| 2020-03-05|731165|
| 2020-03-04|707879|
| 2020-03-09|628940|
| 2020-03-08|731222|
| 2020-03-07|886071|
| 2020-03-10|626474|
| 2020-03-12|643257|
| 2020-03-11|628601|
| 2020-03-16|391518|
| 2020-03-13|660914|
| 2020-03-15|448125|
| 2020-03-14|569397|
| 2020-03-26|141607|
| 2020-03-25|141088|
| 2020-03-20|261900|
| 2020-03-24|141686|
+-----------+------+
only showing top 20 rows



                                                                                

In [5]:
# Load the taxi zone lookup table (LocationID -> Borough / Zone /
# service_zone) from the local filesystem. The first CSV row supplies the
# column names and Spark infers each column's type from the data.
zone_filepath = "/home/ubuntu/working/spark-examples/data/taxi+_zone_lookup.csv"
zone_df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv(f"file://{zone_filepath}")
)

# Preview the first five rows.
zone_df.show(5)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 5 rows



In [6]:
# Register the zone lookup DataFrame as the temporary view "zone_data" so that
# it can be joined against by name from spark.sql() queries.
zone_df.createOrReplaceTempView("zone_data")

In [7]:
# Number of trips per pickup borough.
# Each trip row is joined to the zone lookup table on its pickup location ID
# (PULocationID = LocationID), and the joined rows are counted per borough.
# Trips whose PULocationID has no match in zone_data are dropped by the join.
query = """
    SELECT zone_data.borough, count(zone_data.borough) as trips
    FROM mobility_data
    JOIN zone_data ON mobility_data.PULocationID = zone_data.LocationID
    
    GROUP By zone_data.borough
"""

# Run the query and print the per-borough counts.
spark.sql(query).show()



+-------------+-------+
|      borough|  trips|
+-------------+-------+
|       Queens|2437383|
|          EWR|    362|
|      Unknown|    845|
|     Brooklyn|3735764|
|Staten Island| 178818|
|    Manhattan|4953140|
|        Bronx|2086592|
+-------------+-------+



                                                                                

In [8]:
# Number of trips per pickup borough, restricted to a single company license
# (hvfhs_license_num = 'HV0003').
# NOTE(review): the original comment described these as "yellow taxi" trips,
# but this dataset holds high-volume for-hire vehicle records, and per the
# NYC TLC data dictionary HV0003 appears to be Uber — confirm.
# `query` stays bound to this statement; a later cell re-runs it to print its
# execution plan.
query = """
    SELECT zone_data.borough, count(zone_data.borough) as trips
    FROM mobility_data
    JOIN zone_data ON mobility_data.PULocationID = zone_data.LocationID
    WHERE hvfhs_license_num = 'HV0003'
    GROUP By zone_data.borough
"""

# Run the query and print the per-borough counts.
spark.sql(query).show()



+-------------+-------+
|      borough|  trips|
+-------------+-------+
|       Queens|1863688|
|      Unknown|    548|
|     Brooklyn|2779375|
|Staten Island| 148199|
|    Manhattan|3270666|
|        Bronx|1774283|
|          EWR|      4|
+-------------+-------+



                                                                                

In [9]:
# Print the plans Spark builds for the statement currently held in `query`.
# Passing True requests the extended output, which includes the logical plans
# in addition to the physical plan.
spark.sql(query).explain(True)

== Parsed Logical Plan ==
'Aggregate ['zone_data.borough], ['zone_data.borough, 'count('zone_data.borough) AS trips#171]
+- 'Filter ('hvfhs_license_num = HV0003)
   +- 'Join Inner, ('mobility_data.PULocationID = 'zone_data.LocationID)
      :- 'UnresolvedRelation [mobility_data], [], false
      +- 'UnresolvedRelation [zone_data], [], false

== Analyzed Logical Plan ==
borough: string, trips: bigint
Aggregate [borough#100], [borough#100, count(borough#100) AS trips#171L]
+- Filter (hvfhs_license_num#16 = HV0003)
   +- Join Inner, (PULocationID#20 = LocationID#99)
      :- SubqueryAlias mobility_data
      :  +- View (`mobility_data`, [hvfhs_license_num#16,dispatching_base_num#17,pickup_datetime#18,dropoff_datetime#19,PULocationID#20,DOLocationID#21,SR_Flag#22])
      :     +- Relation [hvfhs_license_num#16,dispatching_base_num#17,pickup_datetime#18,dropoff_datetime#19,PULocationID#20,DOLocationID#21,SR_Flag#22] csv
      +- SubqueryAlias zone_data
         +- View (`zone_data`, [Locati

In [10]:
# Stop the Spark session once the analysis is finished.
spark.stop()