# 카탈리스트
- 쿼리에 대한 논리적인 실행 계획을 수립
- 쿼리 분석 -> 논리적 계획 수립 -> 물리적 계획 수립

In [1]:
from pyspark.sql import SparkSession

# Create the SparkSession for this part of the notebook (getOrCreate reuses an
# already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("trip_count_sql")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/07 04:06:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


CSV를 불러와서 Spark DF로 만들어보기

In [2]:
# Load the Titanic training CSV into a Spark DataFrame and preview it.
filepath = "/home/ubuntu/working/spark-examples/data/titanic_train.csv"

# inferSchema=True: let Spark infer each column's data type from the data.
# header=True: use the first line of the file as the column names.
titanic_sdf = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv(filepath)
)

# Print the first five rows.
titanic_sdf.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [3]:
# Register the DataFrame as a temporary view named "titanic" so that it can be
# referenced by name in SQL text passed to spark.sql.
titanic_sdf.createOrReplaceTempView("titanic")

In [4]:
# Select all columns from the "titanic" view, limited to five rows.
query = """
select *
from titanic
limit 5
"""
# Run the SQL text and print the resulting rows.
spark.sql(query).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [6]:
# Among male passengers, count how many boarded per Embarked (port of
# embarkation) and Pclass (ticket class), sorted by Embarked then Pclass.
query = """
select Embarked, Pclass, count(*) as male_cnt
from titanic
where Sex='male'
group by Embarked, Pclass
order by Embarked, Pclass
"""

# Run the SQL text and print the resulting rows.
spark.sql(query).show()

+--------+------+--------+
|Embarked|Pclass|male_cnt|
+--------+------+--------+
|       C|     1|      42|
|       C|     2|      10|
|       C|     3|      43|
|       Q|     1|       1|
|       Q|     2|       1|
|       Q|     3|      39|
|       S|     1|      79|
|       S|     2|      97|
|       S|     3|     265|
+--------+------+--------+



In [7]:
# Print the plans for the query above; passing True (extended) includes the
# parsed, analyzed and optimized logical plans in the output.
spark.sql(query).explain(True)

== Parsed Logical Plan ==
'Sort ['Embarked ASC NULLS FIRST, 'Pclass ASC NULLS FIRST], true
+- 'Aggregate ['Embarked, 'Pclass], ['Embarked, 'Pclass, 'count(1) AS male_cnt#220]
   +- 'Filter ('Sex = male)
      +- 'UnresolvedRelation [titanic], [], false

== Analyzed Logical Plan ==
Embarked: string, Pclass: int, male_cnt: bigint
Sort [Embarked#27 ASC NULLS FIRST, Pclass#18 ASC NULLS FIRST], true
+- Aggregate [Embarked#27, Pclass#18], [Embarked#27, Pclass#18, count(1) AS male_cnt#220L]
   +- Filter (Sex#20 = male)
      +- SubqueryAlias titanic
         +- View (`titanic`, [PassengerId#16,Survived#17,Pclass#18,Name#19,Sex#20,Age#21,SibSp#22,Parch#23,Ticket#24,Fare#25,Cabin#26,Embarked#27])
            +- Relation [PassengerId#16,Survived#17,Pclass#18,Name#19,Sex#20,Age#21,SibSp#22,Parch#23,Ticket#24,Fare#25,Cabin#26,Embarked#27] csv

== Optimized Logical Plan ==
Sort [Embarked#27 ASC NULLS FIRST, Pclass#18 ASC NULLS FIRST], true
+- Aggregate [Embarked#27, Pclass#18], [Embarked#27, Pclass

In [8]:
# Stop the Spark session used by the Titanic cells above.
spark.stop()

## 택시 데이터 분석

In [2]:
from pyspark.sql import SparkSession

# Create the SparkSession for the taxi analysis (getOrCreate reuses an
# already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("taxi")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/07 04:34:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Load the trip-data CSV into a Spark DataFrame and preview it.
filepath = "/home/ubuntu/working/spark-examples/data/fhvhv_tripdata_2020-03.csv"

# inferSchema=True: let Spark infer each column's data type from the data.
# header=True: use the first line of the file as the column names.
taxi_sdf = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv(filepath)
)

# Print the first five rows.
taxi_sdf.show(5)



+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|         159|   null|
|           HV0005|              B02510|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|         119|   null|
|           HV0003|              B02764|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|         209|      1|
|           HV0003|              B02764|2020-03-01 00:18:42|2020-03-01 00:38:42|         209|          80|   null|
|           HV0003|              B02764|2020-03-01 00:44:24|2020-03-01 00:58:44|         256|         226|   null|
+-----------------+--------------------+-------------------+-------------------+

                                                                                

In [8]:
# Register the DataFrame as a temporary view named "mobility_data" so that it
# can be referenced by name in SQL text passed to spark.sql.
taxi_sdf.createOrReplaceTempView("mobility_data")

In [9]:
# Count trips per pickup date. The inner query keeps the part of
# pickup_datetime before the first space as pickup_date; the outer query
# counts rows per pickup_date. There is no ORDER BY, so the rows are not
# sorted by date.
query = """

select 
    pickup_date, 
    count(*) as trips
from (  select
            split(pickup_datetime, ' ')[0] as pickup_date
        from mobility_data )
group by pickup_date

"""

# Run the SQL text and print the resulting rows.
spark.sql(query).show()



+-----------+------+
|pickup_date| trips|
+-----------+------+
| 2020-03-03|697880|
| 2020-03-02|648986|
| 2020-03-01|784246|
| 2020-03-06|872012|
| 2020-03-05|731165|
| 2020-03-04|707879|
| 2020-03-09|628940|
| 2020-03-08|731222|
| 2020-03-07|886071|
| 2020-03-10|626474|
| 2020-03-12|643257|
| 2020-03-11|628601|
| 2020-03-16|391518|
| 2020-03-13|660914|
| 2020-03-15|448125|
| 2020-03-14|569397|
| 2020-03-26|141607|
| 2020-03-25|141088|
| 2020-03-20|261900|
| 2020-03-24|141686|
+-----------+------+
only showing top 20 rows



                                                                                

In [10]:
# Print the plans for the query above; passing True (extended) includes the
# parsed, analyzed and optimized logical plans in the output.
spark.sql(query).explain(True)

== Parsed Logical Plan ==
'Aggregate ['pickup_date], ['pickup_date, 'count(1) AS trips#198]
+- 'SubqueryAlias __auto_generated_subquery_name
   +- 'Project ['split('pickup_datetime,  )[0] AS pickup_date#197]
      +- 'UnresolvedRelation [mobility_data], [], false

== Analyzed Logical Plan ==
pickup_date: string, trips: bigint
Aggregate [pickup_date#197], [pickup_date#197, count(1) AS trips#198L]
+- SubqueryAlias __auto_generated_subquery_name
   +- Project [split(pickup_datetime#129,  , -1)[0] AS pickup_date#197]
      +- SubqueryAlias mobility_data
         +- View (`mobility_data`, [hvfhs_license_num#127,dispatching_base_num#128,pickup_datetime#129,dropoff_datetime#130,PULocationID#131,DOLocationID#132,SR_Flag#133])
            +- Relation [hvfhs_license_num#127,dispatching_base_num#128,pickup_datetime#129,dropoff_datetime#130,PULocationID#131,DOLocationID#132,SR_Flag#133] csv

== Optimized Logical Plan ==
Aggregate [pickup_date#197], [pickup_date#197, count(1) AS trips#198L]
+- Proj

In [11]:
# Stop the Spark session used by the taxi cells above.
spark.stop()