In [1]:
from pyspark.sql import SparkSession
spark =  SparkSession.builder.appName("demo").getOrCreate()

24/03/20 15:19:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


## Create a Spark DataFrame

In [2]:
df = spark.createDataFrame([
    ("lee",32),("kim",20),("hong",30),("cho",50)
]
, ["name","age"]
)

In [3]:
df.show()

                                                                                

+----+---+
|name|age|
+----+---+
| lee| 32|
| kim| 20|
|hong| 30|
| cho| 50|
+----+---+



In [5]:
from pyspark.sql.functions import col, when

In [8]:
# add column life_stage
df1 = df.withColumn(
    'life_state',
    when(col('age')<30,'young')
    .when(col('age').between(20,30),'middle')
    .otherwise("old")
)

In [9]:
df1.show()

+----+---+----------+
|name|age|life_state|
+----+---+----------+
| lee| 32|       old|
| kim| 20|     young|
|hong| 30|    middle|
| cho| 50|       old|
+----+---+----------+



## filterring

In [12]:
# life_sate --> old 
# where( ).isin([])
df1.where(col('life_state').isin(['old'])).show()

+----+---+----------+
|name|age|life_state|
+----+---+----------+
| lee| 32|       old|
| cho| 50|       old|
+----+---+----------+



In [13]:
# df1 데이터프레임의  age컬럼의  avg 평균을 구해보자

In [14]:
df1.createOrReplaceTempView("view_df1")

In [15]:
sql_df1 = spark.sql("""
select avg(age) from view_df1
""")

In [16]:
sql_df1.show()

+--------+
|avg(age)|
+--------+
|    33.0|
+--------+



In [17]:
from pyspark.sql.functions import avg
df1.select(avg('age')).show()

+--------+
|avg(age)|
+--------+
|    33.0|
+--------+



In [18]:
df1.groupBy('life_state').avg().show()

[Stage 14:>                                                         (0 + 4) / 4]

+----------+--------+
|life_state|avg(age)|
+----------+--------+
|       old|    41.0|
|     young|    20.0|
|    middle|    30.0|
+----------+--------+



                                                                                

In [19]:
spark.sql("select avg(age) from {df1}",df1=df1).show()

+--------+
|avg(age)|
+--------+
|    33.0|
+--------+



In [20]:
spark.sql("select life_state, avg(age) from {df1} group by life_state", df1=df1).show()

+----------+--------+
|life_state|avg(age)|
+----------+--------+
|       old|    41.0|
|     young|    20.0|
|    middle|    30.0|
+----------+--------+



## SQL API

In [21]:
df1.write.saveAsTable("some_people")

24/03/20 16:08:26 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/03/20 16:08:26 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
24/03/20 16:08:32 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
24/03/20 16:08:32 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore root@127.0.0.1
24/03/20 16:08:32 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
24/03/20 16:08:35 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
24/03/20 16:08:35 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
24/03/20 16:08:35 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/03/20 16:08:35 WARN Hive

In [25]:
spark.sql("select * from some_people").show()

+-----+---+----------+
| name|age|life_state|
+-----+---+----------+
| hong| 30|    middle|
|  kim| 20|     young|
|frank| 55|       old|
|  cho| 50|       old|
|  lee| 32|       old|
+-----+---+----------+



In [24]:
spark.sql("insert into some_people values('frank',55,'old')")

DataFrame[]

In [26]:
spark.sql("select * from some_people where life_state ='old'").show()

+-----+---+----------+
| name|age|life_state|
+-----+---+----------+
|frank| 55|       old|
|  cho| 50|       old|
|  lee| 32|       old|
+-----+---+----------+



## Spark Structured Streaming

In [27]:
# Parquet : Aparch Parquet 형식의 테이블, 대규모데이터 세트를 저장하고 처리하는데 사용되는 오픈소스 파일형식
# kafka에서 데이터를 읽어와서 Parquet 테이블에 시간별로 쓰는 방법...
# kafka 스트림은 지속적으로 데이터가 채워지는 구조

In [29]:
{"student_name":"hong","graduation_year":"2023","major":"math"}
{"student_name":"lee","graduation_year":"2022","major":"korean"}

{'student_name': 'lee', 'graduation_year': '2022', 'major': 'korean'}

In [30]:
df = (
    spark.readStream.format('kafka')
    .option('kafka.bootstrap.servers','host1:port1,host2:port2')
    .option('subscribe',subscribeTopic)
    .load()
)

NameError: name 'subscribeTopic' is not defined

## 구조적 API

In [32]:
df = spark.range(500).toDF("number")
df.select(df['number'] + 10).show()

+-------------+
|(number + 10)|
+-------------+
|           10|
|           11|
|           12|
|           13|
|           14|
|           15|
|           16|
|           17|
|           18|
|           19|
|           20|
|           21|
|           22|
|           23|
|           24|
|           25|
|           26|
|           27|
|           28|
|           29|
+-------------+
only showing top 20 rows



In [33]:
df = spark.read.format("json").load("2015-summary.json")
df.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
+-----------------+-------------------+-----+
only showing top 3 rows



In [34]:
df.schema

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', LongType(), True)])

In [36]:
from pyspark.sql.types import StructField,StructType,StringType,LongType
myManualSchema = StructType([
    StructField('DEST_COUNTRY_NAME', StringType(), True), 
    StructField('ORIGIN_COUNTRY_NAME', StringType(), True), 
    StructField('count', LongType(), True)
])
df = spark.read.format("json").schema(myManualSchema).load("2015-summary.json")

In [40]:
from pyspark.sql.functions import col,column
# col("column name)
df.select(column("DEST_COUNTRY_NAME")).show(5)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
|    United States|
|            Egypt|
|    United States|
+-----------------+
only showing top 5 rows



In [42]:
# 문자열 표현식을 사용해서 컬럼간 논리 연산을 수행 expr()
example_df = spark.createDataFrame(
    [(1,10),(2,20),(3,30)],['A','B']
)
example_df.show()

+---+---+
|  A|  B|
+---+---+
|  1| 10|
|  2| 20|
|  3| 30|
+---+---+



In [52]:
from pyspark.sql.functions import expr
example_df.filter( expr('   (((A+1)*5)+2) = B'  )).show()

+---+---+
|  A|  B|
+---+---+
|  1| 10|
+---+---+



In [60]:
example_df.filter( expr('A == 1')).show()

+---+---+
|  A|  B|
+---+---+
|  1| 10|
+---+---+



In [None]:
example_df.filter( expr('A = 1')).show()

In [68]:
# Row class 는  새로운 행을 만들어 준다
from pyspark.sql import Row
myRow = Row("hello",'None','1','False')

In [69]:
myRow[0]

'hello'

In [70]:
spark.createDataFrame([myRow],['a','b','c','d']).show()

+-----+----+---+-----+
|    a|   b|  c|    d|
+-----+----+---+-----+
|hello|None|  1|False|
+-----+----+---+-----+



In [71]:
from pyspark.sql.types import StructField,StructType,StringType,LongType,IntegerType,BooleanType
myRow = Row("hello",None,1,False)
myManualSchema = StructType([
    StructField('a', StringType(), True), 
    StructField('b', IntegerType(),True),
    StructField('c', IntegerType(), True),
    StructField('d', BooleanType(), True)
])
spark.createDataFrame([myRow],myManualSchema).show()

+-----+----+---+-----+
|    a|   b|  c|    d|
+-----+----+---+-----+
|hello|NULL|  1|false|
+-----+----+---+-----+



In [73]:
df.select('DEST_COUNTRY_NAME').show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [74]:
df.select('DEST_COUNTRY_NAME','count').show(2)

+-----------------+-----+
|DEST_COUNTRY_NAME|count|
+-----------------+-----+
|    United States|   15|
|    United States|    1|
+-----------------+-----+
only showing top 2 rows



In [85]:
df.select(
    expr('DEST_COUNTRY_NAME as aaa'),
    col('DEST_COUNTRY_NAME').alias('bbb'),
    column('DEST_COUNTRY_NAME').alias('ccc'),
    'DEST_COUNTRY_NAME'
).show(2)

+-------------+-------------+-------------+-----------------+
|          aaa|          bbb|          ccc|DEST_COUNTRY_NAME|
+-------------+-------------+-------------+-----------------+
|United States|United States|United States|    United States|
|United States|United States|United States|    United States|
+-------------+-------------+-------------+-----------------+
only showing top 2 rows



In [86]:
df.selectExpr("avg(count)","count(distinct(DEST_COUNTRY_NAME))").show(2)

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+

