# 06_SparkDataAnal.ipynb
TLC Trip Record Data 출처: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("spark-sql").getOrCreate()

In [None]:
#2015-summary.json

In [2]:
df = spark.read.format('json').load('learning_spark_data/2015-summary.json')

In [3]:
df.count()

256

In [6]:
df.dtypes

[('DEST_COUNTRY_NAME', 'string'),
 ('ORIGIN_COUNTRY_NAME', 'string'),
 ('count', 'bigint')]

In [7]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [9]:
df.collect()

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Grenada', count=62),
 Row(DEST_COUNTRY_NAME='Costa Rica', ORIGIN_COUNTRY_NAME='United States', count=588),
 Row(DEST_COUNTRY_NAME='Senegal', ORIGIN_COUNTRY_NAME='United States', count=40),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Sint Maarten', count=325),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Marshall Islands', count=39),
 

In [11]:
df.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [15]:
df.select('count').show(5)

+-----+
|count|
+-----+
|   15|
|    1|
|  344|
|   15|
|   62|
+-----+
only showing top 5 rows



In [24]:
df.select('DEST_COUNTRY_NAME').distinct().show(5)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|         Anguilla|
|           Russia|
|         Paraguay|
|          Senegal|
|           Sweden|
+-----------------+
only showing top 5 rows



In [27]:
df1 = df.select('DEST_COUNTRY_NAME').distinct().cache()
df1.count()

132

In [None]:
# ROW class를 이용한 단일 레코드 생성

In [30]:
from pyspark.sql import Row
myRow = Row('hello',None,1,False)
myRow

<Row('hello', None, 1, False)>

In [32]:
# 새로운 컬럼 추가하기
from pyspark.sql.functions import expr

df3 = df.withColumn('withinCountry',expr('ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME')) # expr sql 표현식을 받아 생성
df3

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint, withinCountry: boolean]

In [39]:
df3.show(3)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
|    United States|            Ireland|  344|        false|
+-----------------+-------------------+-----+-------------+
only showing top 3 rows



In [40]:
df3.filter(df3.withinCountry == True).show()

+-----------------+-------------------+------+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|withinCountry|
+-----------------+-------------------+------+-------------+
|    United States|      United States|370002|         true|
+-----------------+-------------------+------+-------------+



In [58]:
# count값이 10이하 under, 초과 upper로 변환 > category 컬럼 추가
query = '''
CASE
  WHEN count <= 10 THEN 'under'
  ELSE 'upper'
END
'''

df4 = df.withColumn('category', expr(query))

In [57]:
df4.show(5)

+-----------------+-------------------+-----+--------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|category|
+-----------------+-------------------+-----+--------+
|    United States|            Romania|   15|   upper|
|    United States|            Croatia|    1|   under|
|    United States|            Ireland|  344|   upper|
|            Egypt|      United States|   15|   upper|
|    United States|              India|   62|   upper|
+-----------------+-------------------+-----+--------+
only showing top 5 rows



In [None]:
# DataFrame의 select(), where(), filter() 트랜스포메이션
# show(), count() 액션

In [59]:
spark.stop()

In [None]:
# 집계함수

In [60]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("spark-sql").getOrCreate()

In [154]:
emp_df = spark.read.format('csv').option('header', 'true').option('inferSchema', 'true').load('learning_spark_data/emp.csv')
dept_df = spark.read.format('csv').option('header', 'true').option('inferSchema', 'true').load('learning_spark_data/dept.csv')

In [70]:
emp_df.show(5)

+-----+------+--------+----+----------+----+----+------+
|empno| ename|     job| mgr|  hiredate| sal|comm|deptno|
+-----+------+--------+----+----------+----+----+------+
| 7369| SMITH|   CLERK|7902|1980-12-17| 800|NULL|    20|
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600| 300|    30|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250| 500|    30|
| 7566| JONES| MANAGER|7839|1981-04-02|2975|NULL|    20|
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250|1400|    30|
+-----+------+--------+----+----------+----+----+------+
only showing top 5 rows



In [71]:
emp_df.count()

15

In [72]:
dept_df.count()

4

In [73]:
emp_df.select('ENAME', 'DEPTNO').show(5)

+------+------+
| ENAME|DEPTNO|
+------+------+
| SMITH|    20|
| ALLEN|    30|
|  WARD|    30|
| JONES|    20|
|MARTIN|    30|
+------+------+
only showing top 5 rows



In [75]:
#filter()랑 동일
emp_df.select('*').where('deptno=20').show()

+-----+-----+-------+----+----------+----+----+------+
|empno|ename|    job| mgr|  hiredate| sal|comm|deptno|
+-----+-----+-------+----+----------+----+----+------+
| 7369|SMITH|  CLERK|7902|1980-12-17| 800|NULL|    20|
| 7566|JONES|MANAGER|7839|1981-04-02|2975|NULL|    20|
| 7788|SCOTT|ANALYST|7566|1987-04-19|3000|NULL|    20|
| 7876|ADAMS|  CLERK|7788|1987-05-23|1100|NULL|    20|
| 7902| FORD|ANALYST|7566|1981-12-03|3000|NULL|    20|
+-----+-----+-------+----+----------+----+----+------+



In [78]:
emp_df.selectExpr('count(*)').show()

+--------+
|count(1)|
+--------+
|      15|
+--------+



In [83]:
from pyspark.sql.functions import countDistinct
emp_df.select(countDistinct('job')).show()

+-------------------+
|count(DISTINCT job)|
+-------------------+
|                  5|
+-------------------+



In [84]:
from pyspark.sql.functions import approx_count_distinct
emp_df.select(approx_count_distinct('job', 0.1)).show()

+--------------------------+
|approx_count_distinct(job)|
+--------------------------+
|                         5|
+--------------------------+



In [103]:
# first,last,min,max,sum,avg -> (expr : sql 문장), function으로 처리
from pyspark.sql.functions import first, last, min, max, sum, avg, round

In [92]:
emp_df.select(first('sal')).show()

+----------+
|first(sal)|
+----------+
|       800|
+----------+



In [94]:
emp_df.select(last('sal')).show()

+---------+
|last(sal)|
+---------+
|     3200|
+---------+



In [96]:
emp_df.select(min('sal')).show()

+--------+
|min(sal)|
+--------+
|    1100|
+--------+



In [98]:
emp_df.select(max('sal')).show()

+--------+
|max(sal)|
+--------+
|     950|
+--------+



In [99]:
emp_df.select(sum('sal')).show()

+--------+
|sum(sal)|
+--------+
| 32225.0|
+--------+



In [104]:
emp_df.select(round(avg('sal'),2)).show()

+------------------+
|round(avg(sal), 2)|
+------------------+
|           2148.33|
+------------------+



In [105]:
emp_df.selectExpr('round(avg(sal),2)').show()

+------------------+
|round(avg(sal), 2)|
+------------------+
|           2148.33|
+------------------+



In [106]:
emp_df.selectExpr('sum(distinct sal)').show()

+-----------------+
|sum(DISTINCT sal)|
+-----------------+
|          27975.0|
+-----------------+



In [None]:
# total_salary / total_transaction, avg_salay, mean_salary

In [108]:
emp_df.selectExpr('sum(sal) / count(sal)', 'avg(sal)', 'mean(sal)').show()

+-----------------------+------------------+------------------+
|(sum(sal) / count(sal))|          avg(sal)|         mean(sal)|
+-----------------------+------------------+------------------+
|     2148.3333333333335|2148.3333333333335|2148.3333333333335|
+-----------------------+------------------+------------------+



In [115]:
from pyspark.sql.functions import count, mean
emp_df.select((
    round(sum('sal') / count('sal'),2)).alias('sum/count'),
    round(avg('sal'),2).alias('avg_salay'),
    round(mean('sal'),2).alias('mean_salay'),
             ).show()

+---------+---------+----------+
|sum/count|avg_salay|mean_salay|
+---------+---------+----------+
|  2148.33|  2148.33|   2148.33|
+---------+---------+----------+



In [116]:
# 그룹화
emp_df.groupBy('job').count().show()

+---------+-----+
|      job|count|
+---------+-----+
|  ANALYST|    2|
| SALESMAN|    4|
|    CLERK|    5|
|  MANAGER|    3|
|PRESIDENT|    1|
+---------+-----+



In [118]:
# select job,
#     count(job),
#     sum(sal)
# group by job
group_df = emp_df.groupBy('job').agg(
    count('job').alias('qty'),
    expr('count(job)'),
    sum('sal')
)
group_df.show()

+---------+---+----------+--------+
|      job|qty|count(job)|sum(sal)|
+---------+---+----------+--------+
|  ANALYST|  2|         2|  6000.0|
| SALESMAN|  4|         4|  5600.0|
|    CLERK|  5|         5|  7350.0|
|  MANAGER|  3|         3|  8275.0|
|PRESIDENT|  1|         1|  5000.0|
+---------+---+----------+--------+



In [None]:
#sal의 평균 SAL_AVG, 표준편차 SAL_STDDEV를 job별로 계산해서 출력, 소수점 2자리


In [123]:
from pyspark.sql.functions import stddev
emp_df.groupBy('job').agg(
    round(avg('sal'),2).alias('SAL_AVG'),
    round(stddev('sal'),2).alias('SAL_STDDEV')
).show()

+---------+-------+----------+
|      job|SAL_AVG|SAL_STDDEV|
+---------+-------+----------+
|  ANALYST| 3000.0|       0.0|
| SALESMAN| 1400.0|    177.95|
|    CLERK| 1470.0|    984.63|
|  MANAGER|2758.33|    274.24|
|PRESIDENT| 5000.0|      NULL|
+---------+-------+----------+



In [148]:
# 급여 TOP10 구하기
from pyspark.sql.functions import desc, col, rank

emp_df.orderBy(col('sal').cast('int').desc()).limit(10).show()

+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|
| 9292|  JACK|    CLERK|7782|1982-01-23|3200|NULL|    70|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000|NULL|    20|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|NULL|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500|   0|    30|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|NULL|    10|
+-----+------+---------+----+----------+----+----+------+



In [155]:
from pyspark.sql.window import Window
windowspec = Window.orderBy(desc('sal'))
salAllRank = rank().over(windowspec)
salAllRank

Column<'RANK() OVER (ORDER BY sal DESC NULLS LAST unspecifiedframe$())'>

In [156]:
emp_df.withColumn('salary_rank', salAllRank).show(10)

+-----+------+---------+----+----------+----+----+------+-----------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|salary_rank|
+-----+------+---------+----+----------+----+----+------+-----------+
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|          1|
| 9292|  JACK|    CLERK|7782|1982-01-23|3200|NULL|    70|          2|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|          3|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000|NULL|    20|          3|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|          5|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|NULL|    30|          6|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|          7|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|          8|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500|   0|    30|          9|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|NULL|    10|         10|
+-----+------+---------+----+----------+----+----+------+-----------+
only showing top 10 

In [None]:
#직무별로 rank 작성
#Window.partitionBy()
#job_rank_df 작성

In [158]:
windowspec = Window.partitionBy('job').orderBy(desc('sal'))
salJobRank = rank().over(windowspec)
salJobRank

Column<'RANK() OVER (PARTITION BY job ORDER BY sal DESC NULLS LAST unspecifiedframe$())'>

In [160]:
job_rank_df = emp_df.withColumn('salary_rank', salJobRank)

In [161]:
job_rank_df.show()

+-----+------+---------+----+----------+----+----+------+-----------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|salary_rank|
+-----+------+---------+----+----------+----+----+------+-----------+
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|          1|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000|NULL|    20|          1|
| 9292|  JACK|    CLERK|7782|1982-01-23|3200|NULL|    70|          1|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|NULL|    10|          2|
| 7876| ADAMS|    CLERK|7788|1987-05-23|1100|NULL|    20|          3|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950|NULL|    30|          4|
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|NULL|    20|          5|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|          1|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|NULL|    30|          2|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|          3|
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|          1|
| 7499| ALLEN| SALES