## 단어 개수 세기
- book 파일 활용
- Spark SQl로 작성
- pyspark.sql.functions의 explode 함수 활용할 것
- DataFrame API 활용해서 워드카운트 구현
- 단어 수가 많은 순대로(내림차순) 정렬

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
spark= SparkSession.builder.appName('word_count').getOrCreate()

In [3]:
pwd

'c:\\Users\\apfhd\\SparkExam'

In [4]:
from pyspark.sql.types import StructField, StructType
from pyspark.sql.types import StringType, IntegerType, FloatType

# 컬럼이 없다.
directory='c://Users//apfhd//SparkExam//data//book'
sdf=spark.read.text(f'file:///{directory}')
sdf.show(5)

+--------------------+
|               value|
+--------------------+
|Self-Employment: ...|
|Achieving Financi...|
|       By Frank Kane|
|                    |
|                    |
+--------------------+
only showing top 5 rows



In [5]:
from pyspark.sql.functions import explode
from pyspark.sql import functions as func
sdf.select(explode(sdf.value)).show() # error

AnalysisException: cannot resolve 'explode(value)' due to data type mismatch: input to function explode should be array or map type, not string;
'Project [explode(value#0) AS List()]
+- Relation [value#0] text


In [36]:
# '\\W+' -> 단어 문자열로 분리해주는 정규식!
words =sdf.select(func.split(sdf.value,"\\W+").alias('word'))
words.show(5)

+--------------------+
|                word|
+--------------------+
|[Self, Employment...|
|[Achieving, Finan...|
|   [By, Frank, Kane]|
|                  []|
|                  []|
+--------------------+
only showing top 5 rows



In [40]:
flat_words=words.select(explode(words.word).alias("word"))
flat_words.show(5)

+----------+
|      word|
+----------+
|      Self|
|Employment|
|  Building|
|        an|
|  Internet|
+----------+
only showing top 5 rows



In [49]:
# 해답 코드
words = sdf.select(func.explode(func.split(sdf.value,' ')).alias('word'))
words.filter(words.word != '')
words.show(5)

+----------------+
|            word|
+----------------+
|Self-Employment:|
|        Building|
|              an|
|        Internet|
|        Business|
+----------------+
only showing top 5 rows



In [51]:
# 소문자화
lower_str=words.select(func.lower(words.word).alias("word"))
lower_str.show(5)

+----------------+
|            word|
+----------------+
|self-employment:|
|        building|
|              an|
|        internet|
|        business|
+----------------+
only showing top 5 rows



In [54]:
lower_str.createOrReplaceTempView("words")

In [57]:
query="""

select count(*) as wordcount
from words
"""
spark.sql(query).show()

+---------+
|wordcount|
+---------+
|    46448|
+---------+



In [59]:
# 해답 코드
wordCounts=lower_str.groupby("word").count()
wordCounts.show(5)

+-----------+-----+
|       word|count|
+-----------+-----+
|     online|   39|
|        few|   39|
|       some|  121|
|requirement|    1|
|       hope|    5|
+-----------+-----+
only showing top 5 rows



In [60]:
# 내림차순
wordCountSorted = wordCounts.sort('count',ascending=False)
wordCountSorted.show()

+----+-----+
|word|count|
+----+-----+
|  to| 1801|
|your| 1416|
| you| 1415|
| the| 1282|
|   a| 1187|
|  of|  960|
| and|  923|
|that|  662|
|  in|  594|
|  is|  549|
| for|  522|
| are|  414|
|  if|  411|
|  on|  401|
|  be|  358|
|  it|  357|
|  as|  343|
| can|  340|
|   i|  322|
|have|  309|
+----+-----+
only showing top 20 rows



In [61]:
# sort 와 orderby 동일한 역할
wordCountSorted_2 = wordCounts.orderBy('count',ascending=False)
wordCountSorted_2.show()

+----+-----+
|word|count|
+----+-----+
|  to| 1801|
|your| 1416|
| you| 1415|
| the| 1282|
|   a| 1187|
|  of|  960|
| and|  923|
|that|  662|
|  in|  594|
|  is|  549|
| for|  522|
| are|  414|
|  if|  411|
|  on|  401|
|  be|  358|
|  it|  357|
|  as|  343|
| can|  340|
|   i|  322|
|have|  309|
+----+-----+
only showing top 20 rows



In [6]:
spark.stop()