In [46]:
from pyspark.sql import SparkSession

In [47]:
spark = SparkSession.builder.appName('word_count_basics').master('local[*]').getOrCreate()

In [48]:
text_content = spark.read.text("./data/book.txt")
text_content.show(truncate=False)

+--------------------------------------------------------------------------------+
|value                                                                           |
+--------------------------------------------------------------------------------+
|Self-Employment: Building an Internet Business of One                           |
|Achieving Financial and Personal Freedom through a Lifestyle Technology Business|
|By Frank Kane                                                                   |
|                                                                                |
|                                                                                |
|                                                                                |
|Copyright � 2015 Frank Kane.                                                    |
|All rights reserved worldwide.                                                  |
|                                                                                |
|   

In [49]:
from pyspark.sql.functions import split
split_by_space_data = text_content.select(split(text_content.value, " ").alias("split_data"))
split_by_space_data.show(truncate=False)

+-------------------------------------------------------------------------------------------+
|split_data                                                                                 |
+-------------------------------------------------------------------------------------------+
|[Self-Employment:, Building, an, Internet, Business, of, One]                              |
|[Achieving, Financial, and, Personal, Freedom, through, a, Lifestyle, Technology, Business]|
|[By, Frank, Kane]                                                                          |
|[]                                                                                         |
|[]                                                                                         |
|[]                                                                                         |
|[Copyright, �, 2015, Frank, Kane., ]                                                       |
|[All, rights, reserved, worldwide.]                        

In [50]:
from pyspark.sql.functions import explode, count

exploded_df = split_by_space_data.select(explode(split_by_space_data.split_data).alias("words"))

In [51]:
grouped_df = exploded_df.select("words").groupBy("words").agg(count("words").alias("count"))
sorted_grouped_df = grouped_df.orderBy("count", ascending= False)
sorted_grouped_df.show(30)

+--------+-----+
|   words|count|
+--------+-----+
|      to| 1789|
|    your| 1339|
|     you| 1267|
|     the| 1176|
|       a| 1148|
|      of|  941|
|     and|  901|
|    that|  641|
|      in|  552|
|      is|  531|
|     for|  500|
|      on|  399|
|     are|  391|
|      be|  347|
|       I|  322|
|     can|  319|
|      it|  311|
|    have|  299|
|      as|  297|
|    with|  292|
|      or|  267|
|business|  261|
|      If|  237|
|    will|  220|
|    this|  208|
|      my|  199|
|        |  199|
|    they|  192|
|     but|  192|
|      at|  189|
+--------+-----+
only showing top 30 rows

