In [1]:
from pyspark.sql import SparkSession

import pyspark.sql.functions as F
import pyspark.sql.types as T

spark = SparkSession.builder.appName("wordcount").master("local[*]").getOrCreate()

In [2]:
df = spark.read.text("sample.txt")
df.show(5, False)

+-------------------------------------------------+
|value                                            |
+-------------------------------------------------+
|Hello Spark, this is a sample file.              |
|This file is for practicing Spark RDD operations.|
|Spark is powerful and Spark is fast.             |
|We will count all the words in this file.        |
|This is a simple example.                        |
+-------------------------------------------------+
only showing top 5 rows



In [3]:
df2 = df.withColumn("value_new",F.split("value",' '))\
        .withColumn("all", F.explode("value_new"))
df.show(5)

+--------------------+
|               value|
+--------------------+
|Hello Spark, this...|
|This file is for ...|
|Spark is powerful...|
|We will count all...|
|This is a simple ...|
+--------------------+
only showing top 5 rows



In [4]:
df3 =df2.select(F.regexp_replace("all","[^A-Za-z0-9]","").alias("words"))
df3.show(5)

+-----+
|words|
+-----+
|Hello|
|Spark|
| this|
|   is|
|    a|
+-----+
only showing top 5 rows



In [5]:
df3.groupBy("words").count().orderBy(F.col("count").desc()).show()

+----------+-----+
|     words|count|
+----------+-----+
|        is|    5|
|     Spark|    5|
|      file|    3|
|    simple|    3|
|         a|    2|
|     Hello|    2|
|      This|    2|
|      this|    2|
|   example|    2|
|practicing|    1|
|      will|    1|
|       for|    1|
|     count|    1|
|         A|    1|
|  powerful|    1|
|     words|    1|
|       RDD|    1|
|      fast|    1|
|       the|    1|
|        in|    1|
+----------+-----+
only showing top 20 rows



In [6]:
spark.stop()

# Using rdd

In [8]:
spark = SparkSession.builder.appName("wordcount").master("local[*]").getOrCreate()
sc = spark.sparkContext

In [9]:
rdd = sc.textFile("sample.txt")
rdd.collect()

['Hello Spark, this is a sample file.',
 'This file is for practicing Spark RDD operations.',
 'Spark is powerful and Spark is fast.',
 'We will count all the words in this file.',
 'This is a simple example.',
 'A simple, simple example.',
 'Hello again, Spark.']

In [12]:
rdd1 = rdd.flatMap(lambda x: x.split(' '))
rdd1.take(10)

['Hello', 'Spark,', 'this', 'is', 'a', 'sample', 'file.', 'This', 'file', 'is']

In [13]:
rdd2 = rdd1.map(lambda x: (x,1))
rdd2.take(5)

[('Hello', 1), ('Spark,', 1), ('this', 1), ('is', 1), ('a', 1)]

In [15]:
rdd3 = rdd2.reduceByKey(lambda x,y:x+y)
rdd3.take(10)

[('Hello', 2),
 ('Spark,', 1),
 ('this', 2),
 ('is', 5),
 ('file.', 2),
 ('practicing', 1),
 ('Spark', 3),
 ('operations.', 1),
 ('count', 1),
 ('in', 1)]

In [16]:
spark.stop()