In [2]:
# findspark.init() makes the pyspark package importable in this kernel,
# so this cell has to run before the cell that imports from pyspark.
# NOTE(review): presumably unnecessary when pyspark is pip-installed into the
# kernel's environment — confirm before removing.
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

In [6]:
# Build the notebook's SparkSession. getOrCreate() hands back an already
# active session when one exists, so re-running this cell is safe.
spark = (
    SparkSession.builder
    .appName("DataFrame Demo")
    # Executor memory setting for the demo.
    .config("spark.executor.memory", "2g")
    # Number of partitions used for DataFrame shuffles (the groupBy further down).
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

In [7]:
# Bare expression: renders the session object as the cell output.
spark

## **Using RDDs**

In [11]:
# Requires the `spark` session built in an earlier cell.
# Load the input file as an RDD with one element per line of text.
rdd = spark.sparkContext.textFile("./data/data.txt")

# Word count: split each line on single spaces, pair every token with 1,
# add the 1s up per token, then order the pairs by count, largest first.
# The sort key is the count alone, so tokens with equal counts come back in
# no guaranteed order.
result_rdd = (
    rdd.flatMap(lambda text_line: text_line.split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# take() is an action: it triggers the computation and returns the first
# ten (token, count) pairs as the cell output.
result_rdd.take(10)

[('is', 3),
 ('Spark', 3),
 ('text', 2),
 ('for', 2),
 ('distributed', 2),
 ('of', 2),
 ('This', 2),
 ('a', 2),
 ('RDD', 2),
 ('sample', 1)]

## **Using DataFrames**

In [12]:
# Requires the `spark` session built in an earlier cell.
# Read the same file as a DataFrame; the expression below addresses each
# line of text through the `value` column.
df = spark.read.text("./data/data.txt")

# Word count with the DataFrame API: split every line on single spaces,
# explode the resulting array into one row per word, count rows per word,
# and order by that count, largest first.
result_df = (
    df.selectExpr("explode(split(value, ' ')) as word")
    .groupBy("word")
    .count()
    .orderBy(desc("count"))
)

In [13]:
# Return the first ten Row(word, count) records as the cell output.
# Ordering is by count only, so words with equal counts may appear in a
# different order than in the RDD version.
result_df.take(10)

[Row(word='is', count=3),
 Row(word='Spark', count=3),
 Row(word='for', count=2),
 Row(word='RDD', count=2),
 Row(word='text', count=2),
 Row(word='distributed', count=2),
 Row(word='a', count=2),
 Row(word='This', count=2),
 Row(word='of', count=2),
 Row(word='file', count=1)]

In [14]:
# Stop the Spark session now that the demo is finished.
spark.stop()