In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

In [2]:
# Build the SparkSession for this demo (or reuse one that already exists in this
# kernel), registered as "DataFrame-Demo" on the standalone master at spark-master:7077.
spark = (
    SparkSession.builder
    .appName("DataFrame-Demo")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/17 21:48:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Using RDDs

In [34]:
# Load the text file from HDFS as an RDD of strings, one element per line.
rdd = spark.sparkContext.textFile(
    "hdfs://namenode:9000/input_files/words.txt"
)

In [35]:
# Word count with the RDD API: tokenize every line, emit a (word, 1) pair per token,
# sum the ones per word, then order the (word, count) pairs by count, highest first.
#
# str.split() with no separator splits on runs of whitespace and never returns empty
# strings. line.split(" ") returns [''] for a blank line (and '' between repeated
# spaces), which made the empty string show up as a counted "word".
result_rdd = (
    rdd.flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda pair: pair[1], ascending=False)
)

In [36]:
# Preview only the first few lines. collect() would ship every line of the file to
# the driver and dump it all into the notebook output, which does not scale past
# toy inputs; take(n) bounds both.
rdd.take(3)

["Gone with the Wind is a 1939 American epic historical romance film adapted from the 1936 novel by Margaret Mitchell. The film was produced by David O. Selznick of Selznick International Pictures and directed by Victor Fleming. Set in the American South against the backdrop of the Civil War and the Reconstruction era, the film tells the story of Scarlett O'Hara (Vivien Leigh), the strong-willed daughter of a Georgia plantation owner, following her romantic pursuit of Ashley Wilkes (Leslie Howard), who is married to his cousin, Melanie Hamilton (Olivia de Havilland), and her subsequent marriage to Rhett Butler (Clark Gable).",
 '',
 "The film had a troubled production. The start of filming was delayed for two years until January 1939 because of Selznick's determination to secure Gable for the role of Rhett, and concluded in July. The role of Scarlett was difficult to cast, and 1,400 unknown women were interviewed for the part. Sidney Howard's original screenplay underwent many revision

In [37]:
# Fetch the first five elements of result_rdd, i.e. the five (word, count) pairs
# with the highest counts, since the RDD was sorted by count in descending order.
result_rdd.take(5)

[('the', 31), ('of', 15), ('and', 12), ('in', 10), ('to', 10)]

### Using DataFrames

In [39]:
# Read the file as a DataFrame with a single string column, `value`, one row per line.
df = spark.read.text("hdfs://namenode:9000/input_files/data.txt")

# Word count with the DataFrame API: split each line on spaces and explode to one row
# per token, then count rows per word and order by that count, highest first.
# Blank lines and repeated spaces produce empty-string tokens from split(); they are
# filtered out so that '' is not reported as a word.
result_df = (
    df.selectExpr("explode(split(value, ' ')) as word")
    .where("word != ''")
    .groupBy("word")
    .count()
    .orderBy(desc("count"))
)

In [40]:
result_df.take(10)

                                                                                

[Row(word='the', count=12),
 Row(word='a', count=7),
 Row(word='of', count=7),
 Row(word='in', count=5),
 Row(word='distributed', count=5),
 Row(word='Spark', count=4),
 Row(word='API', count=3),
 Row(word='RDD', count=3),
 Row(word='is', count=3),
 Row(word='on', count=3)]

In [41]:
# Stop the SparkSession now that the demo is finished.
spark.stop()