# SparkSession
A SparkSession can be used create DataFrame, register DataFrame as tables, execute SQL over tables, cache tables, and read parquet files.
The entry point to programming Spark with the Dataset and DataFrame API.

In [8]:
from datetime import datetime
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext

In [9]:
spark = (SparkSession.builder.appName("pyspark-dataframe-demo-{}".format(datetime.today()))
        .master("spark://spark-master:7077")      
        .getOrCreate())

sqlContext = SQLContext(spark)
# spark.sparkContext.getConf().getAll()

In [10]:
sc = spark.sparkContext
sc

# DataFrame
A distributed collection of data grouped into named columns

## From list of tuples, dictionary

In [18]:
l = [("Alice", 1)]
spark.createDataFrame(l).collect()

[Row(_1='Alice', _2=1)]

In [39]:
df = spark.createDataFrame(l, ["name", "age"])

In [40]:
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [20]:
df = spark.createDataFrame(l, ["name", "age"])
df.show()

+-----+---+
| name|age|
+-----+---+
|Alice|  1|
+-----+---+



In [21]:
d = [{"name": "Alice", "age": 1}]
spark.createDataFrame(d).collect()

[Row(age=1, name='Alice')]

In [22]:
spark.createDataFrame(d).show()

+---+-----+
|age| name|
+---+-----+
|  1|Alice|
+---+-----+



## From RDDs

In [13]:
l = [("Alice", 1)]
rdd = sc.parallelize(l)
spark.createDataFrame(rdd).collect()

[Row(_1='Alice', _2=1)]

In [14]:
# with list of column names
df = spark.createDataFrame(rdd, ["name", "age"])
df.collect()

[Row(name='Alice', age=1)]

In [15]:
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [16]:
# with Row definition
from pyspark.sql import Row
Person = Row("name", "age")
person = rdd.map(lambda r: Person(*r))
df2 = spark.createDataFrame(person)
df2.collect()

[Row(name='Alice', age=1)]

In [17]:
df2.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [18]:
# with schema definition
from pyspark.sql.types import *
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True)
  ])
df3 = spark.createDataFrame(rdd, schema)
df3.collect()

[Row(name='Alice', age=1)]

In [19]:
df3.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [20]:
# with string definition, New in version 2.0.
rdd = sc.parallelize(l)
print(spark.createDataFrame(rdd, "a: string, b: int").collect())

rdd = rdd.map(lambda row: row[1])
print(spark.createDataFrame(rdd, "int").collect())

[Row(a='Alice', b=1)]
[Row(value=1)]


## From pandas

In [21]:
import pandas
print(spark.createDataFrame(df.toPandas()).collect())
print(spark.createDataFrame(pandas.DataFrame([["Alice", 2]])).collect())

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


[Row(name='Alice', age=1)]
[Row(0='Alice', 1=2)]


In [22]:
df.toPandas().head()

Unnamed: 0,name,age
0,Alice,1


# SQLContext

In [23]:
df.show()

+-----+---+
| name|age|
+-----+---+
|Alice|  1|
+-----+---+



In [25]:
# New in version 2.0
df.createOrReplaceTempView("table1")
df2 = spark.sql("SELECT name as N, age as A from table1")
df2.show()

+-----+---+
|    N|  A|
+-----+---+
|Alice|  1|
+-----+---+



In [26]:
sqlContext.registerDataFrameAsTable(df, "table1")
sqlContext.registerDataFrameAsTable(df2, "table2")
sqlContext.tableNames()


['table1', 'table2']

In [28]:
spark.sql("SELECT * FROM table1").show()

+-----+---+
| name|age|
+-----+---+
|Alice|  1|
+-----+---+



In [29]:
sqlContext.tables().show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |   table1|      false|
|         |   table2|      false|
+---------+---------+-----------+



In [30]:
df3 = sqlContext.tables()
df3.filter("tableName = 'table1'").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |   table1|      false|
+---------+---------+-----------+



In [46]:
sqlContext.dropTempTable("table1")
sqlContext.dropTempTable("table2")

In [47]:
sqlContext.tableNames()

[]

## UDF: User Defined Function

In [48]:
sqlContext.registerFunction("stringLengthString", lambda x: len(x))
sqlContext.sql("SELECT stringLengthString('test')").collect()



[Row(stringLengthString(test)='4')]

In [31]:
from pyspark.sql.types import IntegerType
sqlContext.registerFunction("stringLengthInt", lambda x: len(x), IntegerType())
sqlContext.sql("SELECT stringLengthInt('test')").show()



+---------------------+
|stringLengthInt(test)|
+---------------------+
|                    4|
+---------------------+



In [32]:
sqlContext.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
sqlContext.sql("SELECT stringLengthInt('test')").collect()

[Row(stringLengthInt(test)=4)]

# Working with DataFrame

In [81]:
l = [("Alice", 2, 12), ("Bob", 5, 25)]
rdd = sc.parallelize(l)
df = sqlContext.createDataFrame(rdd, "name: string, age: int, height: int")
df.show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  2|    12|
|  Bob|  5|    25|
+-----+---+------+



In [57]:
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [61]:
t = spark.createDataFrame(rdd, "name: string, age: int, height: int")

In [62]:
print(type(t))

<class 'pyspark.sql.dataframe.DataFrame'>


In [82]:
df.createOrReplaceTempView("people")

df2 = spark.sql("select * from people")
df2.show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  2|    12|
|  Bob|  5|    25|
+-----+---+------+



In [66]:
df.repartition(10).rdd.getNumPartitions()

10

In [87]:
%%time
data = df.union(df).repartition("age")
data.show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  2|    12|
|Alice|  2|    12|
|  Bob|  5|    25|
|  Bob|  5|    25|
+-----+---+------+

CPU times: user 5.46 ms, sys: 223 µs, total: 5.68 ms
Wall time: 877 ms


In [92]:
%%time
data = data.repartition(7, "age")
data.show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  2|    12|
|  Bob|  5|    25|
|  Bob|  5|    25|
|Alice|  2|    12|
+-----+---+------+

CPU times: user 3.96 ms, sys: 0 ns, total: 3.96 ms
Wall time: 616 ms


In [94]:
data.rdd.getNumPartitions()

7

In [96]:
%%time
data = data.repartition("name", "age")
data.show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  2|    12|
|Alice|  2|    12|
|  Bob|  5|    25|
|  Bob|  5|    25|
+-----+---+------+

CPU times: user 5.04 ms, sys: 155 µs, total: 5.2 ms
Wall time: 483 ms


In [97]:
# withColumn(colName, col)
# Returns a new DataFrame by adding a column or replacing the existing column that has the same name.
df.withColumn("age2", df.age + 2).show()

+-----+---+------+----+
| name|age|height|age2|
+-----+---+------+----+
|Alice|  2|    12|   4|
|  Bob|  5|    25|   7|
+-----+---+------+----+



In [98]:
df.withColumnRenamed("age", "age2").show()

+-----+----+------+
| name|age2|height|
+-----+----+------+
|Alice|   2|    12|
|  Bob|   5|    25|
+-----+----+------+



In [99]:
df.select(df.age.cast("string").alias("ages")).show()

+----+
|ages|
+----+
|   2|
|   5|
+----+



In [101]:
df.select(df.age.cast(StringType()).alias("ages")).show()

+----+
|ages|
+----+
|   2|
|   5|
+----+



In [111]:
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


## Aggregate
Aggregate on the entire DataFrame without groups (shorthand for df.groupBy.agg()).

In [102]:
df.agg({"age": "max"}).show()

+--------+
|max(age)|
+--------+
|       5|
+--------+



In [103]:
from pyspark.sql import functions as F
df.agg(F.min(df.age)).show()

+--------+
|min(age)|
+--------+
|       2|
+--------+



In [105]:
gdf = df.groupBy(df.name)
gdf.agg({"*": "count"}).show()

+-----+--------+
| name|count(1)|
+-----+--------+
|  Bob|       1|
|Alice|       1|
+-----+--------+



In [107]:
data.groupBy("name").agg({"*": "count"}).show()

+-----+--------+
| name|count(1)|
+-----+--------+
|  Bob|       2|
|Alice|       2|
+-----+--------+



In [108]:
from pyspark.sql import functions as F
gdf.agg(F.min(df.age)).show()

+-----+--------+
| name|min(age)|
+-----+--------+
|Alice|       2|
|  Bob|       5|
+-----+--------+



In [109]:
gdf.agg(F.min(df.age)).explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[name#632], functions=[min(age#633)])
   +- Exchange hashpartitioning(name#632, 200), ENSURE_REQUIREMENTS, [plan_id=2442]
      +- HashAggregate(keys=[name#632], functions=[partial_min(age#633)])
         +- Project [name#632, age#633]
            +- Scan ExistingRDD[name#632,age#633,height#634]




## Alias

In [110]:
from pyspark.sql.functions import *
df_as1 = df.alias("df_as1")
df_as2 = df.alias("df_as2")
joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), "inner")
joined_df = joined_df.select("df_as1.name", "df_as2.name", "df_as2.age")
joined_df.toPandas()

Unnamed: 0,name,name.1,age
0,Alice,Alice,2
1,Bob,Bob,5


In [49]:
joined_df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   *(5) SortMergeJoin [name#4], [name#299], Inner
   :- *(3) Sort [name#4 ASC NULLS FIRST], false, 0
   :  +- AQEShuffleRead coalesced
   :     +- ShuffleQueryStage 0
   :        +- Exchange hashpartitioning(name#4, 200), ENSURE_REQUIREMENTS, [plan_id=609]
   :           +- *(1) Project [name#4]
   :              +- *(1) Filter isnotnull(name#4)
   :                 +- *(1) Scan ExistingRDD[name#4,age#5L]
   +- *(4) Sort [name#299 ASC NULLS FIRST], false, 0
      +- AQEShuffleRead coalesced
         +- ShuffleQueryStage 1
            +- Exchange hashpartitioning(name#299, 200), ENSURE_REQUIREMENTS, [plan_id=621]
               +- *(2) Filter isnotnull(name#299)
                  +- *(2) Scan ExistingRDD[name#299,age#300L]
+- == Initial Plan ==
   SortMergeJoin [name#4], [name#299], Inner
   :- Sort [name#4 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(name#4, 200), ENSURE_REQUIREMENTS, [plan_id=5

## Stats

In [112]:
df.show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  2|    12|
|  Bob|  5|    25|
+-----+---+------+



In [113]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- height: integer (nullable = true)



In [114]:
df.schema

StructType([StructField('name', StringType(), True), StructField('age', IntegerType(), True), StructField('height', IntegerType(), True)])

In [115]:
df.storageLevel

StorageLevel(False, False, False, False, 1)

In [116]:
df.count()

2

In [121]:
df.groupBy().count().show()

+-----+
|count|
+-----+
|    2|
+-----+



In [117]:
df.groupBy().sum("age").show()

+--------+
|sum(age)|
+--------+
|       7|
+--------+



In [122]:
df.groupBy().sum("age", "height").show()

+--------+-----------+
|sum(age)|sum(height)|
+--------+-----------+
|       7|         37|
+--------+-----------+



In [124]:
df.groupBy().avg("age").show()

+--------+
|avg(age)|
+--------+
|     3.5|
+--------+



In [125]:
df.groupBy().avg("age", "height").show()

+--------+-----------+
|avg(age)|avg(height)|
+--------+-----------+
|     3.5|       18.5|
+--------+-----------+



In [126]:
df.columns

['name', 'age', 'height']

In [127]:
df.name

Column<'name'>

In [128]:
df["name"]

Column<'name'>

In [151]:
t = df
t.age = df.age + 1
t.show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  2|    12|
|  Bob|  5|    25|
+-----+---+------+



In [148]:
df.show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  2|    12|
|  Bob|  5|    25|
+-----+---+------+



In [130]:
# cube(*col): Create a multi-dimensional cube for the current DataFrame using the specified columns, so we can run aggregation on them.
df.cube("name", df.age).count().orderBy("name", "age").show()

+-----+----+-----+
| name| age|count|
+-----+----+-----+
| null|null|    2|
| null|   2|    1|
| null|   5|    1|
|Alice|null|    1|
|Alice|   2|    1|
|  Bob|null|    1|
|  Bob|   5|    1|
+-----+----+-----+



In [131]:
df.describe(["age"]).show()

+-------+------------------+
|summary|               age|
+-------+------------------+
|  count|                 2|
|   mean|               3.5|
| stddev|2.1213203435596424|
|    min|                 2|
|    max|                 5|
+-------+------------------+



In [132]:
df.describe().show()

+-------+-----+------------------+-----------------+
|summary| name|               age|           height|
+-------+-----+------------------+-----------------+
|  count|    2|                 2|                2|
|   mean| null|               3.5|             18.5|
| stddev| null|2.1213203435596424|9.192388155425117|
|    min|Alice|                 2|               12|
|    max|  Bob|                 5|               25|
+-------+-----+------------------+-----------------+



In [133]:
df.distinct().count()

2

In [134]:
df.dtypes

[('name', 'string'), ('age', 'int'), ('height', 'int')]

In [135]:
df.explain()

== Physical Plan ==
*(1) Scan ExistingRDD[name#632,age#633,height#634]




In [136]:
df.explain(True)

== Parsed Logical Plan ==
LogicalRDD [name#632, age#633, height#634], false

== Analyzed Logical Plan ==
name: string, age: int, height: int
LogicalRDD [name#632, age#633, height#634], false

== Optimized Logical Plan ==
LogicalRDD [name#632, age#633, height#634], false

== Physical Plan ==
*(1) Scan ExistingRDD[name#632,age#633,height#634]



In [137]:
df.groupBy().avg().show()

+--------+-----------+
|avg(age)|avg(height)|
+--------+-----------+
|     3.5|       18.5|
+--------+-----------+



In [138]:
df.groupBy("name").agg({"age": "mean"}).show()

+-----+--------+
| name|avg(age)|
+-----+--------+
|Alice|     2.0|
|  Bob|     5.0|
+-----+--------+



In [139]:
df.groupBy(df.name).avg().show()

+-----+--------+-----------+
| name|avg(age)|avg(height)|
+-----+--------+-----------+
|  Bob|     5.0|       25.0|
|Alice|     2.0|       12.0|
+-----+--------+-----------+



In [140]:
df.groupBy(["name", df.age]).count().show()

+-----+---+-----+
| name|age|count|
+-----+---+-----+
|Alice|  2|    1|
|  Bob|  5|    1|
+-----+---+-----+



In [141]:
df.groupBy().max("age").show()

+--------+
|max(age)|
+--------+
|       5|
+--------+



In [142]:
df.groupBy().max("age", "height").show()

+--------+-----------+
|max(age)|max(height)|
+--------+-----------+
|       5|         25|
+--------+-----------+



In [143]:
df.groupBy().mean("age").show()

+--------+
|avg(age)|
+--------+
|     3.5|
+--------+



In [152]:
df.groupBy().mean("age", "height").show()

+--------+-----------+
|avg(age)|avg(height)|
+--------+-----------+
|     3.5|       18.5|
+--------+-----------+



## Join

In [153]:
df.select("age", "name").show()

+---+-----+
|age| name|
+---+-----+
|  2|Alice|
|  5|  Bob|
+---+-----+



In [154]:
df2.select("name", "height").show()

+-----+------+
| name|height|
+-----+------+
|Alice|    12|
|  Bob|    25|
+-----+------+



In [155]:
df.drop("age").show()

+-----+------+
| name|height|
+-----+------+
|Alice|    12|
|  Bob|    25|
+-----+------+



In [156]:
df.drop(df.age).show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  2|    12|
|  Bob|  5|    25|
+-----+---+------+



In [157]:
df.join(df2, df.name == df2.name, "inner").drop(df.name).drop(df.age).show()

+---+------+-----+---+------+
|age|height| name|age|height|
+---+------+-----+---+------+
|  2|    12|Alice|  2|    12|
|  5|    25|  Bob|  5|    25|
+---+------+-----+---+------+



In [158]:
df.join(df2, "name", "inner").drop("age", "height").show()

+-----+
| name|
+-----+
|Alice|
|  Bob|
+-----+



In [159]:
from pyspark.sql import Row
df = sc.parallelize([
    Row(name="Alice", age=5, height=80),
    Row(name="Alice", age=5, height=80),
    Row(name="Alice", age=10, height=80)
  ]).toDF()
df.dropDuplicates().show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
|Alice| 10|    80|
+-----+---+------+



In [160]:
df.dropDuplicates(["name", "height"]).show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
+-----+---+------+



In [161]:
df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height).show()

+-----+------+
| name|height|
+-----+------+
|Alice|    12|
|Alice|    12|
|Alice|    12|
| null|    25|
+-----+------+



In [162]:
df.join(df2, 'name', 'outer').select('name', df.height).show()

+-----+------+
| name|height|
+-----+------+
|Alice|    80|
|Alice|    80|
|Alice|    80|
|  Bob|  null|
+-----+------+



In [163]:
cond = [df.name == df2.name, df.age == df2.age]
df.join(df2, cond, 'outer').select(df.name, df2.age).show()

+-----+----+
| name| age|
+-----+----+
| null|   2|
|Alice|null|
|Alice|null|
|Alice|null|
| null|   5|
+-----+----+



In [164]:
df.join(df2, 'name').select(df.name, df2.height).show()

+-----+------+
| name|height|
+-----+------+
|Alice|    12|
|Alice|    12|
|Alice|    12|
+-----+------+



In [165]:
df.join(df2, ['name', 'age']).select(df.name, df.age).show()

+----+---+
|name|age|
+----+---+
+----+---+



## Filter

In [166]:
l = [("Alice", 2, 12), ("Bob", 5, 25)]
rdd = sc.parallelize(l)
df = sqlContext.createDataFrame(rdd, "name: string, age: int, height: int")
df.show()


+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  2|    12|
|  Bob|  5|    25|
+-----+---+------+



In [167]:
df.filter(df.age > 3).show()

+----+---+------+
|name|age|height|
+----+---+------+
| Bob|  5|    25|
+----+---+------+



In [168]:
df.filter("age > 3").show()

+----+---+------+
|name|age|height|
+----+---+------+
| Bob|  5|    25|
+----+---+------+



In [169]:
df.where("age=2").show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  2|    12|
+-----+---+------+



In [170]:
df.first()

Row(name='Alice', age=2, height=12)

In [171]:
df.head()

Row(name='Alice', age=2, height=12)

In [172]:
df.limit(1).collect()

[Row(name='Alice', age=2, height=12)]

In [173]:
df.limit(0).collect()

[]

In [174]:
# orderBy
print(df.sort(df.age.desc()).collect())
print(df.sort("age", ascending=False).collect())
print(df.orderBy(df.age.desc()).collect())

from pyspark.sql.functions import *
print(df.sort(asc("age")).collect())
print(df.sort(desc("age"), "name").collect())
print(df.orderBy(["age", "name"], ascending=[0, 1]).collect())

[Row(name='Bob', age=5, height=25), Row(name='Alice', age=2, height=12)]
[Row(name='Bob', age=5, height=25), Row(name='Alice', age=2, height=12)]
[Row(name='Bob', age=5, height=25), Row(name='Alice', age=2, height=12)]
[Row(name='Alice', age=2, height=12), Row(name='Bob', age=5, height=25)]
[Row(name='Bob', age=5, height=25), Row(name='Alice', age=2, height=12)]
[Row(name='Bob', age=5, height=25), Row(name='Alice', age=2, height=12)]


In [175]:
print(df.filter(df.name.endswith("ice")).collect())
print(df.filter(df.name.endswith("ice$")).collect())

[Row(name='Alice', age=2, height=12)]
[]


In [176]:
# get subfield RDD > RDD, gets a field by name in a StructField.
from pyspark.sql import Row
df1 = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
df1.show()

+------+
|     r|
+------+
|{1, b}|
+------+



In [177]:
df1.select(df1.r.getField("b")).show()
df1.select(df1.r.getField("a")).show()

+---+
|r.b|
+---+
|  b|
+---+

+---+
|r.a|
+---+
|  1|
+---+



In [178]:
# RDD contains list and dictionary
df1 = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
df1.show()

+------+--------------+
|     l|             d|
+------+--------------+
|[1, 2]|{key -> value}|
+------+--------------+



In [179]:
df1.select(df1.l.getItem(0), df1.d.getItem("key")).show()
df1.select(df1.l[0], df1.d["key"]).show()

+----+------+
|l[0]|d[key]|
+----+------+
|   1| value|
+----+------+

+----+------+
|l[0]|d[key]|
+----+------+
|   1| value|
+----+------+



In [180]:
from pyspark.sql import Row
df1 = sc.parallelize([Row(name=u"Tom", height=80), Row(name=u"Alice", height=None)]).toDF()
df1.show()

+-----+------+
| name|height|
+-----+------+
|  Tom|    80|
|Alice|  null|
+-----+------+



In [181]:
print(df1.filter(df1.height.isNotNull()).collect())
print(df1.filter(df1.height.isNull()).collect())

[Row(name='Tom', height=80)]
[Row(name='Alice', height=None)]


In [182]:
print(df[df.name.isin("Bob", "Mike")].collect())
print(df[df.age.isin(1, 2, 3)].collect())

[Row(name='Bob', age=5, height=25)]
[Row(name='Alice', age=2, height=12)]


In [183]:
df.filter(df.name.like("Al%")).collect()

[Row(name='Alice', age=2, height=12)]

In [184]:
from pyspark.sql import functions as F
df.select(df.name, F.when(df.age > 3, 1).otherwise(0)).show()

+-----+-------------------------------------+
| name|CASE WHEN (age > 3) THEN 1 ELSE 0 END|
+-----+-------------------------------------+
|Alice|                                    0|
|  Bob|                                    1|
+-----+-------------------------------------+

