# Study Guide

* Quickstart: https://spark.apache.org/docs/latest/quick-start.html
    * spark shell
    * caching
    * Spark Session: ``SparkSession.builder.appName("SimpleApp").getOrCreate()``
* https://spark.apache.org/docs/latest/rdd-programming-guide.html
    * Abstractions:
        * RDD
        * shared variables: (_broadcast_, _accumulators_)
        * DataFrame vs Datasets (Python vs strong typing)
* Pyspark docs
    * https://spark.apache.org/docs/latest/api/python/index.html
        * pyspark-connect
* Pyspark Pandas API (`pyspark.pandas.DataFrame`)
    * https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_ps.html
* Testing
    * https://spark.apache.org/docs/latest/api/python/getting_started/testing_pyspark.html

In [7]:
## Settings
# sudo apt update
# sudo apt install openjdk-17-jdk
# !pip install pyspark
# export JAVA_HOME=$(dirname $(dirname $(readlink -f $(which java))))
# !pip install pyspark-connect
# !pip install pyspark-client

# First Contact

## Session

In [1]:
# Creating a session
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Test").getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/01 08:02:46 WARN Utils: Your hostname, WIN-NJTBBD8GS0T, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/09/01 08:02:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/01 08:02:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## DataFrame API

In [None]:
# Dataframe from pandas
import pandas as pd

df = pd.DataFrame({"a": [1], "b": [2]})

# We can do a lot of things from session object
dfspark = spark.createDataFrame(df)
dfspark.show(1)

                                                                                

+---+---+
|  a|  b|
+---+---+
|  1|  2|
+---+---+



In [None]:
# Creating a dataframe

from pyspark.sql import Row

df_rows = spark.createDataFrame([
    Row(a=1, b=2, c=3),
    Row(a=1, b=2, c=3),
    Row(a=1, b=2, c=3)],
    schema='a string, b string, c int'
)
df_rows.show(3)

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  1|  2|  3|
|  1|  2|  3|
+---+---+---+



In [8]:
df_rows.show(3, vertical=True)

-RECORD 0--
 a   | 1   
 b   | 2   
 c   | 3   
-RECORD 1--
 a   | 1   
 b   | 2   
 c   | 3   
-RECORD 2--
 a   | 1   
 b   | 2   
 c   | 3   



In [None]:
# Displaying options
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
# spark.sql.repl.eagerEval.maxNumRows for max number of rows to show
# spark.conf.set("spark.sql.repl.eagerEval.enabled", False)
df_rows

a,b,c
1,2,3
1,2,3
1,2,3


In [10]:
display(df_rows.printSchema()), display(df_rows.columns)

root
 |-- a: string (nullable = true)
 |-- b: string (nullable = true)
 |-- c: integer (nullable = true)



None

['a', 'b', 'c']

(None, None)

In [11]:
df_rows.describe().show()

[Stage 15:>                                                       (0 + 12) / 12]

+-------+---+---+---+
|summary|  a|  b|  c|
+-------+---+---+---+
|  count|  3|  3|  3|
|   mean|1.0|2.0|3.0|
| stddev|0.0|0.0|0.0|
|    min|  1|  2|  3|
|    max|  1|  2|  3|
+-------+---+---+---+



                                                                                

In [None]:
# Collect distributed data to local. Risk of out-of-memory error. Not lazy operation
df_rows.collect()

[Row(a='1', b='2', c=3), Row(a='1', b='2', c=3), Row(a='1', b='2', c=3)]

In [13]:
# pd.DataFrame.head(), tail() equivalent
df_rows.take(2), df_rows.tail(1)

([Row(a='1', b='2', c=3), Row(a='1', b='2', c=3)], [Row(a='1', b='2', c=3)])

In [14]:
df_rows.toPandas()

Unnamed: 0,a,b,c
0,1,2,3
1,1,2,3
2,1,2,3


In [15]:
# Lazy!
df_rows.a

Column<'a'>

In [16]:
df_rows.printSchema()

root
 |-- a: string (nullable = true)
 |-- b: string (nullable = true)
 |-- c: integer (nullable = true)



### Pandas + Spark: The power of UDFs (User Defined Functions)

In [19]:
# "apply" example: user defined function (UDF)
import pandas as pd
from pyspark.sql.functions import pandas_udf

@pandas_udf('long')
def pandas_plus_one(series: pd.Series) -> pd.Series:
    # Simply plus one by using pandas Series.
    return series + 1

df_rows.select(pandas_plus_one(df_rows.c)).show()



+------------------+
|pandas_plus_one(c)|
+------------------+
|                 4|
|                 4|
|                 4|
+------------------+



                                                                                

In [23]:
df_groupby = spark.createDataFrame(
    [
        Row(a=1, b=2, c=3),
        Row(a=1, b=2, c=3),
        Row(a=1, b=2, c=3),
        Row(a=2, b=20, c=30),
        Row(a=2, b=20, c=30),
        Row(a=2, b=20, c=30),
        Row(a=3, b=200, c=300),
        Row(a=3, b=200, c=300),
        Row(a=3, b=200, c=300),
    ],
    schema='a int, b int, c int'
)

In [26]:
df_groupby.groupby("a").avg().show()

+---+------+------+------+
|  a|avg(a)|avg(b)|avg(c)|
+---+------+------+------+
|  1|   1.0|   2.0|   3.0|
|  2|   2.0|  20.0|  30.0|
|  3|   3.0| 200.0| 300.0|
+---+------+------+------+



In [32]:
df_groupby.schema

StructType([StructField('a', IntegerType(), True), StructField('b', IntegerType(), True), StructField('c', IntegerType(), True)])

In [35]:
def plus_mean(pandas_df):
    return pandas_df.assign(v1=pandas_df.b - pandas_df.c.mean())

from pyspark.sql.types import StructType, StructField, DoubleType, StringType

schema = StructType([
    StructField("a", DoubleType()),
    StructField("b", DoubleType()),
    StructField("c", DoubleType()),
    StructField("v1", DoubleType()),
])

df_groupby.groupby('a').applyInPandas(plus_mean, schema=schema).show()

+---+-----+-----+------+
|  a|    b|    c|    v1|
+---+-----+-----+------+
|1.0|  2.0|  3.0|  -1.0|
|1.0|  2.0|  3.0|  -1.0|
|1.0|  2.0|  3.0|  -1.0|
|2.0| 20.0| 30.0| -10.0|
|2.0| 20.0| 30.0| -10.0|
|2.0| 20.0| 30.0| -10.0|
|3.0|200.0|300.0|-100.0|
|3.0|200.0|300.0|-100.0|
|3.0|200.0|300.0|-100.0|
+---+-----+-----+------+



## Spark SQL API

In [None]:
# As SQL

df_groupby.createOrReplaceTempView("table_groupby")

In [39]:
spark.sql("SELECT COUNT(1) FROM table_groupby")

count(1)
9


In [42]:
@pandas_udf("integer")
def add_one(s: pd.Series) -> pd.Series:
    return s + 1

spark.udf.register("add_one", add_one)
spark.sql("SELECT add_one(a) FROM table_groupby").show()

25/09/01 08:16:44 WARN SimpleFunctionRegistry: The function add_one replaced a previously registered function.


+----------+
|add_one(a)|
+----------+
|         2|
|         2|
|         2|
|         3|
|         3|
|         3|
|         4|
|         4|
|         4|
+----------+



                                                                                