In [1]:
# Import findspark to locate Spark in your Python Env
import findspark
findspark.init()
findspark.find()

# Import SparkSession

from pyspark.sql import SparkSession

# Create SparkSession

spark = SparkSession.builder \
        .master("local") \
        .appName("PySpark Databricks") \
        .getOrCreate()

print("Spark Session Details:",spark)

Spark Session Details: <pyspark.sql.session.SparkSession object at 0x000001DD6D0ABA60>


In [2]:
# Range toDF
my_range = spark.range(5).toDF("number")
my_range.show()

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
+------+



In [3]:
# Using Where for Evens 
evens = my_range.where("number % 2 = 0")
evens.show()

+------+
|number|
+------+
|     0|
|     2|
|     4|
+------+



In [4]:
# Read CSV with InferSchema and Headers
# Read is a Transformation. Thus, it has Lazy Evaluation

flightData2015 = spark.read \
    .option("inferschema", "true") \
    .option("header","true") \
    .csv("Data_Files\Flights.csv")

flightData2015.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [5]:
# Explain - Physical Plan

flightData2015.sort("count").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#33 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(count#33 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=53]
      +- FileScan csv [DEST_COUNTRY_NAME#31,ORIGIN_COUNTRY_NAME#32,count#33] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/PRATIK/Documents/Practice/PySpark_Practice/Databricks_B..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




In [6]:
spark.conf.set("spark.sql.shuffle.partitions","5")      # Set to 5, Becoz Default Shuffle Partitions is 200

flightData2015.sort("count").take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='Ireland', ORIGIN_COUNTRY_NAME='Afganistan', count=2)]

In [7]:
flightData2015.rdd.getNumPartitions()

1

In [8]:
flightData2015.createOrReplaceTempView("flights_data_2015")

# SQL Way
flights_sql_res = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1)
FROM flights_data_2015
GROUP BY DEST_COUNTRY_NAME""")

flights_sql_res.show()

# Dataframe Way
flights_df_res = flightData2015 \
    .groupBy("DEST_COUNTRY_NAME") \
    .count()

flights_df_res.show()

# It does not matter which way we use, final plan which Spark complies remains the same
flights_sql_res.explain()
flights_df_res.explain()

+-----------------+--------+
|DEST_COUNTRY_NAME|count(1)|
+-----------------+--------+
|   United Kingdom|       2|
|          Ireland|       1|
|          Germany|       1|
|           Russia|       1|
|    United States|       3|
|            India|       1|
+-----------------+--------+

+-----------------+-----+
|DEST_COUNTRY_NAME|count|
+-----------------+-----+
|   United Kingdom|    2|
|          Ireland|    1|
|          Germany|    1|
|           Russia|    1|
|    United States|    3|
|            India|    1|
+-----------------+-----+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#31, 5), ENSURE_REQUIREMENTS, [plan_id=160]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#31] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/PRA

In [9]:
# Max
# SQL
spark.sql(""" SELECT MAX(count) FROM flights_data_2015 """).show()

# DF
from pyspark.sql.functions import max
flightData2015.select(max("count")).take(1)

+----------+
|max(count)|
+----------+
|       344|
+----------+



[Row(max(count)=344)]

In [10]:
max_sql = spark.sql("""
SELECT DEST_COUNTRY_NAME, SUM(count) AS Destination_Total
FROM flights_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY SUM(count) DESC
LIMIT 5
""")

max_sql.collect()

[Row(DEST_COUNTRY_NAME='United States', Destination_Total=360),
 Row(DEST_COUNTRY_NAME='United Kingdom', Destination_Total=25),
 Row(DEST_COUNTRY_NAME='Germany', Destination_Total=10),
 Row(DEST_COUNTRY_NAME='Russia', Destination_Total=5),
 Row(DEST_COUNTRY_NAME='India', Destination_Total=5)]

In [11]:
from pyspark.sql.functions import desc

max_df = flightData2015 \
    .groupBy("DEST_COUNTRY_NAME") \
    .sum("count") \
    .withColumnRenamed("sum(count)", "destination_total") \
    .sort(desc("destination_total")) \
    .limit(5)

max_df.explain()

max_df.collect()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[destination_total#121L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#31,destination_total#121L])
   +- HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[sum(count#33)])
      +- Exchange hashpartitioning(DEST_COUNTRY_NAME#31, 5), ENSURE_REQUIREMENTS, [plan_id=298]
         +- HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[partial_sum(count#33)])
            +- FileScan csv [DEST_COUNTRY_NAME#31,count#33] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/PRATIK/Documents/Practice/PySpark_Practice/Databricks_B..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>




[Row(DEST_COUNTRY_NAME='United States', destination_total=360),
 Row(DEST_COUNTRY_NAME='United Kingdom', destination_total=25),
 Row(DEST_COUNTRY_NAME='Germany', destination_total=10),
 Row(DEST_COUNTRY_NAME='Russia', destination_total=5),
 Row(DEST_COUNTRY_NAME='India', destination_total=5)]

In [12]:
#  Spark Streaming Example

staticDataFrame = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load("Data_Files/Retail.csv")

staticDataFrame.createOrReplaceTempView("retail_data")

staticSchema = staticDataFrame.schema
staticDataFrame.printSchema()

root
 |-- InvoiceNo: integer (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [13]:
spark.sql("""
SELECT DISTINCT DATE(InvoiceDate), COUNT(*)
FROM retail_data
GROUP BY 1
ORDER BY 2 DESC
""").show()

+-----------+--------+
|InvoiceDate|count(1)|
+-----------+--------+
| 2010-12-01|       6|
| 2010-12-03|       3|
| 2010-12-05|       1|
| 2010-12-02|       1|
+-----------+--------+



In [14]:
# Windowing per day based on a Business Column

from pyspark.sql.functions import window, column, col, desc

staticDataFrame \
.selectExpr(
    "CustomerId",
    "(UnitPrice*Quantity) as total_cost",
    "InvoiceDate")\
.groupBy(
    col("CustomerId"), window(col("InvoiceDate"), "1 day"))\
.sum("total_cost")\
.show(truncate=False)

+----------+------------------------------------------+------------------+
|CustomerId|window                                    |sum(total_cost)   |
+----------+------------------------------------------+------------------+
|17850.0   |{2010-12-01 05:30:00, 2010-12-02 05:30:00}|112.92000000000002|
|17850.0   |{2010-12-03 05:30:00, 2010-12-04 05:30:00}|15.3              |
|17850.0   |{2010-12-05 05:30:00, 2010-12-06 05:30:00}|11.100000000000001|
|13047.0   |{2010-12-03 05:30:00, 2010-12-04 05:30:00}|108.16            |
|17850.0   |{2010-12-02 05:30:00, 2010-12-03 05:30:00}|22.0              |
+----------+------------------------------------------+------------------+



In [15]:
# Converting above Batch logic to Stream
# Won't Run, but keeping the Syntax for References
"""
# Reading Stream
streamingDataFrame = spark.readStream\
 .schema(staticSchema)\     # Providing Schema of Streaming Data
 .option("maxFilesPerTrigger", 1)\
 .format("csv")\
 .option("header", "true")\
 .load(".../*.csv")

streamingDataFrame.isStreaming      # returns true

purchaseByCustomerPerHour = streamingDataFrame\
 .selectExpr(
 "CustomerId",
 "(UnitPrice * Quantity) as total_cost" ,
 "InvoiceDate" )\
 .groupBy(
 col("CustomerId"), window(col("InvoiceDate"), "1 day"))\
 .sum("total_cost")

# But to Run, we have to call an Action
# Streaming has different actions
purchaseByCustomerPerHour.writeStream\
 .format("memory")\     #  memory = store in-memory table | For writing to Console >> .format("console")
 .queryName("customer_purchases")\      # customer_purchases = name of the in-memory table
 .outputMode("complete")\       #  complete = all the counts should be in the table
 .start()
"""

'\n# Reading Stream\nstreamingDataFrame = spark.readStream .schema(staticSchema)\\     # Providing Schema of Streaming Data\n .option("maxFilesPerTrigger", 1) .format("csv") .option("header", "true") .load(".../*.csv")\n\nstreamingDataFrame.isStreaming      # returns true\n\npurchaseByCustomerPerHour = streamingDataFrame .selectExpr(\n "CustomerId",\n "(UnitPrice * Quantity) as total_cost" ,\n "InvoiceDate" ) .groupBy(\n col("CustomerId"), window(col("InvoiceDate"), "1 day")) .sum("total_cost")\n\n# But to Run, we have to call an Action\n# Streaming has different actions\npurchaseByCustomerPerHour.writeStream .format("memory")\\     #  memory = store in-memory table | For writing to Console >> .format("console")\n .queryName("customer_purchases")\\      # customer_purchases = name of the in-memory table\n .outputMode("complete")\\       #  complete = all the counts should be in the table\n .start()\n'

In [16]:
# Lower Level APIs - RDD
# Parallelizing RDD into a DF

from pyspark.sql import Row

spark.sparkContext.parallelize([Row(1), Row(2), Row(3)]).toDF()\
    .withColumnRenamed("_1", "Numbers")\
    .show()

+-------+
|Numbers|
+-------+
|      1|
|      2|
|      3|
+-------+



In [17]:
# lit Function - Converting to Spark Native Type

from pyspark.sql.functions import lit

df = spark.read.format("csv")\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .load("Data_Files/Retail.csv")

df.select(lit(5), lit("five"), lit(5.0))

DataFrame[5: int, five: string, 5.0: double]

In [18]:
# using Where

# Way 1 using col()

df.where(col("InvoiceNo") != 536365)\
    .select("InvoiceNo", "Description")\
    .show(5, False)

# Way 2 using Expression String
df.where("InvoiceNo != 536365")\
    .select("InvoiceNo", "Description")\
    .show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |hello                        |
+---------+-----------------------------+

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |hello                        |
+---------+-----------------------------+



In [19]:
# More Difficult Where Example
from pyspark.sql.functions import instr

priceFilter = col("Unitprice") > 1.85
descriptionFilter = instr(df.Description, "WHITE") >= 1

# col("StockCode") and df.StockCode - Ways to access a column of Dataframe
# where and filter - Are Alias in case of DF. Both works similarly for Dataframes

df.where(col("StockCode").isin("71053"))\
    .where(priceFilter | descriptionFilter)\
    .show()

df.filter(df.StockCode.isin("71053"))\
    .filter(priceFilter | descriptionFilter)\
    .show()

+---------+---------+-------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|        Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+-------------------+--------+-------------------+---------+----------+--------------+
|   536365|    71053|WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+-------------------+--------+-------------------+---------+----------+--------------+

+---------+---------+-------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|        Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+-------------------+--------+-------------------+---------+----------+--------------+
|   536365|    71053|WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+---------------

In [20]:
spark.sql("""
SELECT *
FROM retail_data
WHERE 
StockCode in ("71053") AND
(UnitPrice > 1.85 OR instr(Description, "WHITE") >= 1 )
""").show()

+---------+---------+-------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|        Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+-------------------+--------+-------------------+---------+----------+--------------+
|   536365|    71053|WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+-------------------+--------+-------------------+---------+----------+--------------+



In [21]:
# Where vs Filter
"""
Where - 
- is for Dataframes
- SQL like conditions
- Alias for Filter
Filter 
- for Dataframes and RDDs
- Supports lamda function for advanced filters
"""

# Use & for and, | for or, ~ for not In DF boolean expressions
DOTCodeFilter = col("StockCode") == "71053"
priceFilter = col("UnitPrice") > 1.85
descripFilter = instr(col("Description"), "WHITE") >= 1
df.withColumn("isExpensive",
DOTCodeFilter & (priceFilter | descripFilter))\
.where("isExpensive")\
.select("unitPrice", "isExpensive")\
.show(5)

spark.sql("""
SELECT
UnitPrice,
(StockCode = "71053" AND
(UnitPrice > 1.85 OR
instr(Description, "WHITE") >= 1)) as isExpensive
FROM retail_data
""").show()

+---------+-----------+
|unitPrice|isExpensive|
+---------+-----------+
|     3.39|       true|
+---------+-----------+

+---------+-----------+
|UnitPrice|isExpensive|
+---------+-----------+
|     2.55|      false|
|     3.39|       true|
|     2.75|      false|
|     3.39|      false|
|     3.39|      false|
|     7.65|      false|
|     4.25|      false|
|     1.85|      false|
|     1.85|      false|
|     1.69|      false|
|     1.69|      false|
+---------+-----------+



In [22]:
from pyspark.sql.functions import expr, col

df.withColumn("isExpensive", expr("NOT UnitPrice <= 1.85"))\
    .where(col("isExpensive"))\
    .select("Description", "UnitPrice", "isExpensive").show()

+--------------------+---------+-----------+
|         Description|UnitPrice|isExpensive|
+--------------------+---------+-----------+
|WHITE HANGING HEA...|     2.55|       true|
| WHITE METAL LANTERN|     3.39|       true|
|CREAM CUPID HEART...|     2.75|       true|
|KNITTED UNION FLA...|     3.39|       true|
|RED WOOLLY HOTTIE...|     3.39|       true|
|SET 7 BABUSHKA NE...|     7.65|       true|
|GLASS STAR FROSTE...|     4.25|       true|
+--------------------+---------+-----------+



In [23]:
# To perform a null safe equivalence test
df.where(col("Description").eqNullSafe("hello")).show()

+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|   536367|    84879|      hello|      32|2010-12-03 08:34:00|     1.69|   13047.0|United Kingdom|
+---------+---------+-----------+--------+-------------------+---------+----------+--------------+



In [24]:
# Mathematical Examples
# pow and expr
from pyspark.sql.functions import expr, pow

fabricatedQty = pow( (col("Quantity") * col("UnitPrice")), 2) + 5
df.select(
    expr("CustomerId"),
    fabricatedQty.alias("RealQty")
).show(3)

# Other Way
df.selectExpr(
    "CustomerId",
    "(POWER((Quantity*UnitPrice),2)+5) as RealQty"
).show(3)

+----------+------------------+
|CustomerId|           RealQty|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
|   17850.0|             489.0|
+----------+------------------+
only showing top 3 rows

+----------+------------------+
|CustomerId|           RealQty|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
|   17850.0|             489.0|
+----------+------------------+
only showing top 3 rows



In [25]:
# Round, BRound
from pyspark.sql.functions import round, bround, lit

df.select(
    round(lit("2.5")),      # Round to Ceil
    bround(lit("2.5")),     # Round to Bottom when Exact Centre Number is Rounder eg. 2.5 
    round(lit("2.4")),
    bround(lit("2.6"))
).show(2)

+-------------+--------------+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|round(2.4, 0)|bround(2.6, 0)|
+-------------+--------------+-------------+--------------+
|          3.0|           2.0|          2.0|           3.0|
|          3.0|           2.0|          2.0|           3.0|
+-------------+--------------+-------------+--------------+
only showing top 2 rows



In [26]:
# Describes a DF
df.describe().show()

+-------+------------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|         InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+------------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|                11|                11|                  11|                11|                11|                11|            11|
|   mean| 536365.5454545454| 47222.57142857143|                null|10.545454545454545|3.1318181818181823|16976.727272727272|          null|
| stddev|0.8201995322555489|31256.801504404033|                null|10.699192832766744|1.7260464546576852|1942.9091636465712|          null|
|    min|            536365|             21730|ASSORTED COLOUR B...|                 2|              1.69|           13047.0|United Kingdom|
|    max|    

In [29]:
# Monotonically Increasing ID
from pyspark.sql.functions import monotonically_increasing_id

df.select(monotonically_increasing_id()).show(5)

+-----------------------------+
|monotonically_increasing_id()|
+-----------------------------+
|                            0|
|                            1|
|                            2|
|                            3|
|                            4|
+-----------------------------+
only showing top 5 rows

