In [1]:
# findspark is used to locate Spark for this Python environment
# before anything from pyspark is imported.
import findspark

findspark.init()
findspark.find()  # result is discarded here

from pyspark.sql import SparkSession

# Configure the builder step by step (local master, fixed application name),
# then obtain the session: an existing one is reused, otherwise one is created.
session_builder = SparkSession.builder
session_builder = session_builder.master("local")
session_builder = session_builder.appName("PySpark Databricks")
spark = session_builder.getOrCreate()

print("Spark Session Details:", spark)

Spark Session Details: <pyspark.sql.session.SparkSession object at 0x000001B86AA8F9A0>


In [2]:
# Build a five-row range and expose it as a DataFrame whose single
# column is called "number", then display it.
five_ids = spark.range(5)
my_range = five_ids.toDF("number")
my_range.show()

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
+------+



In [3]:
# Filter with where(): keep only rows whose "number" is divisible by two.
even_condition = "number % 2 = 0"
evens = my_range.where(even_condition)
evens.show()

+------+
|number|
+------+
|     0|
|     2|
|     4|
+------+



In [4]:
# Read the flights CSV, using the first line as the header and letting
# Spark infer the column types.
#
# The rows themselves are only fetched when an action runs (take(3) below).
# NOTE: with schema inference switched on, Spark has to look at the file
# while the DataFrame is being defined in order to work out the column
# types, so this read is not entirely lazy.
#
# The path uses a forward slash: the previous "Data_Files\Flights.csv"
# contained the invalid escape sequence "\F" in a normal string literal and
# only resolved to the intended file on Windows.
flightData2015 = (
    spark.read
    .option("inferschema", "true")
    .option("header", "true")
    .csv("Data_Files/Flights.csv")
)

flightData2015.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [5]:
# Show the physical plan Spark builds for ordering the flights by "count".
# explain() only prints the plan; it does not run the sort.
flights_by_count = flightData2015.sort("count")
flights_by_count.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#33 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(count#33 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=53]
      +- FileScan csv [DEST_COUNTRY_NAME#31,ORIGIN_COUNTRY_NAME#32,count#33] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/PRATIK/Documents/Practice/PySpark_Practice/Databricks_B..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




In [6]:
# Use 5 shuffle partitions instead of the default of 200.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Sort by "count" and fetch the first two rows.
flights_sorted = flightData2015.sort("count")
flights_sorted.take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='Ireland', ORIGIN_COUNTRY_NAME='Afganistan', count=2)]

In [7]:
# Number of partitions of the RDD underlying the flights DataFrame.
flights_rdd = flightData2015.rdd
flights_rdd.getNumPartitions()

1

In [8]:
# Register the DataFrame as a temporary view so it can be queried with SQL.
flightData2015.createOrReplaceTempView("flights_data_2015")

# --- SQL way: count rows per destination country ---
group_count_query = """
SELECT DEST_COUNTRY_NAME, count(1)
FROM flights_data_2015
GROUP BY DEST_COUNTRY_NAME"""
flights_sql_res = spark.sql(group_count_query)

flights_sql_res.show()

# --- DataFrame way: the same aggregation through the DataFrame API ---
flights_grouped = flightData2015.groupBy("DEST_COUNTRY_NAME")
flights_df_res = flights_grouped.count()

flights_df_res.show()

# Whichever way the query is written, the plan Spark compiles should be
# the same; print both plans so they can be compared.
flights_sql_res.explain()
flights_df_res.explain()

+-----------------+--------+
|DEST_COUNTRY_NAME|count(1)|
+-----------------+--------+
|   United Kingdom|       2|
|          Ireland|       1|
|          Germany|       1|
|           Russia|       1|
|    United States|       3|
|            India|       1|
+-----------------+--------+

+-----------------+-----+
|DEST_COUNTRY_NAME|count|
+-----------------+-----+
|   United Kingdom|    2|
|          Ireland|    1|
|          Germany|    1|
|           Russia|    1|
|    United States|    3|
|            India|    1|
+-----------------+-----+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#31, 5), ENSURE_REQUIREMENTS, [plan_id=160]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#31] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/PRA

In [9]:
# Maximum value of the "count" column, computed two ways.

# SQL
spark.sql(""" SELECT MAX(count) FROM flights_data_2015 """).show()

# DataFrame
# Import the functions module under an alias instead of
# `from pyspark.sql.functions import max`: that form rebinds the name `max`
# in the notebook namespace and hides Python's built-in max() from every
# cell executed afterwards.
from pyspark.sql import functions as F
flightData2015.select(F.max("count")).take(1)

+----------+
|max(count)|
+----------+
|       344|
+----------+



[Row(max(count)=344)]

In [10]:
# Top five destination countries by summed "count", expressed in SQL
# against the flights_data_2015 view.
top_destinations_query = """
SELECT DEST_COUNTRY_NAME, SUM(count) AS Destination_Total
FROM flights_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY SUM(count) DESC
LIMIT 5
"""
max_sql = spark.sql(top_destinations_query)

max_sql.collect()

[Row(DEST_COUNTRY_NAME='United States', Destination_Total=360),
 Row(DEST_COUNTRY_NAME='United Kingdom', Destination_Total=25),
 Row(DEST_COUNTRY_NAME='Germany', Destination_Total=10),
 Row(DEST_COUNTRY_NAME='Russia', Destination_Total=5),
 Row(DEST_COUNTRY_NAME='India', Destination_Total=5)]

In [15]:
from pyspark.sql.functions import desc
from pyspark.sql import functions as F

# Top five destination countries by summed "count", via the DataFrame API.
#
# The total is named directly in the aggregation with alias() rather than
# renaming Spark's auto-generated "sum(count)" column afterwards:
# withColumnRenamed() does nothing when the source name does not match, so
# the rename would be skipped silently and the sort on "destination_total"
# would then fail.
max_df = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .agg(F.sum("count").alias("destination_total"))
    .sort(desc("destination_total"))
    .limit(5)
)

# Print the physical plan, then fetch the five rows.
max_df.explain()

max_df.collect()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[destination_total#155L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#31,destination_total#155L])
   +- HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[sum(count#33)])
      +- Exchange hashpartitioning(DEST_COUNTRY_NAME#31, 5), ENSURE_REQUIREMENTS, [plan_id=360]
         +- HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[partial_sum(count#33)])
            +- FileScan csv [DEST_COUNTRY_NAME#31,count#33] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/PRATIK/Documents/Practice/PySpark_Practice/Databricks_B..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>




[Row(DEST_COUNTRY_NAME='United States', destination_total=360),
 Row(DEST_COUNTRY_NAME='United Kingdom', destination_total=25),
 Row(DEST_COUNTRY_NAME='Germany', destination_total=10),
 Row(DEST_COUNTRY_NAME='Russia', destination_total=5),
 Row(DEST_COUNTRY_NAME='India', destination_total=5)]