In [None]:
# Run once, the first time, to set up the environment.
# %pip install findspark

In [None]:
# Run once, the first time, to set up the environment.
# %pip install pyspark

* The following two statements are required so that the notebook can find the Spark installation on our machine and initialize the paths/variables that Spark needs to work locally. Local Spark testing works well for smaller datasets when the laptop is well configured; with 32GB of RAM, I can test even medium-sized datasets quite easily.

In [None]:
# Make the local Spark installation discoverable from this notebook and set up
# the paths/variables Spark needs before pyspark is imported in later cells.
import findspark
findspark.init()

* This snippet runs the Spark Pi program inside the notebook (via VS Code) and displays the output. It is a good sanity test to confirm that there are no environment issues.

In [None]:
import pyspark
import random

# Obtain the SparkContext in a way that works both locally and inside Synapse.
# Synapse already defines an active context, and the notebook throws an error
# if we try to construct a second one there. SparkContext.getOrCreate() returns
# the active context when one exists and only creates a new one (named "Pi")
# otherwise, so this line no longer has to be commented/uncommented depending
# on where the notebook runs, and `sc` is always defined on a fresh kernel.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Pi"))

# Number of random points sampled for the Monte Carlo estimate of pi.
num_samples = 100000000
def inside(p):
    """Return True when a freshly drawn random point lies inside the unit circle.

    The argument `p` is ignored; it exists only so the function can be used as
    a predicate over the elements of an RDD. Two uniform samples in [0, 1) are
    drawn (x first, then y) and tested against x^2 + y^2 < 1.
    """
    px = random.random()
    py = random.random()
    squared_distance = px * px + py * py
    return squared_distance < 1
# Monte Carlo estimate of pi: distribute `num_samples` dummy elements, keep the
# ones whose random point falls inside the unit circle, and count them.
count = (
    sc.parallelize(range(0, num_samples))
    .filter(inside)
    .count()
)

# The fraction of hits approximates pi / 4 (quarter circle over unit square).
pi = 4 * count / num_samples
print(pi)

# NOTE(review): this stops whichever context `sc` refers to; inside Synapse
# that is presumably the platform-provided one -- confirm this is intended.
sc.stop()

* Now we can do some custom Spark testing to ensure that we can run computations over a dataset and get some basic aggregates.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import min
from pyspark.sql.functions import max

# Dataset location to use when running against a local Spark installation.
LOCAL_TEMPERATURES_DS_PATH = "file:///C:/Venky/AzureSynapseExperiments/datafiles/AirQualityIndexWithTemperatures_5/"

# Dataset location that needs to be used inside Synapse.
SYNAPSE_TEMPERATURES_DS_PATH = "abfss://files@venkydatalake1001.dfs.core.windows.net/temperatures/"

# Path actually read by this notebook. Previously the local path was assigned
# and then silently overwritten by the Synapse path on the next statement; the
# selection is now explicit. Switch to LOCAL_TEMPERATURES_DS_PATH when running
# locally.
TEMPERATURES_DS_PATH = SYNAPSE_TEMPERATURES_DS_PATH

# Author's note: when running inside Synapse, the session creation below
# should be commented out, otherwise it throws a security violation error.

# Build the session named "Temperatures Analytics", or reuse an existing one.
spark = (
    SparkSession.builder
    .appName("Temperatures Analytics")
    .getOrCreate()
)

print("Created spark session " + str(spark))

# Load the temperatures dataset (parquet) from the configured path and preview
# the first 10 rows.
temperatures_ds = spark.read.parquet(TEMPERATURES_DS_PATH)
temperatures_ds.show(n=10)

print("Calculating min and max temperatures per lat lng")

# Group by coordinate pair and compute the lowest and highest `temperature_2m`
# per group. `min` and `max` here are the pyspark.sql.functions aggregates
# imported at the top of this cell, not the Python builtins.
temperatures_agg = (
    temperatures_ds
    .groupBy("latitude", "longitude")
    .agg(
        min(col("temperature_2m")).alias("min_temp"),
        max(col("temperature_2m")).alias("max_temp"),
    )
)

temperatures_agg.show()