# Practice Apache Spark (PySpark)

[Learning Source](https://www.linkedin.com/company/justenough-spark/)

In [None]:
# Oct-19-2021 - WINDOW Functions

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, row_number,avg, sum, min, max, rank, dense_rank, percent_rank, ntile, cume_dist, lag, lead
from pyspark.sql.types import StructField, StructType, IntegerType,DoubleType, StringType

# Obtain (or reuse) the Spark session shared by every example below.
spark = (
    SparkSession.builder
    .appName("Spark Question Bank - JustEnough Spark(Linkedin)")
    .getOrCreate()
)

# Schema of the sample data: two string keys plus two population figures
# (column names say they are in millions). Every column is nullable.
df_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Country", StringType()),
        ("StateName", StringType()),
        ("OverallPopulationInMillion", DoubleType()),
        ("VaccinatedPopulationInMillion", DoubleType()),
    )
])

# Sample rows grouped by country as (StateName, overall, vaccinated).
# NOTE(review): Kerala's vaccinated figure (3.7) is larger than its overall
# population (3.5), which yields a percentage above 100, and 'Machester'
# looks like a misspelling of 'Manchester' - confirm the sample values.
_states_by_country = {
    'USA': [
        ('California', 39.55, 28.89),
        ('Seattle', 0.79, 0.76),
        ('Texas', 29.25, 15.17),
    ],
    'India': [
        ('Maharastra', 12.24, 9.2),
        ('Kerala', 3.5, 3.7),
        ('Andhra Pradesh', 5.2, 4.7),
        ('TamilNadu', 7.5, 5.3),
    ],
    'UK': [
        ('London', 8.96, 7.23),
        ('Liverpool', 0.96, 0.87),
        ('Machester', 2.96, 2.03),
    ],
    'China': [
        ('Beijing', 20.89, 19.37),
    ],
    'France': [
        ('Paris', 2.18, 1.99),
    ],
}

# Flatten to (Country, StateName, overall, vaccinated) tuples; dict insertion
# order keeps the rows in the order they are listed above.
df_data = [
    (country, *state_row)
    for country, state_rows in _states_by_country.items()
    for state_row in state_rows
]

vaccines_df = spark.createDataFrame(data=df_data, schema=df_schema)

# Derive VaccinationPercentage = vaccinated / overall * 100, rounded to two
# decimals. The rounding must be Spark's column function: Python's builtin
# round() cannot be applied to a Column (Column defines no __round__) and
# raises TypeError, and no Spark `round` is imported by name in this file.
vaccination_ratio = col('VaccinatedPopulationInMillion') / col('OverallPopulationInMillion')
vaccines_df = vaccines_df.withColumn(
    'VaccinationPercentage',
    F.round(vaccination_ratio * 100, 2),
)

vaccines_df.printSchema()

print(' >>>>>  WINDOW RANKING FUNCTIONS  START <<<<< ')

# ROW_NUM: sequential row numbers in StateName order, first across the whole
# DataFrame and then restarting inside each Country partition.

print(' >>>>>  ROW_NUM Plain SQL  <<<<< ')
vaccines_df.createOrReplaceTempView('VACCINATION_VW')
row_num_overall_sql = 'select *, row_number() over(order by StateName) as ROW_NUM from VACCINATION_VW'
row_num_per_country_sql = 'select *, row_number() over(partition by Country order by StateName) as ROW_NUM from VACCINATION_VW'
spark.sql(row_num_overall_sql).show()
spark.sql(row_num_per_country_sql).show()

print(' >>>>>  ROW_NUM Spark SQL  <<<<< ')
# Same two windows through the DataFrame API; after the loop
# row_window_spec is left bound to the per-Country specification.
for row_window_spec in (
    Window.orderBy('StateName'),
    Window.partitionBy('Country').orderBy('StateName'),
):
    vaccines_df.withColumn('ROW_NUM', row_number().over(row_window_spec)).show()


# RANK: rank rows by VaccinationPercentage - ascending over the whole
# DataFrame, then descending within each Country.
print(' >>>>>  RANK Plain SQL <<<<< ')
vaccines_df.createOrReplaceTempView('VACCINATION_VW')
for rank_sql in (
    'select *, rank() over(order by VaccinationPercentage) as RANK from VACCINATION_VW',
    'select *, rank() over(partition by Country order by VaccinationPercentage desc) as RANK from VACCINATION_VW',
):
    spark.sql(rank_sql).show()

print(' >>>>>  RANK Spark SQL  <<<<< ')
# DataFrame-API equivalent of the two queries above; rank_window_spec ends up
# bound to the per-Country, descending specification.
vaccination_pct = col('VaccinationPercentage')
for rank_window_spec in (
    Window.orderBy(vaccination_pct),
    Window.partitionBy('Country').orderBy(vaccination_pct.desc()),
):
    vaccines_df.withColumn('RANK', rank().over(rank_window_spec)).show()



# DENSE RANK: same orderings as the RANK example, using dense_rank().
print(' >>>>> DENSE RANK Plain SQL <<<<< ')
vaccines_df.createOrReplaceTempView('VACCINATION_VW')
dense_rank_overall = spark.sql(
    'select *, dense_rank() over(order by VaccinationPercentage) as DENSE_RANK from VACCINATION_VW'
)
dense_rank_overall.show()
dense_rank_per_country = spark.sql(
    'select *, dense_rank() over(partition by Country order by VaccinationPercentage desc) as DENSE_RANK from VACCINATION_VW'
)
dense_rank_per_country.show()

print(' >>>>> DENSE RANK Spark SQL  <<<<< ')
# Whole-DataFrame window, ascending percentage.
rank_window_spec = Window.orderBy(col('VaccinationPercentage'))
dense_rank_overall = vaccines_df.withColumn(
    'DENSE_RANK', dense_rank().over(rank_window_spec)
)
dense_rank_overall.show()

# Per-Country window, descending percentage.
rank_window_spec = Window.partitionBy('Country').orderBy(
    col('VaccinationPercentage').desc()
)
dense_rank_per_country = vaccines_df.withColumn(
    'DENSE_RANK', dense_rank().over(rank_window_spec)
)
dense_rank_per_country.show()

# PERCENT RANK: percent_rank() of VaccinationPercentage, ascending over the
# whole DataFrame and descending within each Country.
print(' >>>>> PERCENT RANK Plain SQL <<<<< ')
vaccines_df.createOrReplaceTempView('VACCINATION_VW')
for percent_rank_sql in (
    'select Country, StateName, VaccinationPercentage, percent_rank() over(order by VaccinationPercentage) as PERCENT_RANK from VACCINATION_VW',
    'select Country, StateName, VaccinationPercentage, percent_rank() over(partition by Country order by VaccinationPercentage desc) as PERCENT_RANK from VACCINATION_VW',
):
    spark.sql(percent_rank_sql).show()

print(' >>>>> PERCENT RANK Spark SQL  <<<<< ')
# The DataFrame version keeps every column of vaccines_df (the SQL version
# selects only three) and appends PERCENT_RANK.
for rank_window_spec in (
    Window.orderBy('VaccinationPercentage'),
    Window.partitionBy('Country').orderBy(col('VaccinationPercentage').desc()),
):
    vaccines_df.withColumn(
        'PERCENT_RANK', percent_rank().over(rank_window_spec)
    ).show()


# NTILE: split the ordered rows into 5 buckets, over the whole DataFrame
# (ascending percentage) and within each Country (descending percentage).
print(' >>>>> NTILE Plain SQL <<<<< ')
vaccines_df.createOrReplaceTempView('VACCINATION_VW')
ntile_overall_sql = 'select Country, StateName, VaccinationPercentage, ntile(5) over(order by VaccinationPercentage) as NTILE from VACCINATION_VW'
ntile_per_country_sql = 'select Country, StateName, VaccinationPercentage, ntile(5) over(partition by Country order by VaccinationPercentage desc) as NTILE from VACCINATION_VW'
spark.sql(ntile_overall_sql).show()
spark.sql(ntile_per_country_sql).show()

print(' >>>>> NTILE Spark SQL  <<<<< ')
# Same bucket count as the SQL queries above.
pct_column = col('VaccinationPercentage')
for rank_window_spec in (
    Window.orderBy(pct_column),
    Window.partitionBy('Country').orderBy(pct_column.desc()),
):
    vaccines_df.withColumn('NTILE', ntile(5).over(rank_window_spec)).show()


print(' >>>>>  WINDOW RANKING FUNCTIONS  END <<<<< ')


print(' >>>>>  WINDOW ANALYTIC FUNCTIONS  START <<<<< ')

# cume_dist: cumulative distribution, ordered by Country over the whole
# DataFrame and by StateName within each Country.
print(' >>>>> cume_dist Plain SQL <<<<< ')
vaccines_df.createOrReplaceTempView('VACCINATION_VW')
for cume_dist_sql in (
    'select Country, StateName, VaccinationPercentage, cume_dist() over(order by Country) as cume_dist from VACCINATION_VW',
    'select Country, StateName, VaccinationPercentage, cume_dist() over(partition by Country order by StateName) as cume_dist from VACCINATION_VW',
):
    spark.sql(cume_dist_sql).show()

print(' >>>>> cume_dist Spark SQL  <<<<< ')
# Whole-DataFrame window ordered by Country.
rank_window_spec = Window.orderBy('Country')
cume_dist_overall = vaccines_df.withColumn(
    'cume_dist', cume_dist().over(rank_window_spec)
)
cume_dist_overall.show()

# Per-Country window ordered by StateName.
rank_window_spec = Window.partitionBy('Country').orderBy(col('StateName'))
cume_dist_per_country = vaccines_df.withColumn(
    'cume_dist', cume_dist().over(rank_window_spec)
)
cume_dist_per_country.show()

# LAG & LEAD: show the Country value of the previous / next row (offset 1),
# ordered by Country over the whole DataFrame and by StateName per Country.
print(' >>>>> LAG & LEAD Plain SQL <<<<< ')
vaccines_df.createOrReplaceTempView('VACCINATION_VW')
lag_lead_overall_sql = 'select Country, StateName, VaccinationPercentage, lag(Country) over(order by Country) as LAG, lead(Country) over(order by Country) as LEAD from VACCINATION_VW'
lag_lead_per_country_sql = 'select Country, StateName, VaccinationPercentage, lag(Country, 1) over(partition by Country order by StateName) as LAG,lead(Country, 1) over(partition by Country order by StateName) as LEAD from VACCINATION_VW'
spark.sql(lag_lead_overall_sql).show()
spark.sql(lag_lead_per_country_sql).show()

print(' >>>>> LAG & LEAD Spark SQL  <<<<< ')
# An offset of 1 is lag()/lead()'s default, so one loop body serves both
# windows; rank_window_spec ends up bound to the per-Country specification.
for rank_window_spec in (
    Window.orderBy('Country'),
    Window.partitionBy('Country').orderBy(col('StateName')),
):
    (
        vaccines_df
        .withColumn('LAG', lag('Country', 1).over(rank_window_spec))
        .withColumn('LEAD', lead('Country', 1).over(rank_window_spec))
        .show()
    )


print(' >>>>>  WINDOW ANALYTIC FUNCTIONS  END <<<<< ')


print(' >>>>>  WINDOW AGGREGATE FUNCTIONS  START <<<<< ')

# Per-Country MIN / MAX / AVG of VaccinationPercentage computed as window
# aggregates, then de-duplicated to one row per Country.
#
# The windows are intentionally left unordered. Ordering by the partition key
# itself sorts nothing (every row in a partition ties), and adding an ORDER BY
# switches Spark's default frame from the whole partition to a running frame
# (unbounded preceding .. current row). Omitting it gives the same per-Country
# values and keeps the SQL and DataFrame variants identical.

print(' >>>>> AGGREGATE Plain SQL <<<<< ')
vaccines_df.createOrReplaceTempView('VACCINATION_VW')
spark.sql(
    'select distinct Country, '
    'min(VaccinationPercentage) over(partition by Country) as MIN_VACC, '
    'max(VaccinationPercentage) over(partition by Country) as MAX_VACC, '
    'avg(VaccinationPercentage) over(partition by Country) as AVG_VACC '
    'from VACCINATION_VW'
).show()

print(' >>>>> AGGREGATE Spark SQL  <<<<< ')
# NOTE: min / max / avg here are the pyspark.sql.functions imports at the top
# of the file (they shadow Python's builtins of the same name).
rank_window_spec = Window.partitionBy('Country')
(
    vaccines_df
    .withColumn('MIN_VACC', min('VaccinationPercentage').over(rank_window_spec))
    .withColumn('MAX_VACC', max('VaccinationPercentage').over(rank_window_spec))
    .withColumn('AVG_VACC', avg('VaccinationPercentage').over(rank_window_spec))
    .select('Country', 'MIN_VACC', 'MAX_VACC', 'AVG_VACC')
    .distinct()
    .show()
)


print(' >>>>>  WINDOW AGGREGATE FUNCTIONS  END <<<<< ')

