In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("DD").master("local[*]").getOrCreate()

In [3]:
simpleData = [("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300)
  ]
schema = ["employee_name", "department", "salary"]
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
+-------------+----------+------+



In [9]:
from pyspark.sql.functions import *

#lets consider aggregate functions over the entire data

# count()
#If we want to get total number of rows in whole dataframe we use count() method of the dataframe
# if we want to get count of values present in a certain record we use count() method of functions class.

df.count() # this is not a transformation but an action

df.select(count("department")).show() #this is a transformation and it won't consider NULLs while counting like SQL

#now lets look at other agg functions
df.select(countDistinct("department")).show() # this will count the distinct values in a column

# Now see how to let first, last, min and max from a column
df.select(first("salary"),last("salary"),min("salary"),max("salary")).show()

#lets perform sum, avg and sumDistinct
df.select(sum("salary").alias("salary"),avg("salary"),sumDistinct("salary")).show()  # we can use alias to give different name in output

# we can also perform variance, co variance, standard deviation and much more operations

#countDistinct can be also done as
df.select("department").distinct().count()


+-----------------+
|count(department)|
+-----------------+
|                6|
+-----------------+

+--------------------------+
|count(DISTINCT department)|
+--------------------------+
|                         2|
+--------------------------+

+-------------+------------+-----------+-----------+
|first(salary)|last(salary)|min(salary)|max(salary)|
+-------------+------------+-----------+-----------+
|         3000|        3300|       3000|       4600|
+-------------+------------+-----------+-----------+

+------+-----------+--------------------+
|salary|avg(salary)|sum(DISTINCT salary)|
+------+-----------+--------------------+
| 21000|     3500.0|               15000|
+------+-----------+--------------------+



2

In [19]:
#in general countDistinct takes a very long time on large datasets, so when the data is very huge and getting perfect count value is not necessary we can use approx_count_distinct
#this function will increase the performance by huge amount

#it takes two parameter, one is col on which it performs count distinct and other is maximum estimation error allowed which is a float value

df.select(approx_count_distinct("salary",0.1)).show()
df.agg(approx_count_distinct("salary",0.1)).show()

#spark also provides agg on dataframe, where aggregation is performed on whole data.
#using agg is same as select


+-----------------------------+
|approx_count_distinct(salary)|
+-----------------------------+
|                            4|
+-----------------------------+

+-----------------------------+
|approx_count_distinct(salary)|
+-----------------------------+
|                            4|
+-----------------------------+



# groupBy
Thus far, we have performed only DataFrame-level aggregations. A more common task is to perform calculations based on groups in the data. This is typically done on categorical data for which we group our data on one column and perform some calculations on the other columns that end up in that group.
#We do this grouping in two phases. First we specify the column(s) on which we would like to group, and then we specify the aggregation(s). The first step returns a RelationalGroupedDataset, and the second step returns a DataFrame.

In [16]:
display(df.groupBy("salary")) # we can see that we get GroupedData
display(df.groupBy("department").sum("Salary")) # we can see that now it is dataframe. 


df.groupBy("department").sum("salary").show()
df.groupBy("department").max("salary").show() 
#similarly in the place of sum ,we can use max, min, avg etc.
#instead of calculating aggregation on group one at a time, we can use agg() to calculate multiple aggregate functions in single groupBY

df.groupBy("department").agg(sum("salary").alias("Sum"),avg("salary").alias("Average"),min("salary").alias("minimum")).show()

GroupedData[grouping expressions: [salary], value: [employee_name: string, department: string ... 1 more field], type: GroupBy]

DataFrame[department: string, sum(Salary): bigint]

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|     Sales|      14700|
|   Finance|       6300|
+----------+-----------+

+----------+-----------+
|department|max(salary)|
+----------+-----------+
|     Sales|       4600|
|   Finance|       3300|
+----------+-----------+

+----------+-----+-------+-------+
|department|  Sum|Average|minimum|
+----------+-----+-------+-------+
|     Sales|14700| 3675.0|   3000|
|   Finance| 6300| 3150.0|   3000|
+----------+-----+-------+-------+



# window functions

In [29]:
df.show()
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number,rank,lead,lag

#first we will declare the window and then use this inside over function
window_spec= Window.partitionBy("department").orderBy("salary")

df.withColumn("row_number",row_number().over(window_spec)).show()
df.withColumn("rank",rank().over(window_spec)).show()
#similarly we can use dense_rank, percent_Rank, ntile

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
|        Maria|   Finance|  3000|
|        James|     Sales|  3000|
|        Scott|   Finance|  3300|
+-------------+----------+------+

+-------------+----------+------+----------+
|employee_name|department|salary|row_number|
+-------------+----------+------+----------+
|        Maria|   Finance|  3000|         1|
|        Scott|   Finance|  3300|         2|
|        James|     Sales|  3000|         1|
|        James|     Sales|  3000|         2|
|       Robert|     Sales|  4100|         3|
|      Michael|     Sales|  4600|         4|
+-------------+----------+------+----------+

+-------------+----------+------+----+
|employee_name|department|salary|rank|
+-------------+----------+------+----+
|        Maria|   Finance|  3000|   1|
|        Scott|   Finance|  3300|   2|
|        Jame

In [40]:
# now lets look at value window functions such as lead,lag
window_spec2= Window.partitionBy(df.department).orderBy(df.salary)

df.withColumn("last",lead(df.salary,1).over(window_spec2)).show()
df.withColumn("lag",lag(df.salary,1).over(window_spec2)).show()

+-------------+----------+------+----+
|employee_name|department|salary|last|
+-------------+----------+------+----+
|        Maria|   Finance|  3000|3300|
|        Scott|   Finance|  3300|NULL|
|        James|     Sales|  3000|3000|
|        James|     Sales|  3000|4100|
|       Robert|     Sales|  4100|4600|
|      Michael|     Sales|  4600|NULL|
+-------------+----------+------+----+

+-------------+----------+------+----+
|employee_name|department|salary| lag|
+-------------+----------+------+----+
|        Maria|   Finance|  3000|NULL|
|        Scott|   Finance|  3300|3000|
|        James|     Sales|  3000|NULL|
|        James|     Sales|  3000|3000|
|       Robert|     Sales|  4100|3000|
|      Michael|     Sales|  4600|4100|
+-------------+----------+------+----+



In [42]:
# now lets see window functions for aggregate functions
from pyspark.sql.functions import max
df.withColumn("max",max(df.salary).over(window_spec2)).show()

# we see that max is not working properly this is because current frame size is unbounded preceeding to current row
#however we can change the frame size using rowsBetween

window_spec3= Window.partitionBy(df.department).orderBy(df.salary).rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
df.withColumn("max",max(df.salary).over(window_spec3)).show()
#now we see its working porperly

#similarly we can use other aggregate functions

+-------------+----------+------+----+
|employee_name|department|salary| max|
+-------------+----------+------+----+
|        Maria|   Finance|  3000|3000|
|        Scott|   Finance|  3300|3300|
|        James|     Sales|  3000|3000|
|        James|     Sales|  3000|3000|
|       Robert|     Sales|  4100|4100|
|      Michael|     Sales|  4600|4600|
+-------------+----------+------+----+

+-------------+----------+------+----+
|employee_name|department|salary| max|
+-------------+----------+------+----+
|        Maria|   Finance|  3000|3300|
|        Scott|   Finance|  3300|3300|
|        James|     Sales|  3000|4600|
|        James|     Sales|  3000|4600|
|       Robert|     Sales|  4100|4600|
|      Michael|     Sales|  4600|4600|
+-------------+----------+------+----+

