In [2]:
# Testing PySpark installation
import findspark

# Raw string: '\S' is not a valid escape sequence, so a plain literal only
# works by accident and triggers an invalid-escape warning on newer Pythons.
findspark.init(r'C:\Spark')

import pyspark

# Last expression of the cell: displays the Spark home that findspark located.
findspark.find()

'C:\\Spark'

In [3]:
import pyspark
from pyspark.sql import SparkSession

# Configure a local master ("local[1]") and an application name, then
# create the session or reuse one that already exists.
session_builder = SparkSession.builder.master("local[1]").appName('SparkByExamples.com')
spark = session_builder.getOrCreate()

# Display the session object as the cell output
spark

In [None]:
# Notes cell: the string below is only a checklist of the functions covered
# in this notebook. It was missing its closing quotes, which made the cell
# a syntax error.
'''
PySpark Aggregate Functions

    - approx_count_distinct
    - avg
    - collect_list
    - collect_set
    - countDistinct
    - count
    - grouping
    - first
    - last
    - kurtosis
    - max
    - min
    - mean
    - skewness
    - stddev
    - stddev_samp
    - stddev_pop
    - sum
    - sumDistinct
    - variance
    - var_samp
    - var_pop
'''

In [5]:
# Load the CSV file, with the "header" option switched on
csv_file = 'data/3.csv'

csv_reader = spark.read.option("header", True)
df = csv_reader.csv(csv_file)

# Preview the first 5 rows
df.show(5)

+---------+--------+----------+----------------+
|Item_Name|Quantity|Price/item|       Shop_Name|
+---------+--------+----------+----------------+
|Chocolate|      12|        20|      Deck Store|
|  Kurkure|      20|        25|Mooney ShopStore|
| Biscuits|       6|        45|      Deck Store|
|      Pen|       2|         5|     Shyam Store|
|   Sheets|       5|        10|     Shyam Store|
+---------+--------+----------+----------------+
only showing top 5 rows



In [6]:
# View the schema of the freshly loaded dataframe.
# At this point every column is reported as a string (see the output below).
df.printSchema()

root
 |-- Item_Name: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Price/item: string (nullable = true)
 |-- Shop_Name: string (nullable = true)



In [7]:
# Convert the 'Quantity' column to an integer type.
# Only 'Quantity' is cast here; the other columns are left as they are.
quantity_as_int = df["Quantity"].cast('int')
df = df.withColumn("Quantity", quantity_as_int)

In [8]:
# View the schema again to confirm the cast: 'Quantity' is now an integer,
# while 'Price/item' is still a string (see the output below).
df.printSchema()

root
 |-- Item_Name: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price/item: string (nullable = true)
 |-- Shop_Name: string (nullable = true)



In [9]:
# Only `col` is imported here; it references a column by name
from pyspark.sql.functions import col

# Total Cost = Quantity * Price/item for each row.
# NOTE(review): 'Price/item' is still a string column at this point; the
# displayed results (240.0, 500.0, ...) suggest the product is evaluated as a
# floating-point value. Confirm whether an explicit cast is wanted.
line_total = col("Quantity") * col("Price/item")
df_avg = df.withColumn("Total Cost", line_total)

# View dataframe
df_avg.show()

+---------+--------+----------+----------------+----------+
|Item_Name|Quantity|Price/item|       Shop_Name|Total Cost|
+---------+--------+----------+----------------+----------+
|Chocolate|      12|        20|      Deck Store|     240.0|
|  Kurkure|      20|        25|Mooney ShopStore|     500.0|
| Biscuits|       6|        45|      Deck Store|     270.0|
|      Pen|       2|         5|     Shyam Store|      10.0|
|   Sheets|       5|        10|     Shyam Store|      50.0|
|    Novel|      10|       100|Mooney ShopStore|    1000.0|
+---------+--------+----------+----------------+----------+



In [10]:
# avg: average of the "Total Cost" column over the whole dataframe
from pyspark.sql.functions import avg

average_cost = avg("Total Cost").alias("Average Cost")
df_avg.select(average_cost).show()

+------------+
|Average Cost|
+------------+
|       345.0|
+------------+



In [11]:
# sum: total of the "Quantity" column over the whole dataframe.
#
# Imported under an alias on purpose: `from pyspark.sql.functions import sum`
# would replace Python's builtin sum() for every cell that runs afterwards.
from pyspark.sql.functions import sum as spark_sum

df_avg.select(spark_sum("Quantity").alias("Total Items")).show()

+-----------+
|Total Items|
+-----------+
|         55|
+-----------+



In [12]:
# groupBy: total quantity per shop
quantity_by_shop = df.groupBy("Shop_Name").sum("Quantity")
quantity_by_shop.show()

+----------------+-------------+
|       Shop_Name|sum(Quantity)|
+----------------+-------------+
|     Shyam Store|            7|
|Mooney ShopStore|           30|
|      Deck Store|           18|
+----------------+-------------+



In [13]:
# Question: Calculate the average amount earned by each shop store.

# Step 1: add the per-row total (Quantity * Price/item) as "Total Cost"
row_total = col("Quantity") * col("Price/item")
df_avg = df.withColumn("Total Cost", row_total)
df_avg.show()

# Step 2: average "Total Cost" within each shop
avg_cost_by_shop = df_avg.groupBy("Shop_Name").avg("Total Cost")
avg_cost_by_shop.show()

+---------+--------+----------+----------------+----------+
|Item_Name|Quantity|Price/item|       Shop_Name|Total Cost|
+---------+--------+----------+----------------+----------+
|Chocolate|      12|        20|      Deck Store|     240.0|
|  Kurkure|      20|        25|Mooney ShopStore|     500.0|
| Biscuits|       6|        45|      Deck Store|     270.0|
|      Pen|       2|         5|     Shyam Store|      10.0|
|   Sheets|       5|        10|     Shyam Store|      50.0|
|    Novel|      10|       100|Mooney ShopStore|    1000.0|
+---------+--------+----------+----------------+----------+

+----------------+---------------+
|       Shop_Name|avg(Total Cost)|
+----------------+---------------+
|     Shyam Store|           30.0|
|Mooney ShopStore|          750.0|
|      Deck Store|          255.0|
+----------------+---------------+



In [17]:
# Question: What is the largest quantity purchased from each shop store?

max_quantity_by_shop = df.groupBy("Shop_Name").max("Quantity")
max_quantity_by_shop.show()

+----------------+-------------+
|       Shop_Name|max(Quantity)|
+----------------+-------------+
|     Shyam Store|            5|
|Mooney ShopStore|           20|
|      Deck Store|           12|
+----------------+-------------+



In [38]:
# max: largest value of the "Quantity" column over the whole dataframe.
#
# Imported under an alias on purpose: `from pyspark.sql.functions import max`
# would replace Python's builtin max() for every cell that runs afterwards.
from pyspark.sql.functions import max as spark_max

df.select(spark_max("Quantity").alias("Maximum Quantity")).show()

+----------------+
|Maximum Quantity|
+----------------+
|              20|
+----------------+



In [19]:
# min: smallest "Quantity" value within each shop

min_quantity_by_shop = df.groupBy("Shop_Name").min("Quantity")
min_quantity_by_shop.show()

+----------------+-------------+
|       Shop_Name|min(Quantity)|
+----------------+-------------+
|     Shyam Store|            2|
|Mooney ShopStore|           10|
|      Deck Store|            6|
+----------------+-------------+



In [37]:
# count: number of entries in the "Quantity" column

from pyspark.sql.functions import count

quantity_count = count("Quantity")
df.select(quantity_count).show()

+---------------+
|count(Quantity)|
+---------------+
|              6|
+---------------+



In [21]:
# distinct: keep each shop name only once

shop_names = df.select(df["Shop_Name"])
shop_names.distinct().show()

+----------------+
|       Shop_Name|
+----------------+
|     Shyam Store|
|Mooney ShopStore|
|      Deck Store|
+----------------+



In [39]:
# collect_list: gather every value of a column into one list, duplicates kept

from pyspark.sql.functions import collect_list

all_shop_names = collect_list("Shop_Name")
df.select(all_shop_names).show(truncate=False)

+--------------------------------------------------------------------------------------+
|collect_list(Shop_Name)                                                               |
+--------------------------------------------------------------------------------------+
|[Deck Store, Mooney ShopStore, Deck Store, Shyam Store, Shyam Store, Mooney ShopStore]|
+--------------------------------------------------------------------------------------+



In [40]:
# collect_set: gather the values of a column into one list, duplicates removed
from pyspark.sql.functions import collect_set

unique_shop_names = collect_set("Shop_Name")
df.select(unique_shop_names).show(truncate=False)

+-------------------------------------------+
|collect_set(Shop_Name)                     |
+-------------------------------------------+
|[Deck Store, Shyam Store, Mooney ShopStore]|
+-------------------------------------------+



In [24]:
# Average of the 'Price/item' column, pulled out of the one-row result
# and printed as plain text (uses `avg` imported in an earlier cell)

avg_rows = df.select(avg('Price/item')).collect()
avg_price = avg_rows[0][0]
print("Avg:" + str(avg_price))

Avg:34.166666666666664


In [41]:
# countDistinct: number of different shop names
from pyspark.sql.functions import countDistinct

distinct_shop_count = countDistinct("Shop_Name")
df.select(distinct_shop_count).show()

+-------------------------+
|count(DISTINCT Shop_Name)|
+-------------------------+
|                        3|
+-------------------------+



In [42]:
# first: apply first() to the "Shop_Name" column
from pyspark.sql.functions import first

first_shop = first("Shop_Name")
df.select(first_shop).show(truncate=False)

+----------------+
|first(Shop_Name)|
+----------------+
|Deck Store      |
+----------------+



In [None]:
# last: apply last() to the "Shop_Name" column.
# The import was missing: no other cell brings `last` into scope, so the
# call below raised NameError.
from pyspark.sql.functions import last

df.select(last("Shop_Name")).show(truncate=False)

In [27]:
# Display the dataframe again as a reference for the statistics cells below
df.show()

+---------+--------+----------+----------------+
|Item_Name|Quantity|Price/item|       Shop_Name|
+---------+--------+----------+----------------+
|Chocolate|      12|        20|      Deck Store|
|  Kurkure|      20|        25|Mooney ShopStore|
| Biscuits|       6|        45|      Deck Store|
|      Pen|       2|         5|     Shyam Store|
|   Sheets|       5|        10|     Shyam Store|
|    Novel|      10|       100|Mooney ShopStore|
+---------+--------+----------+----------------+



In [43]:
# kurtosis of the 'Quantity' column
from pyspark.sql.functions import kurtosis

quantity_kurtosis = kurtosis('Quantity')
df.select(quantity_kurtosis).show(truncate=False)

+-------------------+
|kurtosis(Quantity) |
+-------------------+
|-0.5862989683145523|
+-------------------+



In [44]:
# mean of the 'Quantity' column (shown in the output as avg(Quantity))
from pyspark.sql.functions import mean

quantity_mean = mean('Quantity')
df.select(quantity_mean).show(truncate=False)

+-----------------+
|avg(Quantity)    |
+-----------------+
|9.166666666666666|
+-----------------+



In [45]:
# skewness of the 'Quantity' column
from pyspark.sql.functions import skewness

quantity_skewness = skewness('Quantity')
df.select(quantity_skewness).show(truncate=False)

+------------------+
|skewness(Quantity)|
+------------------+
|0.6872898280823618|
+------------------+



In [46]:
# Standard deviation of "Quantity", three ways:
#   stddev      - appears in the output under the stddev_samp name
#   stddev_samp - sample standard deviation of the values in a column
#   stddev_pop  - population standard deviation of the values in a column

from pyspark.sql.functions import stddev, stddev_samp, stddev_pop

deviation_columns = [
    stddev("Quantity"),
    stddev_samp("Quantity"),
    stddev_pop("Quantity"),
]
df.select(*deviation_columns).show(truncate=False)


+---------------------+---------------------+--------------------+
|stddev_samp(Quantity)|stddev_samp(Quantity)|stddev_pop(Quantity)|
+---------------------+---------------------+--------------------+
|6.400520812142295    |6.400520812142295    |5.842849380986035   |
+---------------------+---------------------+--------------------+



In [47]:
# sumDistinct: sum of the distinct values in the "Quantity" column
# NOTE(review): newer PySpark releases document `sum_distinct` as the
# preferred name; confirm against the installed version before switching.
from pyspark.sql.functions import sumDistinct

distinct_quantity_total = sumDistinct("Quantity")
df.select(distinct_quantity_total).show(truncate=False)

+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|55                    |
+----------------------+



In [48]:
# Variance of "Quantity", three ways:
#   variance - appears in the output under the var_samp name
#   var_samp - unbiased (sample) variance of the values in a column
#   var_pop  - population variance of the values in a column

from pyspark.sql.functions import variance, var_samp, var_pop

variance_columns = [
    variance("Quantity"),
    var_samp("Quantity"),
    var_pop("Quantity"),
]
df.select(*variance_columns).show(truncate=False)

+------------------+------------------+-----------------+
|var_samp(Quantity)|var_samp(Quantity)|var_pop(Quantity)|
+------------------+------------------+-----------------+
|40.96666666666667 |40.96666666666667 |34.13888888888889|
+------------------+------------------+-----------------+

