#PySpark Practice Notebook

In [None]:
#Source Data
#Create a csv file for the following data
#(the rows are kept as comments so this code cell is valid Python and does not
# stop a "Restart & Run All")
#product_id,product,country,sales
#1,Product A,USA,100
#2,Product B,USA,80
#3,Product C,USA,30
#1,Product A,Canada,60
#2,Product B,Canada,40
#4,Product D,UK,50
#5,Product E,UK,20
#1,Product A,Germany,70
#3,Product C,Germany,90
#4,Product D,Germany,40

In [2]:
#Import Pyspark & other necessary functions
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=2f96ca1235c8a534c1f4b20129370c14a1c9ffeb9b53fe3a1b5ab2653d11d0ba
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [4]:
#Start (or reuse) the SparkSession for the app "Sales Data Analysis"
from pyspark.sql import SparkSession

spark_builder = SparkSession.builder.appName("Sales Data Analysis")
ps_spark = spark_builder.getOrCreate()

In [6]:
%%writefile product.csv
product_id,product,country,sales
1,Product A,USA,100
2,Product B,USA,80
3,Product C,USA,30
1,Product A,Canada,60
2,Product B,Canada,40
4,Product D,UK,50
5,Product E,UK,20
1,Product A,Germany,70
3,Product C,Germany,90
4,Product D,Germany,40


Writing product.csv


In [9]:
#Create a pyspark DataFrame from the csv file written by the %%writefile cell.
#The file is read through the same relative path it was written to, so the cell
#does not depend on the working directory being /content.
#Only the header option is set, so every column is loaded as a string.
ps_df=ps_spark.read.option("header","True").csv("product.csv")

In [10]:
#Verify the schema of the DataFrame that was just loaded from the csv file
ps_df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- country: string (nullable = true)
 |-- sales: string (nullable = true)



In [11]:
#Show every row whose country is "Germany"
is_germany = ps_df["country"] == "Germany"
ps_df.filter(is_germany).show()

+----------+---------+-------+-----+
|product_id|  product|country|sales|
+----------+---------+-------+-----+
|         1|Product A|Germany|   70|
|         3|Product C|Germany|   90|
|         4|Product D|Germany|   40|
+----------+---------+-------+-----+



In [19]:
#Count the distinct products, then list them
df3 = ps_df.select("Product").distinct()
product_count = df3.count()
print(product_count)
df3.show()

5
+---------+
|  Product|
+---------+
|Product A|
|Product B|
|Product C|
|Product D|
|Product E|
+---------+



In [29]:
#Find top 3 products by total sales
product_totals = ps_df.groupBy("Product").agg({"Sales":"sum"})
#Sort by the summed sales, highest first: without an explicit sort, show(3)
#just prints three arbitrary groups rather than the three best sellers.
product_totals.orderBy(product_totals["sum(Sales)"].desc()).show(3)

+---------+----------+
|  Product|sum(Sales)|
+---------+----------+
|Product A|     230.0|
|Product B|     120.0|
|Product C|     120.0|
+---------+----------+
only showing top 3 rows



In [28]:
#Calculate total sales for each product
sales_by_product = ps_df.groupby("Product").agg({"Sales": "sum"})
sales_by_product.show()

+---------+----------+
|  Product|sum(Sales)|
+---------+----------+
|Product A|     230.0|
|Product B|     120.0|
|Product C|     120.0|
|Product D|      90.0|
|Product E|      20.0|
+---------+----------+



In [40]:
#Find the market share of all products individually
#Step 1: sum the sales per product and give the aggregate column a readable name
tot = ps_df.groupby("Product").agg({"Sales": "sum"})
x = tot.withColumnRenamed("sum(Sales)", "Total_Sales")
x.show()

+---------+-----------+
|  Product|Total_Sales|
+---------+-----------+
|Product A|      230.0|
|Product B|      120.0|
|Product C|      120.0|
|Product D|       90.0|
|Product E|       20.0|
+---------+-----------+



In [50]:
#Step 2: grand total of sales across all products, collected to the driver as a scalar
grand_total_row = x.agg({"Total_Sales": "sum"}).collect()[0]
t = grand_total_row[0]
t

580.0

In [56]:
#Step 3: market share of each product = its total sales divided by the grand total
share_col = x["Total_Sales"] / t
y = x.withColumn("market_share", share_col)
y.show()

+---------+-----------+--------------------+
|  Product|Total_Sales|        market_share|
+---------+-----------+--------------------+
|Product A|      230.0| 0.39655172413793105|
|Product B|      120.0| 0.20689655172413793|
|Product C|      120.0| 0.20689655172413793|
|Product D|       90.0| 0.15517241379310345|
|Product E|       20.0|0.034482758620689655|
+---------+-----------+--------------------+

