In [0]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
from pyspark.sql.types import *



spark = SparkSession.builder.appName("read").getOrCreate()

My file path is : 

dbfs:/FileStore/shared_uploads/timilsina.ra@northeastern.edu/Products.csv

In [0]:
schema = """ TransactionId int, Product string , Category string , Amount int , Date string , Region string """



df= spark.read.format("csv").schema(schema).option("header", True).load("dbfs:/FileStore/shared_uploads/timilsina.ra@northeastern.edu/Products.csv")

df.display()

TransactionId,Product,Category,Amount,Date,Region
1,Laptop,Electronics,800,2024-12-01,North
2,Smartphone,Electronics,500,2024-12-02,South
3,Tablet,Electronics,300,2024-12-02,West
4,Laptop,Electronics,850,2024-12-03,East
5,Headphones,Accessories,150,2024-12-03,North
6,Keyboard,Accessories,100,2024-12-04,South
7,Mouse,Accessories,50,2024-12-04,West
8,Smartphone,Electronics,600,2024-12-05,North
9,Tablet,Electronics,350,2024-12-05,East
10,Headphones,Accessories,200,2024-12-06,South


In [0]:
#Typecasting date 

df.withColumn("Date", col("Date").cast("date"))
df.printSchema()
df.display()


root
 |-- TransactionId: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Region: string (nullable = true)



TransactionId,Product,Category,Amount,Date,Region
1,Laptop,Electronics,800,2024-12-01,North
2,Smartphone,Electronics,500,2024-12-02,South
3,Tablet,Electronics,300,2024-12-02,West
4,Laptop,Electronics,850,2024-12-03,East
5,Headphones,Accessories,150,2024-12-03,North
6,Keyboard,Accessories,100,2024-12-04,South
7,Mouse,Accessories,50,2024-12-04,West
8,Smartphone,Electronics,600,2024-12-05,North
9,Tablet,Electronics,350,2024-12-05,East
10,Headphones,Accessories,200,2024-12-06,South


In [0]:
# Filter all records with amount less than 100 

df_filtered = df.filter( col("Amount") >= 100)
df_filtered.display()


TransactionId,Product,Category,Amount,Date,Region
1,Laptop,Electronics,800,2024-12-01,North
2,Smartphone,Electronics,500,2024-12-02,South
3,Tablet,Electronics,300,2024-12-02,West
4,Laptop,Electronics,850,2024-12-03,East
5,Headphones,Accessories,150,2024-12-03,North
6,Keyboard,Accessories,100,2024-12-04,South
8,Smartphone,Electronics,600,2024-12-05,North
9,Tablet,Electronics,350,2024-12-05,East
10,Headphones,Accessories,200,2024-12-06,South


In [0]:
# Find the total sales for each category 

df_total_by_category = df_filtered.groupBy("Category").agg(sum("Amount").alias("Sale by Category"))
df_total_by_category.display()

Category,Sale by Category
Electronics,3400
Accessories,450


In [0]:
#Calculate total_sales for each region 

df_total_by_region = df_filtered.groupBy("Region").agg(sum("Amount").alias("Total Sales by Region"))
df_total_by_region.display()

Region,Total Sales by Region
South,800
East,1200
West,300
North,1550


In [0]:
# Identify product in highest sales in electronic category
from pyspark.sql.window import Window

window_spec = Window.partitionBy("Category").orderBy(desc("Amount"))
df_ranked = df_filtered.withColumn("rank", dense_rank().over(window_spec))
df_filtered_result = df_ranked.filter((col("Category") == "Electronics") & (col("rank") == 1))
df_filtered_result.select("Product", "Amount").display()


Product,Amount
Laptop,850


In [0]:
#Alternative Way 
df_highest_sale_electronic = df_filtered.filter(col("Category") == "Electronics").orderBy(desc("Amount"))
df_highest_sale_electronic.display()

df_result = df_highest_sale_electronic.select("Product", "Amount").limit(1)
df_result.display()


TransactionId,Product,Category,Amount,Date,Region
4,Laptop,Electronics,850,2024-12-03,East
1,Laptop,Electronics,800,2024-12-01,North
8,Smartphone,Electronics,600,2024-12-05,North
2,Smartphone,Electronics,500,2024-12-02,South
9,Tablet,Electronics,350,2024-12-05,East
3,Tablet,Electronics,300,2024-12-02,West


Product,Amount
Laptop,850


In [0]:

# count number of transactions per region
df_count = df_filtered.groupBy("Region").agg(count("TransactionId").alias("Num of Transactions"))
df_count.display()


Region,Num of Transactions
South,3
East,2
West,1
North,3


In [0]:
df_write = df_count.write.format("csv").mode("append").save("FileStore/shared_uploads/timilsina.ra@northeastern.edu/MyOutput3")


Union and Union All in Data Bricks 


DataFrame unionAll() – unionAll() is deprecated since Spark “2.0.0” version and replaced with union()

In [0]:
data1 = [(1, "Alice"), (2, "Bob"), (3, "Charlie")] 
data2 = [(2, "Bob"), (3, "Charlie"), (4, "David")] 
data3= data3 = [(1, "Alice"), (2, "Bob"), (3, "Charlie")]
schema = """id int, name string"""
df_1= spark.createDataFrame(data1, schema)
df_1.display()

df_2= spark.createDataFrame(data2, schema)
df_2.display()

df_3= spark.createDataFrame(data3,schema)
df_3.display()

id,name
1,Alice
2,Bob
3,Charlie


id,name
2,Bob
3,Charlie
4,David


id,name
1,Alice
2,Bob
3,Charlie


In [0]:
df_1.union(df_2).display()

df_1.unionAll(df_2).display()

id,name
1,Alice
2,Bob
3,Charlie
2,Bob
3,Charlie
4,David


id,name
1,Alice
2,Bob
3,Charlie
2,Bob
3,Charlie
4,David


In [0]:
df_union = df_1.union(df_3)
df_unionall = df_1.unionAll(df_3)
df_union.dropDuplicates().display()
df_unionall.dropDuplicates().display()

id,name
1,Alice
2,Bob
3,Charlie


id,name
1,Alice
2,Bob
3,Charlie
