In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=f7e17018233261d37d3aa56db02d56b4c911d22407ab606529f2531d197f2807
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Build the Spark session for this notebook (getOrCreate returns the builder's session)
spark = (
    SparkSession.builder
    .appName("Advanced DataFrame Operations - Different Dataset")
    .getOrCreate()
)

# Column names shared by both sample DataFrames
columns = ["ProductID", "ProductName", "Category", "Price", "SaleDate"]

# Product sales sample rows: one tuple per product, fields in `columns` order.
# SaleDate is kept as a 'yyyy-MM-dd' string at this stage.
data1 = [
    (1, "Product A", "Electronics", 1200, "2022-05-10"),
    (2, "Product B", "Clothing", 500, "2022-07-15"),
    (3, "Product C", "Electronics", 1800, "2021-11-05"),
]
data2 = [
    (4, "Product D", "Furniture", 3000, "2022-03-25"),
    (5, "Product E", "Clothing", 800, "2022-09-12"),
    (6, "Product F", "Electronics", 1500, "2021-10-19"),
]

# Materialize each batch of rows as its own DataFrame
sales_df1 = spark.createDataFrame(data1, schema=columns)
sales_df2 = spark.createDataFrame(data2, schema=columns)

In [None]:
# Task 1: Combine both sales DataFrames and keep each distinct row once
union_df_distinct = (
    sales_df1
    .union(sales_df2)  # stack the rows of the second frame under the first
    .distinct()        # discard rows that are exact duplicates
)
union_df_distinct.show()


+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        6|  Product F|Electronics| 1500|2021-10-19|
|        5|  Product E|   Clothing|  800|2022-09-12|
+---------+-----------+-----------+-----+----------+



In [None]:
# Task 2: Combine both sales DataFrames, keeping duplicate rows.
# No distinct() is applied here, so repeated rows would be preserved as-is.
union_df_all = (
    sales_df1
    .union(sales_df2)
)
union_df_all.show()


+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        5|  Product E|   Clothing|  800|2022-09-12|
|        6|  Product F|Electronics| 1500|2021-10-19|
+---------+-----------+-----------+-----+----------+



In [None]:
# Task 3: Rank products by price within their category.
# Each Category is its own partition; inside it the most expensive product
# comes first and therefore receives rank 1.
window_spec_rank = (
    Window
    .partitionBy("Category")
    .orderBy(F.desc("Price"))
)
ranked_df = union_df_all.withColumn(
    "Rank",
    F.rank().over(window_spec_rank),
)
ranked_df.show()

+---------+-----------+-----------+-----+----------+----+
|ProductID|ProductName|   Category|Price|  SaleDate|Rank|
+---------+-----------+-----------+-----+----------+----+
|        5|  Product E|   Clothing|  800|2022-09-12|   1|
|        2|  Product B|   Clothing|  500|2022-07-15|   2|
|        3|  Product C|Electronics| 1800|2021-11-05|   1|
|        6|  Product F|Electronics| 1500|2021-10-19|   2|
|        1|  Product A|Electronics| 1200|2022-05-10|   3|
|        4|  Product D|  Furniture| 3000|2022-03-25|   1|
+---------+-----------+-----------+-----+----------+----+



In [None]:
# Task 4: Calculate Cumulative Price per Category
# Running total of Price within each Category, accumulated from the cheapest
# product upward.
#
# With only an orderBy, Spark's default window frame is RANGE-based, so rows
# that tie on Price would be summed together and all show the same total.
# The explicit ROWS frame below adds exactly one row at a time, and ProductID
# is a secondary sort key so the order among equal prices is deterministic.
window_spec_cum_sum = (
    Window.partitionBy("Category")
    .orderBy("Price", "ProductID")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
cumulative_df = union_df_all.withColumn(
    "CumulativePrice",
    F.sum("Price").over(window_spec_cum_sum),
)
cumulative_df.show()

+---------+-----------+-----------+-----+----------+---------------+
|ProductID|ProductName|   Category|Price|  SaleDate|CumulativePrice|
+---------+-----------+-----------+-----+----------+---------------+
|        2|  Product B|   Clothing|  500|2022-07-15|            500|
|        5|  Product E|   Clothing|  800|2022-09-12|           1300|
|        1|  Product A|Electronics| 1200|2022-05-10|           1200|
|        6|  Product F|Electronics| 1500|2021-10-19|           2700|
|        3|  Product C|Electronics| 1800|2021-11-05|           4500|
|        4|  Product D|  Furniture| 3000|2022-03-25|           3000|
+---------+-----------+-----------+-----+----------+---------------+



In [None]:
# Task 5: Convert SaleDate from string to date type.
# The column is replaced in place (same name), parsed with the yyyy-MM-dd pattern.
date_converted_df = union_df_all.withColumn(
    "SaleDate",
    F.to_date(F.col("SaleDate"), "yyyy-MM-dd"),
)
date_converted_df.show()

+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        5|  Product E|   Clothing|  800|2022-09-12|
|        6|  Product F|Electronics| 1500|2021-10-19|
+---------+-----------+-----------+-----+----------+



In [None]:
# Task 6: Calculate the number of days since each sale.
# The reference point is current_date(), so the values depend on the day
# this cell is executed.
days_since_sale_df = date_converted_df.withColumn(
    "DaysSinceSale",
    F.datediff(F.current_date(), F.col("SaleDate")),
)
days_since_sale_df.show()

+---------+-----------+-----------+-----+----------+-------------+
|ProductID|ProductName|   Category|Price|  SaleDate|DaysSinceSale|
+---------+-----------+-----------+-----+----------+-------------+
|        1|  Product A|Electronics| 1200|2022-05-10|          848|
|        2|  Product B|   Clothing|  500|2022-07-15|          782|
|        3|  Product C|Electronics| 1800|2021-11-05|         1034|
|        4|  Product D|  Furniture| 3000|2022-03-25|          894|
|        5|  Product E|   Clothing|  800|2022-09-12|          723|
|        6|  Product F|Electronics| 1500|2021-10-19|         1051|
+---------+-----------+-----------+-----+----------+-------------+



In [None]:
# Task 7: Add a column for the next sale deadline.
# The deadline is the sale date shifted forward by 30 days.
next_sale_deadline_df = date_converted_df.withColumn(
    "NextSaleDeadline",
    F.date_add(F.col("SaleDate"), 30),
)
next_sale_deadline_df.show()

+---------+-----------+-----------+-----+----------+----------------+
|ProductID|ProductName|   Category|Price|  SaleDate|NextSaleDeadline|
+---------+-----------+-----------+-----+----------+----------------+
|        1|  Product A|Electronics| 1200|2022-05-10|      2022-06-09|
|        2|  Product B|   Clothing|  500|2022-07-15|      2022-08-14|
|        3|  Product C|Electronics| 1800|2021-11-05|      2021-12-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|      2022-04-24|
|        5|  Product E|   Clothing|  800|2022-09-12|      2022-10-12|
|        6|  Product F|Electronics| 1500|2021-10-19|      2021-11-18|
+---------+-----------+-----------+-----+----------+----------------+



In [None]:
# Task 8: Calculate total revenue and average price per category.
# One output row per Category, with the sum and the mean of Price.
revenue_avg_df = (
    union_df_all
    .groupBy("Category")
    .agg(
        F.sum(F.col("Price")).alias("TotalRevenue"),
        F.avg(F.col("Price")).alias("AveragePrice"),
    )
)
revenue_avg_df.show()

+-----------+------------+------------+
|   Category|TotalRevenue|AveragePrice|
+-----------+------------+------------+
|Electronics|        4500|      1500.0|
|   Clothing|        1300|       650.0|
|  Furniture|        3000|      3000.0|
+-----------+------------+------------+



In [None]:
# Task 9: Convert all product names to lowercase.
# The lowercased value goes into a new ProductNameLower column; the original
# ProductName column is left unchanged.
lowercase_names_df = union_df_all.withColumn(
    "ProductNameLower",
    F.lower(F.col("ProductName")),
)
lowercase_names_df.show()

+---------+-----------+-----------+-----+----------+----------------+
|ProductID|ProductName|   Category|Price|  SaleDate|ProductNameLower|
+---------+-----------+-----------+-----+----------+----------------+
|        1|  Product A|Electronics| 1200|2022-05-10|       product a|
|        2|  Product B|   Clothing|  500|2022-07-15|       product b|
|        3|  Product C|Electronics| 1800|2021-11-05|       product c|
|        4|  Product D|  Furniture| 3000|2022-03-25|       product d|
|        5|  Product E|   Clothing|  800|2022-09-12|       product e|
|        6|  Product F|Electronics| 1500|2021-10-19|       product f|
+---------+-----------+-----------+-----+----------+----------------+

