In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=f3e330dc3d3365f081b5373fda1af2065d258fce9150ab1253aa248747101123
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Start (or reuse) the Spark session that backs every DataFrame in this notebook
spark = (
    SparkSession.builder
    .appName("Advanced DataFrame Operations - Different Dataset")
    .getOrCreate()
)

# Column names shared by both batches of product-sales records
columns = ['ProductID', 'ProductName', 'Category', 'Price', 'SaleDate']

# First batch of product sales: (id, name, category, price, sale date as text)
data1 = [
    (1, 'Product A', 'Electronics', 1200, '2022-05-10'),
    (2, 'Product B', 'Clothing', 500, '2022-07-15'),
    (3, 'Product C', 'Electronics', 1800, '2021-11-05'),
]

# Second batch of product sales, same layout as the first
data2 = [
    (4, 'Product D', 'Furniture', 3000, '2022-03-25'),
    (5, 'Product E', 'Clothing', 800, '2022-09-12'),
    (6, 'Product F', 'Electronics', 1500, '2021-10-19'),
]

# Build one DataFrame per batch; column types are inferred from the tuples
sales_df1 = spark.createDataFrame(data1, schema=columns)
sales_df2 = spark.createDataFrame(data2, schema=columns)

# Display both DataFrames, first batch then second
for sales_batch in (sales_df1, sales_df2):
    sales_batch.show()

+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
+---------+-----------+-----------+-----+----------+

+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        5|  Product E|   Clothing|  800|2022-09-12|
|        6|  Product F|Electronics| 1500|2021-10-19|
+---------+-----------+-----------+-----+----------+



In [9]:
# Tasks: set operations, window functions, date handling and aggregation
# on the two product-sales DataFrames (`sales_df1` and `sales_df2`).

# 1. Union of DataFrames (removing duplicates)
# `union` appends rows by column position and keeps duplicates, so
# `dropDuplicates` is chained to discard rows that are identical in every column.
combined_df = sales_df1.union(sales_df2).dropDuplicates()
print("Union of dataframes (removing duplicates)")
combined_df.show()

# 2. Union of DataFrames (including duplicates)
# `unionAll` is an alias of `union`: rows are appended without deduplication.
combined_all_df = sales_df1.unionAll(sales_df2)
print("Union of dataframes (including duplicates): ")
combined_all_df.show()

# 3. Rank products by price within their category
# Products are ordered by price, highest first, inside each category.
# ProductID is a secondary sort key so that products with equal prices are
# numbered reproducibly; ordering by Price alone leaves ties in arbitrary order.
window_spec = Window.partitionBy('Category').orderBy(F.desc('Price'), F.asc('ProductID'))
ranked_df = combined_df.withColumn('Rank', F.row_number().over(window_spec))
print("Rank of the products in descending order")
ranked_df.show()

# 4. Calculate cumulative price per category
# The frame is spelled out as ROWS from the start of the partition to the
# current row. The default frame for an ordered window is RANGE-based, which
# gives every row with an equal sort key the same running total.
running_total_window = window_spec.rowsBetween(Window.unboundedPreceding, Window.currentRow)
sales_df_cumulative = combined_df.withColumn('CumulativePrice', F.sum('Price').over(running_total_window))
print("Cumulative price of products per each category: ")
sales_df_cumulative.show()

# 5. Convert SaleDate from string to date type
# SaleDate is created as text; parse it once with an explicit format and reuse
# the same expression in tasks 6 and 7 instead of relying on implicit casts.
sale_date_col = F.to_date('SaleDate', 'yyyy-MM-dd')
sales_df_date_converted = combined_df.withColumn('SaleDate', sale_date_col)
print("Saledate from string to date type: ")
sales_df_date_converted.show()

# 6. Calculate the number of days since each sale
# Days elapsed between the sale date and today's date; the result changes
# depending on the day the notebook is run.
sales_df_days_since = combined_df.withColumn('DaysSinceSale', F.datediff(F.current_date(), sale_date_col))
print("No.of days since each product sold: ")
sales_df_days_since.show()

# 7. Add a column for the next sale deadline
# NextSaleDeadline is 30 days after the SaleDate.
sales_df_next_deadline = combined_df.withColumn('NextSaleDeadline', F.date_add(sale_date_col, 30))
print("Add a column for the next sale deadline: ")
sales_df_next_deadline.show()

# 8. Calculate total revenue and average price per category
# Total revenue is the sum of prices; both aggregates are computed in one pass.
total_revenue_df = combined_df.groupBy('Category').agg(
    F.sum('Price').alias('TotalRevenue'),
    F.avg('Price').alias('AveragePrice')
)
print("Total revenue and average price per category: ")
total_revenue_df.show()

# 9. Convert all product names to lowercase
# The original ProductName column is kept; the lowercase copy is added beside it.
sales_df_lowercase = combined_df.withColumn('ProductNameLower', F.lower('ProductName'))
print("Convert all product names to lowercase: ")
sales_df_lowercase.show()

Union of dataframes (removing duplicates)
+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        6|  Product F|Electronics| 1500|2021-10-19|
|        5|  Product E|   Clothing|  800|2022-09-12|
+---------+-----------+-----------+-----+----------+

Union of dataframes (including duplicates): 
+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        5|