<a href="https://colab.research.google.com/github/TanishqLambhate/Data-Science-Training/blob/pyspark/pyspark_day_3_excercise_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pyspark

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col,rank
# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Advanced DataFrame Operations - Different Dataset") \
    .getOrCreate()

# Create two sample DataFrames for Product Sales
data1 = [
    (1, 'Product A', 'Electronics', 1200, '2022-05-10'),
    (2, 'Product B', 'Clothing', 500, '2022-07-15'),
    (3, 'Product C', 'Electronics', 1800, '2021-11-05')
]

data2 = [
    (4, 'Product D', 'Furniture', 3000, '2022-03-25'),
    (5, 'Product E', 'Clothing', 800, '2022-09-12'),
    (6, 'Product F', 'Electronics', 1500, '2021-10-19')
]

# Define schema (columns)
columns = ['ProductID', 'ProductName', 'Category', 'Price', 'SaleDate']

# Create DataFrames
sales_df1 = spark.createDataFrame(data1, columns)
sales_df2 = spark.createDataFrame(data2, columns)

In [5]:
# ### Tasks:

# 1. **Union of DataFrames (removing duplicates)**:
#    Combine the two DataFrames (`sales_df1` and `sales_df2`) using `union` and remove any duplicate rows.
union_df=sales_df1.union(sales_df2).dropDuplicates()
union_df.show()
# 2. **Union of DataFrames (including duplicates)**:
#    Combine both DataFrames using `unionAll` (replaced by `union`) and include duplicate rows.
union_all_df=sales_df1.union(sales_df2)
union_all_df.show()

+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        6|  Product F|Electronics| 1500|2021-10-19|
|        5|  Product E|   Clothing|  800|2022-09-12|
+---------+-----------+-----------+-----+----------+

+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        5|  Product E|   Clothing|  800|2022-09-12|
|        6|  Product F|Electronics| 1500|2021

In [8]:
# 3. **Rank products by price within their category**:
#    Use window functions to rank the products in each category by price in descending order.
window_spec=Window.partitionBy('Category').orderBy(col('Price').desc())
rank_df=union_all_df.withColumn('Rank',rank().over(window_spec))
rank_df.show()
# 4. **Calculate cumulative price per category**:
#    Use window functions to calculate the cumulative price of products within each category.
cummulative_df=union_all_df.withColumn('CumulativePrice',F.sum('Price').over(window_spec))
cummulative_df.show()

+---------+-----------+-----------+-----+----------+----+
|ProductID|ProductName|   Category|Price|  SaleDate|Rank|
+---------+-----------+-----------+-----+----------+----+
|        5|  Product E|   Clothing|  800|2022-09-12|   1|
|        2|  Product B|   Clothing|  500|2022-07-15|   2|
|        3|  Product C|Electronics| 1800|2021-11-05|   1|
|        6|  Product F|Electronics| 1500|2021-10-19|   2|
|        1|  Product A|Electronics| 1200|2022-05-10|   3|
|        4|  Product D|  Furniture| 3000|2022-03-25|   1|
+---------+-----------+-----------+-----+----------+----+

+---------+-----------+-----------+-----+----------+---------------+
|ProductID|ProductName|   Category|Price|  SaleDate|CumulativePrice|
+---------+-----------+-----------+-----+----------+---------------+
|        5|  Product E|   Clothing|  800|2022-09-12|            800|
|        2|  Product B|   Clothing|  500|2022-07-15|           1300|
|        3|  Product C|Electronics| 1800|2021-11-05|           1800|
|    

In [15]:
# 5. **Convert `SaleDate` from string to date type**:
#    Convert the `SaleDate` column from string format to a PySpark date type.
date_converted=union_all_df.withColumn('SaleDate',F.to_date(col('SaleDate'),"yyyy-mm-dd"))
date_converted.show()
# 6. **Calculate the number of days since each sale**:
#    Calculate the number of days since each product was sold using the current date.
current_date=F.current_date()
days_since_sale=union_all_df.withColumn('DaysSinceSale',F.datediff(current_date,col('SaleDate')))
days_since_sale.show()

+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-01-10|
|        2|  Product B|   Clothing|  500|2022-01-15|
|        3|  Product C|Electronics| 1800|2021-01-05|
|        4|  Product D|  Furniture| 3000|2022-01-25|
|        5|  Product E|   Clothing|  800|2022-01-12|
|        6|  Product F|Electronics| 1500|2021-01-19|
+---------+-----------+-----------+-----+----------+

+---------+-----------+-----------+-----+----------+-------------+
|ProductID|ProductName|   Category|Price|  SaleDate|DaysSinceSale|
+---------+-----------+-----------+-----+----------+-------------+
|        1|  Product A|Electronics| 1200|2022-05-10|          848|
|        2|  Product B|   Clothing|  500|2022-07-15|          782|
|        3|  Product C|Electronics| 1800|2021-11-05|         1034|
|        4|  Product D|  Furniture| 3000|2022-03-25|          894|


In [18]:
# 7. **Add a column for the next sale deadline**:
#    Add a new column `NextSaleDeadline`, which should be 30 days after the `SaleDate`.
next_sale_df=union_all_df.withColumn('NextSaleDeadline',F.date_add(col('SaleDate'),30))
next_sale_df.show()
# 8. **Calculate total revenue and average price per category**:
#    Find the total revenue (sum of prices) and the average price per category.
revenue_df=union_all_df.groupBy('Category').agg(F.sum('Price').alias('TotalRevenue'),F.avg('Price').alias('AveragePrice'))
revenue_df.show()

+---------+-----------+-----------+-----+----------+----------------+
|ProductID|ProductName|   Category|Price|  SaleDate|NextSaleDeadline|
+---------+-----------+-----------+-----+----------+----------------+
|        1|  Product A|Electronics| 1200|2022-05-10|      2022-06-09|
|        2|  Product B|   Clothing|  500|2022-07-15|      2022-08-14|
|        3|  Product C|Electronics| 1800|2021-11-05|      2021-12-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|      2022-04-24|
|        5|  Product E|   Clothing|  800|2022-09-12|      2022-10-12|
|        6|  Product F|Electronics| 1500|2021-10-19|      2021-11-18|
+---------+-----------+-----------+-----+----------+----------------+

+-----------+------------+------------+
|   Category|TotalRevenue|AveragePrice|
+-----------+------------+------------+
|Electronics|        4500|      1500.0|
|   Clothing|        1300|       650.0|
|  Furniture|        3000|      3000.0|
+-----------+------------+------------+



In [19]:
# 9. **Convert all product names to lowercase**:
#    Create a new column with all product names in lowercase.
lower_case_df=union_all_df.withColumn('ProductName',F.lower(col('ProductName')))
lower_case_df.show()

+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  product a|Electronics| 1200|2022-05-10|
|        2|  product b|   Clothing|  500|2022-07-15|
|        3|  product c|Electronics| 1800|2021-11-05|
|        4|  product d|  Furniture| 3000|2022-03-25|
|        5|  product e|   Clothing|  800|2022-09-12|
|        6|  product f|Electronics| 1500|2021-10-19|
+---------+-----------+-----------+-----+----------+

