In [1]:
pip install pyspark



In [2]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# Obtain the SparkSession shared by every cell below (reuses an existing
# session if one is already active, otherwise starts a new one).
spark = (
    SparkSession.builder
    .appName("Day 5")
    .getOrCreate()
)

Problem 1: PySpark – Calculate Percentage Contribution of Each Product

Problem Statement
You have a PySpark DataFrame with total sales for different products. Write a PySpark program to calculate the percentage contribution of each product towards the total sales across all products.

In [3]:
# Sample input for Problem 1: one (product, sales) record per product.
columns = ["product", "sales"]
data = [("Laptop", 1200), ("Phone", 800), ("Tablet", 500), ("Desktop", 500)]

df = spark.createDataFrame(data=data, schema=columns)

In [6]:
# Grand total of the "sales" column, pulled to the driver as a plain Python
# number so it can be used as a constant divisor below.
total_sales = df.agg(F.sum("sales").alias("total_sales")).first()["total_sales"]

# Each product's share of the grand total, as a percentage rounded to one
# decimal place. The DataFrame is kept in df_result and displayed separately:
# DataFrame.show() returns None, so chaining it onto the assignment would
# leave df_result bound to None instead of the result.
df_result = df.withColumn(
    "percentage_contribution",
    F.round((F.col("sales") / total_sales) * 100, 1),
)

df_result.show()

+-------+-----+-----------------------+
|product|sales|percentage_contribution|
+-------+-----+-----------------------+
| Laptop| 1200|                   40.0|
|  Phone|  800|                   26.7|
| Tablet|  500|                   16.7|
|Desktop|  500|                   16.7|
+-------+-----+-----------------------+



Problem 2: SQL – Identify Months with No Transactions

Problem Statement
You have a SQL table transactions(txn_id, txn_date, amount). Write a query to find months in 2025 where there were no transactions. Assume txn_date is in YYYY-MM-DD format.

In [7]:
# Sample input for Problem 2: (txn_id, txn_date, amount) records, with the
# date supplied as a 'yyyy-MM-dd' string.
columns = ["txn_id", "txn_date", "amount"]
data = [
    ("T1", "2025-01-10", 100),
    ("T2", "2025-01-15", 200),
    ("T3", "2025-03-05", 300),
    ("T4", "2025-05-20", 400),
]

# Build the DataFrame and convert txn_date from string to a date column in one chain.
transactions = (
    spark.createDataFrame(data, columns)
    .withColumn("txn_date", to_date(col("txn_date"), "yyyy-MM-dd"))
)

# Expose the data to Spark SQL under the table name used by the queries.
transactions.createOrReplaceTempView("transactions")

In [8]:
# --- Calendar side: one 'yyyy-MM' row for every month of 2025 ---------------
# A single row holding the array of month-start dates from January to December.
calendar_array_df = spark.sql("SELECT sequence(to_date('2025-01-01'), to_date('2025-12-01'), interval 1 month) as months")
# Explode the array to one row per month-start date.
calendar_rows_df = calendar_array_df.withColumn("month", expr("explode(months)"))
# Format each date as 'yyyy-MM' and keep only that column.
months_df = calendar_rows_df.withColumn("month", expr("date_format(month, 'yyyy-MM')")).select("month")

months_df.createOrReplaceTempView("months")

# --- Transaction side: distinct 'yyyy-MM' values seen in 2025 ---------------
txn_months_query = """
    SELECT DISTINCT date_format(txn_date, 'yyyy-MM') AS month
    FROM transactions
    WHERE year(txn_date) = 2025
"""
txn_months = spark.sql(txn_months_query)
txn_months.createOrReplaceTempView("txn_months")

# --- Months in the calendar with no matching transaction month --------------
# The left join keeps every calendar month; rows where the transaction side
# is NULL are the months without any transactions.
missing_months_query = """
    SELECT m.month AS missing_month
    FROM months m
    LEFT JOIN txn_months t
           ON m.month = t.month
    WHERE t.month IS NULL
    ORDER BY m.month
"""
missing_months = spark.sql(missing_months_query)

missing_months.show()

+-------------+
|missing_month|
+-------------+
|      2025-02|
|      2025-04|
|      2025-06|
|      2025-07|
|      2025-08|
|      2025-09|
|      2025-10|
|      2025-11|
|      2025-12|
+-------------+

