In [1]:
pip install pyspark



In [2]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
spark=SparkSession.builder \
      .appName("Day 3") \
      .getOrCreate()

Problem 1: PySpark – Identify First Purchase per Customer


Problem Statement
You have a PySpark DataFrame containing customer purchase data. Each row represents a purchase. Write a PySpark program to find the first purchase date for each customer and the amount spent on that date.

In [4]:
data = [
    (101, "2025-01-03", 250),
    (101, "2025-01-05", 300),
    (102, "2025-01-01", 150),
    (102, "2025-01-02", 200),
    (103, "2025-01-04", 500),
]
columns = ["customer_id", "purchase_date", "amount"]

purchases = spark.createDataFrame(data, columns)

purchases = purchases.withColumn("purchase_date", to_date(col("purchase_date"), "yyyy-MM-dd"))
purchases.show()

+-----------+-------------+------+
|customer_id|purchase_date|amount|
+-----------+-------------+------+
|        101|   2025-01-03|   250|
|        101|   2025-01-05|   300|
|        102|   2025-01-01|   150|
|        102|   2025-01-02|   200|
|        103|   2025-01-04|   500|
+-----------+-------------+------+



In [5]:
window=Window.partitionBy("customer_id").orderBy(col("purchase_date").desc())
result=purchases.withColumn("rn",row_number().over(window)).filter(col("rn")==1).select("customer_id","purchase_date","amount").show()


+-----------+-------------+------+
|customer_id|purchase_date|amount|
+-----------+-------------+------+
|        101|   2025-01-05|   300|
|        102|   2025-01-02|   200|
|        103|   2025-01-04|   500|
+-----------+-------------+------+



Problem 2: SQL – Detect Employees with Salary Changes

Problem Statement
You have a table employee_salaries(emp_id, effective_date, salary) containing salary history for employees. Write a SQL query to find employees whose salary changed more than once and display the number of changes for each.



In [6]:
data = [
    (1, "2025-01-01", 50000),
    (1, "2025-02-01", 55000),
    (1, "2025-03-01", 60000),
    (2, "2025-01-15", 40000),
    (2, "2025-03-01", 45000),
    (3, "2025-01-10", 30000),
]
columns = ["emp_id", "effective_date", "salary"]

employee_salaries = spark.createDataFrame(data, columns)
employee_salaries.createOrReplaceTempView("employee_salaries")

In [7]:
result = spark.sql("""
WITH salary_changes AS (
    SELECT
        emp_id,
        effective_date,
        salary,
        LAG(salary) OVER (PARTITION BY emp_id ORDER BY effective_date) AS prev_salary
    FROM employee_salaries
)
SELECT
    emp_id,
    COUNT(*) FILTER (WHERE salary <> prev_salary) AS salary_change_count
FROM salary_changes
GROUP BY emp_id
HAVING salary_change_count > 0
""")

result.show()

+------+-------------------+
|emp_id|salary_change_count|
+------+-------------------+
|     1|                  2|
|     2|                  1|
+------+-------------------+

