In [1]:
pip install pyspark



In [2]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
# Create the Spark session used by the cells below, or reuse one that
# already exists in this process.
spark = (
    SparkSession.builder
    .appName("Day 4")
    .getOrCreate()
)

Problem 1: PySpark – Detect Outlier Transactions per Day


Problem Statement
You have a PySpark DataFrame of daily transactions (`txn_date`, `txn_id`, `amount`). For each day, identify the transactions whose amount is strictly greater than that day's average amount; these are treated as outliers.

In [3]:
# Sample transactions, one tuple per row in the order given by txn_columns.
txn_columns = ["txn_date", "txn_id", "amount"]
txn_rows = [
    ("2025-01-01", "T1", 100),
    ("2025-01-01", "T2", 200),
    ("2025-01-01", "T3", 500),
    ("2025-01-02", "T4", 300),
    ("2025-01-02", "T5", 400),
    ("2025-01-02", "T6", 600),
]

# Column types are inferred by Spark from the Python values.
df = spark.createDataFrame(txn_rows, schema=txn_columns)

In [4]:
# Average transaction amount per day.
daily_avg = df.groupBy("txn_date").agg(avg("amount").alias("avg_amount"))

# Attach each day's average to every transaction of that day.
df_with_avg = df.join(daily_avg, on="txn_date", how="inner")

# Keep only the transactions strictly above their day's average.
# DataFrame.show() prints the rows and returns None, so the DataFrame is
# stored in `outliers` first and displayed in a separate step; chaining
# .show() into the assignment would leave `outliers` set to None.
outliers = (
    df_with_avg
    .filter(col("amount") > col("avg_amount"))
    .select("txn_date", "txn_id", "amount")
    .orderBy("txn_date", "txn_id")  # explicit sort for a deterministic display order
)
outliers.show()

+----------+------+------+
|  txn_date|txn_id|amount|
+----------+------+------+
|2025-01-01|    T3|   500|
|2025-01-02|    T6|   600|
+----------+------+------+



Problem 2: SQL – Find Employees Always Reporting to the Same Manager

Problem Statement
You have a table `employee_manager(emp_id, manager_id, change_date)` recording each employee's manager over time. Write a SQL query to find the employees who never changed manager, i.e. whose records all carry the same `manager_id` (an employee with a single record qualifies).



In [5]:
# Manager-assignment history, one tuple per row in the order given by columns.
columns = ["emp_id", "manager_id", "change_date"]
data = [
    (1, 10, "2025-01-01"),
    (1, 10, "2025-02-01"),
    (2, 11, "2025-01-01"),
    (2, 12, "2025-03-01"),
    (3, 13, "2025-01-05"),
]

# Build the DataFrame and convert change_date from a "yyyy-MM-dd" string
# into a date column.
df = (
    spark.createDataFrame(data, schema=columns)
    .withColumn("change_date", to_date(col("change_date"), "yyyy-MM-dd"))
)

# Register the DataFrame as a temp view so it can be queried with Spark SQL.
df.createOrReplaceTempView("employee_manager")

In [6]:
# One output row per employee whose records contain exactly one distinct
# manager_id.
same_manager_sql = """
    SELECT emp_id
    FROM employee_manager
    GROUP BY emp_id
    HAVING COUNT(DISTINCT manager_id) = 1
"""

result = spark.sql(same_manager_sql)
result.show()

+------+
|emp_id|
+------+
|     1|
|     3|
+------+

