# Scenario: Employee Work Data for a Tech Company

## Step 1: Prepare Data in PySpark

In [3]:
from pyspark.sql import Row

from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("EmployeeWorkData") \
    .getOrCreate()

data = [
    Row(EmpID=101, Name="Ravi", Department="Engineering", Project="AI Engine",Salary=95000, HoursPerWeek=42),
    Row(EmpID=102, Name="Sneha", Department="Engineering", Project="Data Platform",Salary=87000, HoursPerWeek=45),
    Row(EmpID=103, Name="Kabir", Department="Marketing", Project="Product Launch",Salary=65000, HoursPerWeek=40),
    Row(EmpID=104, Name="Anita", Department="Sales", Project="Client Outreach",Salary=70000, HoursPerWeek=38),
    Row(EmpID=105, Name="Divya", Department="Engineering", Project="AI Engine",Salary=99000, HoursPerWeek=48),
    Row(EmpID=106, Name="Amit", Department="Marketing", Project="Social Media",Salary=62000, HoursPerWeek=35),
    Row(EmpID=107, Name="Priya", Department="HR", Project="Policy Revamp",Salary=58000, HoursPerWeek=37),
    Row(EmpID=108, Name="Manav", Department="Sales", Project="Lead Gen", Salary=73000,HoursPerWeek=41),
    Row(EmpID=109, Name="Neha", Department="Engineering", Project="Security Suite",Salary=91000, HoursPerWeek=46),
    Row(EmpID=110, Name="Farah", Department="HR", Project="Onboarding", Salary=60000,HoursPerWeek=36)
]

df = spark.createDataFrame(data)
df.show(truncate=False)

+-----+-----+-----------+---------------+------+------------+
|EmpID|Name |Department |Project        |Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|101  |Ravi |Engineering|AI Engine      |95000 |42          |
|102  |Sneha|Engineering|Data Platform  |87000 |45          |
|103  |Kabir|Marketing  |Product Launch |65000 |40          |
|104  |Anita|Sales      |Client Outreach|70000 |38          |
|105  |Divya|Engineering|AI Engine      |99000 |48          |
|106  |Amit |Marketing  |Social Media   |62000 |35          |
|107  |Priya|HR         |Policy Revamp  |58000 |37          |
|108  |Manav|Sales      |Lead Gen       |73000 |41          |
|109  |Neha |Engineering|Security Suite |91000 |46          |
|110  |Farah|HR         |Onboarding     |60000 |36          |
+-----+-----+-----------+---------------+------+------------+



## Step 2: Create Views

In [4]:
# Create a Local Temp View
df.createOrReplaceTempView("employees_local")

# Create a Global Temp View
df.createOrReplaceGlobalTempView("employees_global")

## Part A: Exercises on Local View ( employees_local )

1. List all employees working on the "AI Engine" project.
2. Show all employees from the "Marketing" department with salaries greater than
60,000.
3. Calculate the average salary for each department.
4. List the top 3 highest paid employees overall.
5. Find employees who work more than 40 hours per week.
6. Group by project and display the number of employees per project.
7. Drop the local view. Try querying again — what happens?

In [5]:
# 1. List all employees working on the "AI Engine" project.
spark.sql("SELECT * FROM employees_local WHERE Project = 'AI Engine'").show()

+-----+-----+-----------+---------+------+------------+
|EmpID| Name| Department|  Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------+------+------------+
|  101| Ravi|Engineering|AI Engine| 95000|          42|
|  105|Divya|Engineering|AI Engine| 99000|          48|
+-----+-----+-----------+---------+------+------------+



In [6]:
# 2. Show all employees from the "Marketing" department with salaries greater than 60,000.
spark.sql("SELECT * FROM employees_local WHERE Department = 'Marketing' AND Salary > 60000").show()

+-----+-----+----------+--------------+------+------------+
|EmpID| Name|Department|       Project|Salary|HoursPerWeek|
+-----+-----+----------+--------------+------+------------+
|  103|Kabir| Marketing|Product Launch| 65000|          40|
|  106| Amit| Marketing|  Social Media| 62000|          35|
+-----+-----+----------+--------------+------+------------+



In [7]:
# 3. Calculate the average salary for each department.
spark.sql("SELECT Department, AVG(Salary) AS AvgSalary FROM employees_local GROUP BY Department").show()

+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|      Sales|  71500.0|
|Engineering|  93000.0|
|  Marketing|  63500.0|
|         HR|  59000.0|
+-----------+---------+



In [8]:
# 4. List the top 3 highest paid employees overall.
spark.sql("SELECT Name, Salary FROM employees_local ORDER BY Salary DESC LIMIT 3").show()

+-----+------+
| Name|Salary|
+-----+------+
|Divya| 99000|
| Ravi| 95000|
| Neha| 91000|
+-----+------+



In [9]:
# 5. Find employees who work more than 40 hours per week.
spark.sql("SELECT * FROM employees_local WHERE HoursPerWeek > 40").show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  108|Manav|      Sales|      Lead Gen| 73000|          41|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



In [10]:
# 6. Group by project and display the number of employees per project.
spark.sql("SELECT Project, COUNT(*) AS EmployeeCount FROM employees_local GROUP BY Project").show()

+---------------+-------------+
|        Project|EmployeeCount|
+---------------+-------------+
|  Data Platform|            1|
|      AI Engine|            2|
| Product Launch|            1|
|Client Outreach|            1|
| Security Suite|            1|
|  Policy Revamp|            1|
|       Lead Gen|            1|
|   Social Media|            1|
|     Onboarding|            1|
+---------------+-------------+



In [11]:
# 7. Drop the local view. Try querying again — what happens?

# Drop view
spark.catalog.dropTempView("employees_local")

# Try querying again (this will throw an error)
spark.sql("SELECT * FROM employees_local").show

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `employees_local` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [employees_local], [], false


## Part B: Exercises on Global View ( employees_global )

1. Retrieve all "HR" employees working fewer than 38 hours/week.
2. Calculate the total salary payout for each department.
3. For each employee, add a derived column Status :

    If HoursPerWeek > 45 → 'Overworked'

    Otherwise → 'Normal'
4. Count the total number of employees working on each "Project" .
5. List employees whose salary is above the average salary in their department.
6. Open a new Spark session and query "global_temp.employees_global" from there.

In [13]:
# 1. Retrieve all "HR" employees working fewer than 38 hours/week.
spark.sql("SELECT * FROM global_temp.employees_global WHERE Department = 'HR' AND HoursPerWeek < 38").show()

+-----+-----+----------+-------------+------+------------+
|EmpID| Name|Department|      Project|Salary|HoursPerWeek|
+-----+-----+----------+-------------+------+------------+
|  107|Priya|        HR|Policy Revamp| 58000|          37|
|  110|Farah|        HR|   Onboarding| 60000|          36|
+-----+-----+----------+-------------+------+------------+



In [14]:
# 2. Calculate the total salary payout for each department.
spark.sql("SELECT Department, SUM(Salary) AS TotalSalary FROM global_temp.employees_global GROUP BY Department").show()

+-----------+-----------+
| Department|TotalSalary|
+-----------+-----------+
|      Sales|     143000|
|Engineering|     372000|
|  Marketing|     127000|
|         HR|     118000|
+-----------+-----------+



In [15]:
# 3. For each employee, add a derived column Status :
    # If HoursPerWeek > 45 → 'Overworked'
    # Otherwise → 'Normal'
spark.sql("""
    SELECT *,
        CASE
            WHEN HoursPerWeek > 45 THEN 'Overworked'
            ELSE 'Normal'
        END AS Status
    FROM global_temp.employees_global
""").show()


+-----+-----+-----------+---------------+------+------------+----------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|    Status|
+-----+-----+-----------+---------------+------+------------+----------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|    Normal|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|    Normal|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|    Normal|
|  104|Anita|      Sales|Client Outreach| 70000|          38|    Normal|
|  105|Divya|Engineering|      AI Engine| 99000|          48|Overworked|
|  106| Amit|  Marketing|   Social Media| 62000|          35|    Normal|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|    Normal|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|    Normal|
|  109| Neha|Engineering| Security Suite| 91000|          46|Overworked|
|  110|Farah|         HR|     Onboarding| 60000|          36|    Normal|
+-----+-----+-----------+---------------+------+---

In [16]:
# 4. Count the total number of employees working on each "Project" .
spark.sql("SELECT Project, COUNT(*) AS EmployeeCount FROM global_temp.employees_global GROUP BY Project").show()

+---------------+-------------+
|        Project|EmployeeCount|
+---------------+-------------+
|  Data Platform|            1|
|      AI Engine|            2|
| Product Launch|            1|
|Client Outreach|            1|
| Security Suite|            1|
|  Policy Revamp|            1|
|       Lead Gen|            1|
|   Social Media|            1|
|     Onboarding|            1|
+---------------+-------------+



In [17]:
# 5. List employees whose salary is above the average salary in their department.
spark.sql("""
    SELECT *
    FROM global_temp.employees_global g1
    WHERE Salary > (
        SELECT AVG(Salary)
        FROM global_temp.employees_global g2
        WHERE g2.Department = g1.Department
        )
""").show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  103|Kabir|  Marketing|Product Launch| 65000|          40|
|  108|Manav|      Sales|      Lead Gen| 73000|          41|
|  110|Farah|         HR|    Onboarding| 60000|          36|
+-----+-----+-----------+--------------+------+------------+



In [19]:
# 6. Open a new Spark session and query "global_temp.employees_global" from there.
from pyspark.sql import SparkSession

# New session
new_spark = SparkSession.builder.appName("NewSession").getOrCreate()

# Query global view from new session
new_spark.sql("SELECT * FROM global_temp.employees_global").show()

+-----+-----+-----------+---------------+------+------------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|
|  104|Anita|      Sales|Client Outreach| 70000|          38|
|  105|Divya|Engineering|      AI Engine| 99000|          48|
|  106| Amit|  Marketing|   Social Media| 62000|          35|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|
|  109| Neha|Engineering| Security Suite| 91000|          46|
|  110|Farah|         HR|     Onboarding| 60000|          36|
+-----+-----+-----------+---------------+------+------------+



## Bonus Challenges

1. Use a window function to assign rank to employees within each department based
on salary.
2. Create another view (local or global) that only contains "Engineering"
employees.
3. Create a SQL view that filters out all employees working < 38 hours and saves
it as "active_employees" .

In [20]:
# 1. Use a window function to assign rank to employees within each department based on salary.
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

windowSpec = Window.partitionBy("Department").orderBy(df["Salary"].desc())
df.withColumn("Rank", rank().over(windowSpec)).select("Name", "Department", "Salary", "Rank").show()

+-----+-----------+------+----+
| Name| Department|Salary|Rank|
+-----+-----------+------+----+
|Divya|Engineering| 99000|   1|
| Ravi|Engineering| 95000|   2|
| Neha|Engineering| 91000|   3|
|Sneha|Engineering| 87000|   4|
|Farah|         HR| 60000|   1|
|Priya|         HR| 58000|   2|
|Kabir|  Marketing| 65000|   1|
| Amit|  Marketing| 62000|   2|
|Manav|      Sales| 73000|   1|
|Anita|      Sales| 70000|   2|
+-----+-----------+------+----+



In [21]:
# 2. Create another view (local or global) that only contains "Engineering" employees.

# Filter employees from Engineering department
df_engineering = df.filter(df.Department == "Engineering")

# Create a local temporary view named "engineering_employees"
df_engineering.createOrReplaceTempView("engineering_employees")

spark.sql("SELECT * FROM engineering_employees").show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



In [22]:
# 3. Create a SQL view that filters out all employees working < 38 hours and saves it as "active_employees" .
df.createOrReplaceTempView("all_employees")

spark.sql("""
    CREATE OR REPLACE TEMP VIEW active_employees AS
    SELECT *
    FROM all_employees
    WHERE HoursPerWeek >= 38
""")

spark.sql("SELECT * FROM active_employees").show()

+-----+-----+-----------+---------------+------+------------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|
|  104|Anita|      Sales|Client Outreach| 70000|          38|
|  105|Divya|Engineering|      AI Engine| 99000|          48|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|
|  109| Neha|Engineering| Security Suite| 91000|          46|
+-----+-----+-----------+---------------+------+------------+

