In [None]:
from pyspark.sql import SparkSession

In [None]:
# Obtain the notebook's Spark session: getOrCreate() reuses an already-running
# session if one exists, otherwise it starts a new one named "EmployeeData".
spark = (
    SparkSession.builder
    .appName("EmployeeData")
    .getOrCreate()
)


In [None]:
# Column names, in the same order as the fields of each record below.
columns = ["Name", "Department", "Salary"]

# One (name, department, salary) tuple per employee.
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]

# Build the base DataFrame that every exercise below starts from; column types
# are inferred by Spark from the Python values.
df = spark.createDataFrame(data=data, schema=columns)
df.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+



**Exercise Set 1: Basics**

In [None]:
# 1 Print every employee row (the defaults, 20 rows with truncated cells,
#   are spelled out here; the dataset has only 7 rows)
df.show(n=20, truncate=True)

# 2 Print the column names, data types and nullability as a tree
df.printSchema()

# 3 Number of rows in the dataset; kept as the last expression so the
#   notebook displays the value
df.count()


+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)



7

**Exercise Set 2: Column Operations**

In [None]:
from pyspark.sql.functions import col

# 4 Add a "Bonus" column equal to 15% of each employee's salary.
#   withColumn returns a new DataFrame; `df` itself is left unchanged.
df_with_bonus = df.withColumn(
    "Bonus",
    df["Salary"] * 0.15,
)
df_with_bonus.show()



+------+-----------+------+-------+
|  Name| Department|Salary|  Bonus|
+------+-----------+------+-------+
|Ananya|         HR| 52000| 7800.0|
| Rahul|Engineering| 65000| 9750.0|
| Priya|Engineering| 60000| 9000.0|
|  Zoya|  Marketing| 48000| 7200.0|
| Karan|         HR| 53000| 7950.0|
|Naveen|Engineering| 70000|10500.0|
|Fatima|  Marketing| 45000| 6750.0|
+------+-----------+------+-------+



In [None]:
# 5 Add a "NetPay" column: base salary plus the bonus computed in exercise 4.
df_with_netpay = df_with_bonus.withColumn(
    "NetPay",
    df_with_bonus["Salary"] + df_with_bonus["Bonus"],
)
df_with_netpay.show()


+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



**Exercise Set 3: Filtering and Conditions**

In [None]:
# 6 Keep only the rows whose Department is exactly "Engineering"
#   (where() is an alias of filter()).
df.where(df["Department"] == "Engineering").show()




+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|Naveen|Engineering| 70000|
+------+-----------+------+

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
| Rahul|Engineering| 65000|
|Naveen|Engineering| 70000|
+------+-----------+------+

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
+------+-----------+------+



In [None]:
# 7 Employees earning strictly more than 60000 (a salary of exactly 60000
#   is excluded).
df.where(df["Salary"] > 60000).show()



+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
| Rahul|Engineering| 65000|
|Naveen|Engineering| 70000|
+------+-----------+------+



In [None]:
# 8 Everyone outside the Marketing department, expressed as the negation
#   of an equality test.
df.where(~(col("Department") == "Marketing")).show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
+------+-----------+------+



**Exercise Set 4: Sorting and Limiting**

In [None]:
# 9 Three highest-paid employees: sort by Salary from largest to smallest,
#   then keep the first three rows.
df.sort("Salary", ascending=False).limit(3).show()




+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Naveen|Engineering| 70000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
+------+-----------+------+

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Naveen|Engineering| 70000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
| Karan|         HR| 53000|
|Ananya|         HR| 52000|
|  Zoya|  Marketing| 48000|
|Fatima|  Marketing| 45000|
+------+-----------+------+



In [None]:
# 10 Order by Department A-Z, then by Salary highest-first within each
#    department. The ascending flags line up position-by-position with the
#    column names.
df.orderBy(["Department", "Salary"], ascending=[True, False]).show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Naveen|Engineering| 70000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
| Karan|         HR| 53000|
|Ananya|         HR| 52000|
|  Zoya|  Marketing| 48000|
|Fatima|  Marketing| 45000|
+------+-----------+------+



**Exercise Set 5: String and Case Logic**

In [None]:
from pyspark.sql.functions import when, upper

# 11 Label each employee with a salary band:
#      Senior - salary above 60000
#      Mid    - salary from 50000 to 60000, both ends inclusive
#      Junior - everything else
#    Conditions are evaluated top to bottom; the first match wins.
df_level = df.withColumn(
    "Level",
    when(col("Salary") > 60000, "Senior")
    .when(col("Salary").between(50000, 60000), "Mid")
    .otherwise("Junior"),
)
df_level.show()




+------+-----------+------+------+
|  Name| Department|Salary| Level|
+------+-----------+------+------+
|Ananya|         HR| 52000|   Mid|
| Rahul|Engineering| 65000|Senior|
| Priya|Engineering| 60000|   Mid|
|  Zoya|  Marketing| 48000|Junior|
| Karan|         HR| 53000|   Mid|
|Naveen|Engineering| 70000|Senior|
|Fatima|  Marketing| 45000|Junior|
+------+-----------+------+------+

+------+-----------+------+---------+
|  Name| Department|Salary|NameUpper|
+------+-----------+------+---------+
|Ananya|         HR| 52000|   ANANYA|
| Rahul|Engineering| 65000|    RAHUL|
| Priya|Engineering| 60000|    PRIYA|
|  Zoya|  Marketing| 48000|     ZOYA|
| Karan|         HR| 53000|    KARAN|
|Naveen|Engineering| 70000|   NAVEEN|
|Fatima|  Marketing| 45000|   FATIMA|
+------+-----------+------+---------+



In [None]:
# 12 Add a "NameUpper" column holding each name converted to upper case;
#    the original "Name" column is kept as-is.
df_upper = df.withColumn(
    "NameUpper",
    upper(df["Name"]),
)
df_upper.show()

+------+-----------+------+---------+
|  Name| Department|Salary|NameUpper|
+------+-----------+------+---------+
|Ananya|         HR| 52000|   ANANYA|
| Rahul|Engineering| 65000|    RAHUL|
| Priya|Engineering| 60000|    PRIYA|
|  Zoya|  Marketing| 48000|     ZOYA|
| Karan|         HR| 53000|    KARAN|
|Naveen|Engineering| 70000|   NAVEEN|
|Fatima|  Marketing| 45000|   FATIMA|
+------+-----------+------+---------+

