In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, upper, when
# Obtain (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("EmployeeData")
    .getOrCreate()
)

# Sample employee records; each tuple is (Name, Department, Salary).
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]

# Column names, in the same order as the tuple fields above.
columns = ["Name", "Department", "Salary"]

# Column types are inferred from the Python values in `data`.
df = spark.createDataFrame(data, schema=columns)


Exercise Set 1: Basics

In [0]:
# Print the rows, then the inferred schema, then the row count.
df.show()
df.printSchema()
total_employees = df.count()
print("Total Employees:", total_employees)

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)

Total Employees: 7


Exercise Set 2: Column Operations

In [0]:

# Bonus is 15% of Salary; NetPay is Salary plus that Bonus.
# NetPay reads the Bonus column, so it has to be added in a second withColumn step.
df = (
    df
    .withColumn("Bonus", col("Salary") * 0.15)
    .withColumn("NetPay", col("Salary") + col("Bonus"))
)
df.show()


+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



Exercise Set 3: Filtering and Conditions

In [0]:
# Keep only the rows whose Department is exactly "Engineering".
is_engineering = col("Department") == "Engineering"
df.where(is_engineering).show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+



In [0]:
# Keep only the rows with Salary strictly greater than 60000.
earns_over_60k = col("Salary") > 60000
df.where(earns_over_60k).show()


+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
| Rahul|Engineering| 65000| 9750.0|74750.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+



In [0]:
# Drop the Marketing rows and display everything else.
is_marketing = col("Department") == "Marketing"
df.where(~is_marketing).show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+



Exercise Set 4: Sorting and Limiting

In [0]:
# Sort by Salary from highest to lowest and print only the first three rows.
df.orderBy("Salary", ascending=False).show(3)

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Naveen|Engineering| 70000|10500.0|80500.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
+------+-----------+------+-------+-------+
only showing top 3 rows



In [0]:
# Sort by Department ascending, then by Salary descending within each department.
df.orderBy(["Department", "Salary"], ascending=[True, False]).show()


+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Naveen|Engineering| 70000|10500.0|80500.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Ananya|         HR| 52000| 7800.0|59800.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



In [0]:
# Salary bands:
#   Salary > 60000            -> "Senior"
#   50000 <= Salary <= 60000  -> "Mid"   (between() is inclusive on both ends)
#   anything else             -> "Junior"
level_expr = (
    when(col("Salary") > 60000, "Senior")
    .when(col("Salary").between(50000, 60000), "Mid")
    .otherwise("Junior")
)
df = df.withColumn("Level", level_expr)


Exercise Set 5: String and Case Logic

In [0]:
# Replace the Name column with its upper-cased form, then display the result.
name_upper = upper(col("Name"))
df = df.withColumn("Name", name_upper)
df.show()

+------+-----------+------+-------+-------+------+
|  Name| Department|Salary|  Bonus| NetPay| Level|
+------+-----------+------+-------+-------+------+
|ANANYA|         HR| 52000| 7800.0|59800.0|   Mid|
| RAHUL|Engineering| 65000| 9750.0|74750.0|Senior|
| PRIYA|Engineering| 60000| 9000.0|69000.0|   Mid|
|  ZOYA|  Marketing| 48000| 7200.0|55200.0|Junior|
| KARAN|         HR| 53000| 7950.0|60950.0|   Mid|
|NAVEEN|Engineering| 70000|10500.0|80500.0|Senior|
|FATIMA|  Marketing| 45000| 6750.0|51750.0|Junior|
+------+-----------+------+-------+-------+------+

