In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [0]:
spark = SparkSession.builder.appName("NB-1").getOrCreate()
spark

#**Dataset**

In [0]:
data = [
("Ananya", "HR", 52000),
("Rahul", "Engineering", 65000),
("Priya", "Engineering", 60000),
("Zoya", "Marketing", 48000),
("Karan", "HR", 53000),
("Naveen", "Engineering", 70000),
("Fatima", "Marketing", 45000)
]
columns = ["Name", "Department", "Salary"]
df = spark.createDataFrame(data, columns)

In [0]:
df.show(5)

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
+------+-----------+------+
only showing top 5 rows


#**Basics**

In [0]:
# 1. Display all records in the DataFrame.
df.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+



In [0]:
# 2. Print the schema of the DataFrame.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)



In [0]:
# 3. Count total number of employees.
print(f"Employees count: {df.count()}")

Employees count: 7


#**Column Operations**

In [0]:
# 4. Add a new column Bonus which is 15% of Salary.
df = df.withColumn("Bonous", df.Salary * 0.15)
df.show()

+------+-----------+------+-------+
|  Name| Department|Salary| Bonous|
+------+-----------+------+-------+
|Ananya|         HR| 52000| 7800.0|
| Rahul|Engineering| 65000| 9750.0|
| Priya|Engineering| 60000| 9000.0|
|  Zoya|  Marketing| 48000| 7200.0|
| Karan|         HR| 53000| 7950.0|
|Naveen|Engineering| 70000|10500.0|
|Fatima|  Marketing| 45000| 6750.0|
+------+-----------+------+-------+



In [0]:
# 5. Add a new column NetPay = Salary + Bonus.
df = df.withColumn("NetPay", df.Salary + df.Bonous)
df.show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary| Bonous| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



#**Filtering and Conditions**

In [0]:
# 6. Display only employees from the “Engineering” department.
df.filter(df.Department == "Engineering").show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary| Bonous| NetPay|
+------+-----------+------+-------+-------+
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+



In [0]:
# 7. Display employees whose salary is greater than 60000.
df.filter(df.Salary > 60_000).show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary| Bonous| NetPay|
+------+-----------+------+-------+-------+
| Rahul|Engineering| 65000| 9750.0|74750.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+



In [0]:
# 8. Display employees who are not in the “Marketing” department.
df.filter(df.Department != "Marketing").show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary| Bonous| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+



#**Sorting and Limiting**

In [0]:
# 9. Show top 3 highest paid employees.
df.sort("Salary", ascending=False).show(3)

+------+-----------+------+-------+-------+
|  Name| Department|Salary| Bonous| NetPay|
+------+-----------+------+-------+-------+
|Naveen|Engineering| 70000|10500.0|80500.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
+------+-----------+------+-------+-------+
only showing top 3 rows


In [0]:
# 10. Sort the data by Department ascending and Salary descending.
df.sort(df.Department.asc(), df.Salary.desc()).show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary| Bonous| NetPay|
+------+-----------+------+-------+-------+
|Naveen|Engineering| 70000|10500.0|80500.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Ananya|         HR| 52000| 7800.0|59800.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



#**String and Case Logic**

In [0]:
# 11. Add a new column Level : “Senior” if salary > 60000
# “Mid” if salary between 50000 and 60000
# “Junior” otherwise
df.withColumn("Level", F.when(df.Salary > 60_000, "Senior").when((df.Salary > 50_000) & (df.Salary < 60_000), "Mid").otherwise("Junior")).show()

+------+-----------+------+-------+-------+------+
|  Name| Department|Salary| Bonous| NetPay| Level|
+------+-----------+------+-------+-------+------+
|Ananya|         HR| 52000| 7800.0|59800.0|   Mid|
| Rahul|Engineering| 65000| 9750.0|74750.0|Senior|
| Priya|Engineering| 60000| 9000.0|69000.0|Junior|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|Junior|
| Karan|         HR| 53000| 7950.0|60950.0|   Mid|
|Naveen|Engineering| 70000|10500.0|80500.0|Senior|
|Fatima|  Marketing| 45000| 6750.0|51750.0|Junior|
+------+-----------+------+-------+-------+------+



In [0]:
# 12. Convert all names to uppercase.
df.withColumn("Name", F.upper(df.Name)).show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary| Bonous| NetPay|
+------+-----------+------+-------+-------+
|ANANYA|         HR| 52000| 7800.0|59800.0|
| RAHUL|Engineering| 65000| 9750.0|74750.0|
| PRIYA|Engineering| 60000| 9000.0|69000.0|
|  ZOYA|  Marketing| 48000| 7200.0|55200.0|
| KARAN|         HR| 53000| 7950.0|60950.0|
|NAVEEN|Engineering| 70000|10500.0|80500.0|
|FATIMA|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+

