In [0]:
#importing libraries and modules
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *


# Get the active Spark session, or build a new one registered under the app name "Read".
spark = (
    SparkSession.builder
    .appName("Read")
    .getOrCreate()
)


In [0]:
# Load the employee CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df1 = (
    spark.read
    .format("csv")
    .options(inferSchema=True, header="true")
    .load("dbfs:/FileStore/shared_uploads/timilsina.ra@northeastern.edu/assignment_employees.csv")
)
df1.display()

id,name,age,department,salary,join_date
1,John,34,IT,75000,2015-06-01
2,Sara,28,HR,58000,2019-09-15
3,Michael,45,Finance,120000,2010-01-10
4,Karen,29,IT,70000,2020-02-19
5,David,38,Finance,90000,2017-08-23
6,Linda,33,HR,60000,2018-12-05
7,James,41,IT,110000,2013-04-15
8,Emily,27,HR,52000,2021-06-20
9,Robert,36,Finance,105000,2016-11-30


Question 1 - Display records of employees aged above 30.

In [0]:
# Keep only the rows whose age is strictly greater than 30.
df_emp_over_30 = df1.where(df1["age"] > 30)
df_emp_over_30.display()

id,name,age,department,salary,join_date
1,John,34,IT,75000,2015-06-01
3,Michael,45,Finance,120000,2010-01-10
5,David,38,Finance,90000,2017-08-23
6,Linda,33,HR,60000,2018-12-05
7,James,41,IT,110000,2013-04-15
9,Robert,36,Finance,105000,2016-11-30


Question 2 - Find the average salary of employees in each department.

In [0]:
# Mean salary per department, rounded to zero decimal places.
# The aggregate is left unaliased, so Spark auto-generates the column name.
df_average_sal_by_dept = (
    df1
    .groupBy("department")
    .agg(round(avg("salary"), 0))
)
df_average_sal_by_dept.display()

department,"round(avg(salary), 0)"
HR,56667.0
Finance,105000.0
IT,85000.0


Question 3 - Add a column experience indicating the number of years an employee has been working.

In [0]:

# Add "experience": the number of completed years between join_date and today.
# months_between works on calendar months, so floor(months / 12) yields whole
# years of service without the leap-year drift that dividing a day count by 365
# introduces (which can credit an extra year a few days before the anniversary).
df1 = df1.withColumn(
    "experience",
    floor(months_between(current_date(), col("join_date")) / 12),
)

df1.display()

id,name,age,department,salary,join_date,experience
1,John,34,IT,75000,2015-06-01,9
2,Sara,28,HR,58000,2019-09-15,5
3,Michael,45,Finance,120000,2010-01-10,14
4,Karen,29,IT,70000,2020-02-19,4
5,David,38,Finance,90000,2017-08-23,7
6,Linda,33,HR,60000,2018-12-05,5
7,James,41,IT,110000,2013-04-15,11
8,Emily,27,HR,52000,2021-06-20,3
9,Robert,36,Finance,105000,2016-11-30,8


Question 4 - Find the top 3 highest-paid employees.

In [0]:
# Top 3 earners, highest salary first.
# The column is referenced by its exact schema name ("salary") so the lookup also
# works when spark.sql.caseSensitive is enabled; "id" is a secondary sort key so
# that ties on salary always resolve to the same three rows.
df_highest_paid = df1.orderBy(desc("salary"), asc("id")).limit(3)
df_highest_paid.display()

id,name,age,department,salary,join_date,experience
3,Michael,45,Finance,120000,2010-01-10,14
7,James,41,IT,110000,2013-04-15,11
9,Robert,36,Finance,105000,2016-11-30,8


Question 5 - Identify the department with the highest total salary.

In [0]:
# Total salary paid per department.
# Note: `sum` here is pyspark.sql.functions.sum (brought in by the wildcard
# import), not Python's builtin sum.
df_salary_by_dept = (
    df1
    .groupBy("department")
    .agg(sum(col("salary")).alias("total_salary"))
)
df_salary_by_dept.display()

# Sort the totals in descending order and keep only the first row.
df_dept_with_highest_total_salary = (
    df_salary_by_dept
    .sort(col("total_salary").desc())
    .limit(1)
)
df_dept_with_highest_total_salary.display()

department,total_salary
HR,170000
Finance,315000
IT,255000


department,total_salary
Finance,315000


Question 6 - Create a new DataFrame with employees earning more than the average salary.

In [0]:
# Overall average salary, rounded to 2 decimals for display purposes only.
df_average_sal_overall = df1.agg(round(avg("salary"), 2).alias("avg_salary"))
df_average_sal_overall.display()

# Threshold for the comparison: the unrounded mean. Comparing against the rounded
# figure could misclassify a salary that lies between the rounded and true mean.
# A global aggregate always yields exactly one row, so first() is safe here.
avg_salary_value = df1.agg(avg("salary").alias("avg_salary")).first()["avg_salary"]

# Keep only the employees whose salary is strictly above the average.
df_emp_making_above_average = df1.filter(col("salary") > avg_salary_value)
df_emp_making_above_average.display()


avg_salary
82222.22


id,name,age,department,salary,join_date,experience
3,Michael,45,Finance,120000,2010-01-10,14
5,David,38,Finance,90000,2017-08-23,7
7,James,41,IT,110000,2013-04-15,11
9,Robert,36,Finance,105000,2016-11-30,8


Question 7 - Rename the column name to employee_name.

In [0]:
# Rename the "name" column to "employee_name"; df1 is rebound to the renamed frame,
# so cells below must refer to the column by its new name.
df1 = df1.withColumnRenamed(existing="name", new="employee_name")
df1.display()

id,employee_name,age,department,salary,join_date,experience
1,John,34,IT,75000,2015-06-01,9
2,Sara,28,HR,58000,2019-09-15,5
3,Michael,45,Finance,120000,2010-01-10,14
4,Karen,29,IT,70000,2020-02-19,4
5,David,38,Finance,90000,2017-08-23,7
6,Linda,33,HR,60000,2018-12-05,5
7,James,41,IT,110000,2013-04-15,11
8,Emily,27,HR,52000,2021-06-20,3
9,Robert,36,Finance,105000,2016-11-30,8


Question 8 - Find the number of employees in each department.

In [0]:
# Head count per department, computed as the number of non-null ids in each group.
df1_emp_count = (
    df1
    .groupBy(col("department"))
    .agg(count(col("id")).alias("emp_by_department"))
)
df1_emp_count.display()

department,emp_by_department
HR,3
Finance,3
IT,3


Question 9 - Select and display only the id and name columns.

In [0]:
# Project the frame down to the id and employee_name columns only.
df_name_id = df1.select(col("id"), col("employee_name"))
df_name_id.display()

id,employee_name
1,John
2,Sara
3,Michael
4,Karen
5,David
6,Linda
7,James
8,Emily
9,Robert


Question 10 - Check for any null values in the dataset.

In [0]:
# Per-row, per-column null flags: each cell is True where the source value is null.
# display() returns None, so the DataFrame is built first and displayed separately;
# chaining .display() onto the assignment would bind df_is_null to None.
df_is_null = df1.select([col(c).isNull() for c in df1.columns])
df_is_null.display()


(id IS NULL),(employee_name IS NULL),(age IS NULL),(department IS NULL),(salary IS NULL),(join_date IS NULL),(experience IS NULL)
False,False,False,False,False,False,False
False,False,False,False,False,False,False
False,False,False,False,False,False,False
False,False,False,False,False,False,False
False,False,False,False,False,False,False
False,False,False,False,False,False,False
False,False,False,False,False,False,False
False,False,False,False,False,False,False
False,False,False,False,False,False,False
