In [None]:
# Imports for the Colab Drive helper and the Spark entry point.
from google.colab import drive
from pyspark.sql import SparkSession

# Mount Google Drive so files under /content/drive are readable from this runtime.
drive.mount('/content/drive')

# Get the running Spark session, or start one named "EmployeeAnalysis".
spark = SparkSession.builder.appName("EmployeeAnalysis").getOrCreate()

# Read the employee CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/drive/MyDrive/large_employee_dataset.csv")
)


Mounted at /content/drive


In [None]:
# 1. Preview the dataset by printing its first 10 rows.
df.show(n=10)


+----------+--------------------+---+----------+------+-----------+--------+------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|        City|
+----------+--------------------+---+----------+------+-----------+--------+------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|   Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active| Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|   Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|  Russohaven|
|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active| East Robert|
|      8278|       Riley Johnson| 49|        HR| 43449| 2015-08-03|Resigned|  New Thomas|
|      852

In [None]:
# 2. Count the total number of employees, i.e. the number of rows in df.
# As the last expression of the cell, the count is displayed as the cell output.
df.count()


500

In [None]:
# 3. List every department exactly once.
(
    df
    .select("Department")
    .distinct()
    .show()
)


+----------+
|Department|
+----------+
|     Sales|
|        HR|
|   Finance|
| Marketing|
|        IT|
+----------+



In [None]:
# 4. Show only the employees whose Department is "IT".
# `where` is an alias of `filter`.
df.where(df["Department"] == "IT").show()


+----------+-------------------+---+----------+------+-----------+--------+------------------+
|EmployeeID|               Name|Age|Department|Salary|JoiningDate|  Status|              City|
+----------+-------------------+---+----------+------+-----------+--------+------------------+
|      6598|        Mary Henson| 58|        IT| 63951| 2021-08-25|  Active|       Port Tricia|
|      8518|   Elizabeth Abbott| 22|        IT| 91732| 2022-11-05|  Active|       Douglasside|
|      9506|        Thomas Dunn| 45|        IT| 90340| 2020-07-12|On Leave|    Lindseychester|
|      9663|        Glenn Mason| 43|        IT|109189| 2020-03-27|On Leave|      Katelynburgh|
|      2106|     Richard Bailey| 45|        IT| 30950| 2021-06-29|Resigned|        North John|
|      8212|      Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave|South Veronicastad|
|      6354|     Nicole Gilmore| 35|        IT|104202| 2018-05-04|  Active|       East Joseph|
|      5716|         David Wang| 49|        IT| 94

In [None]:
# 5. Show employees aged between 30 and 40.
# `between` keeps both endpoints, so ages 30 and 40 are included.
# The statement starts at column 0 on purpose: an indented top-level statement
# makes the cell fail with "IndentationError: unexpected indent".
df.filter(df.Age.between(30, 40)).show()


+----------+------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|              Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+------------------+---+----------+------+-----------+--------+-------------------+
|      4676|Christopher Fuller| 30|        HR| 63042| 2021-04-30|Resigned|   South Donnaville|
|      4136|     Jerome Torres| 30|   Finance| 68213| 2024-11-30|  Active|North Justinborough|
|      1588|       Edwin Burns| 34|     Sales|108208| 2015-09-14|Resigned|        South David|
|      8074|       Fred Brewer| 30|        HR|100736| 2021-06-06|On Leave|    Port Wendyville|
|      3841|       April Allen| 36|        HR| 98845| 2020-05-20|  Active|      Rachelchester|
|      8212|     Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave| South Veronicastad|
|      3325|       Brian Huynh| 40|   Finance| 59924| 2017-01-02|On Leave|           Johnside|
|      6180|     Robert Cortez| 35| Marketing| 761

In [None]:
# 6. Sort employees by Salary in descending order (highest salary first).
# The statement starts at column 0 on purpose: an indented top-level statement
# makes the cell fail with "IndentationError: unexpected indent".
df.orderBy(df.Salary.desc()).show()


+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|      8860|       Cody Williams| 30|        IT|119978| 2019-03-16|Resigned|         Susanville|
|      4585|      Sandra Webster| 30|        HR|119940| 2022-10-21|On Leave|       Thompsonport|
|      4667|         Ronald Hunt| 58|     Sales|119677| 2019-08-29|Resigned|    Griffithchester|
|      1602|    Deborah Williams| 25|        HR|119397| 2023-09-26|On Leave|    Port Terrimouth|
|      3374|        Amanda Green| 41|        HR|119316| 2021-04-08|Resigned|    West Shelleyton|
|      6329|       Heidi Shaffer| 36|        HR|119165| 2020-01-14|Resigned|          New Alexa|
|      2428|        Mary Stevens| 55|     Sales|119137| 2022-03-06|On Leave|         Travisport|
|      1545|Brittany Christens

In [None]:
# 7. Average salary per department, reported in a column named "AverageSalary".
from pyspark.sql.functions import avg

(
    df
    .groupBy("Department")
    .agg(avg("Salary").alias("AverageSalary"))
    .show()
)


+----------+-----------------+
|Department|    AverageSalary|
+----------+-----------------+
|     Sales|77488.54545454546|
|        HR|76091.27450980392|
|   Finance|72834.75630252101|
| Marketing| 71958.1888888889|
|        IT|73116.25555555556|
+----------+-----------------+



In [None]:
# 8. Number of employees in each Status.
(
    df
    .groupBy("Status")
    .count()
    .show()
)


In [None]:
# 9. Highest salary in each city.
# Import Spark's aggregate under an alias: a plain
# `from pyspark.sql.functions import max` would replace Python's builtin `max`
# for every cell that runs afterwards.
from pyspark.sql.functions import max as spark_max

(
    df
    .groupBy("City")
    .agg(spark_max("Salary").alias("HighestSalary"))
    .show()
)


+----------------+-------------+
|            City|HighestSalary|
+----------------+-------------+
|   Wilsonchester|        67025|
|     Bradshawton|       111116|
|       Steventon|        32009|
|     Lake Alyssa|        84903|
|      North Lisa|        57898|
|    North Marvin|        66252|
|     Jenniferton|        39907|
|     Buckleyview|        50109|
|     Burtonville|        98492|
|    Johnsonmouth|        48799|
|    South Joseph|        52456|
|  Lindseychester|        90340|
|   North Stephen|        91947|
|Port Nicoleshire|        57537|
|    Jerrychester|        53374|
|  North Jennifer|        82486|
|      Laurenstad|        44608|
|West Brendanbury|        90698|
|       Juliaberg|        50170|
|       New James|        54378|
+----------------+-------------+
only showing top 20 rows



In [None]:

# 10. Number of employees who joined in each calendar year, earliest year first.
from pyspark.sql.functions import year

(
    df
    .withColumn("JoinYear", year("JoiningDate"))  # year part of the joining date
    .groupBy("JoinYear")
    .count()
    .orderBy("JoinYear")
    .show()
)


+--------+-----+
|JoinYear|count|
+--------+-----+
|    2015|   37|
|    2016|   49|
|    2017|   44|
|    2018|   52|
|    2019|   52|
|    2020|   56|
|    2021|   49|
|    2022|   49|
|    2023|   47|
|    2024|   38|
|    2025|   27|
+--------+-----+



In [None]:
# 11. Per-department count of employees whose Status is "Active".
(
    df
    .where(df["Status"] == "Active")
    .groupBy("Department")
    .count()
    .show()
)


+----------+-----+
|Department|count|
+----------+-----+
|     Sales|   32|
|        HR|   37|
|   Finance|   45|
| Marketing|   32|
|        IT|   26|
+----------+-----+



In [None]:
# 12. Average employee age per department.
# GroupedData.avg names its result column "avg(Age)"; rename it to "AverageAge".
(
    df
    .groupBy("Department")
    .avg("Age")
    .withColumnRenamed("avg(Age)", "AverageAge")
    .show()
)


+----------+------------------+
|Department|        AverageAge|
+----------+------------------+
|     Sales|40.535353535353536|
|        HR| 41.46078431372549|
|   Finance| 39.21008403361345|
| Marketing| 41.82222222222222|
|        IT| 38.68888888888889|
+----------+------------------+



In [None]:
# 13. Create another dataset with City and Region, and join it.
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

# One row per distinct city; this is the base of the City -> Region lookup table.
distinct_cities = df.select(df["City"]).distinct()

def infer_region(city):
    """Guess a compass region from a city name.

    The name is lower-cased and searched for the substrings "north", "south",
    "east" and "west", in that order; the first one found decides the region.
    The match is a plain substring test, so the direction may appear anywhere
    in the name.

    Args:
        city: City name as a string, or None (a null City value reaches a
            Python UDF as None).

    Returns:
        "North", "South", "East" or "West", or "Unknown" when the name is
        None or contains none of the four directions.
    """
    # Guard against null cities: calling .lower() on None would raise
    # AttributeError and fail the whole Spark job.
    if city is None:
        return "Unknown"

    lowered = city.lower()
    # Order matters: it fixes which direction wins when several are present.
    for direction in ("north", "south", "east", "west"):
        if direction in lowered:
            return direction.capitalize()
    return "Unknown"

# Wrap the Python function as a Spark UDF that produces a string column.
infer_region_udf = udf(infer_region, returnType=StringType())

# City -> Region lookup table built from the distinct cities.
auto_region_df = distinct_cities.withColumn(
    "Region",
    infer_region_udf(col("City")),
)
auto_region_df.show()

# Left join on City: every employee row is kept and gains its Region.
joined_df = df.join(auto_region_df, "City", "left")
joined_df.select("Name", "City", "Region", "Salary").show()



+----------------+-------+
|            City| Region|
+----------------+-------+
|   Wilsonchester|Unknown|
|     Bradshawton|Unknown|
|       Steventon|Unknown|
|     Lake Alyssa|Unknown|
|      North Lisa|  North|
|    North Marvin|  North|
|     Jenniferton|Unknown|
|     Buckleyview|Unknown|
|     Burtonville|Unknown|
|    Johnsonmouth|Unknown|
|    South Joseph|  South|
|  Lindseychester|Unknown|
|   North Stephen|  North|
|Port Nicoleshire|Unknown|
|    Jerrychester|Unknown|
|  North Jennifer|  North|
|      Laurenstad|Unknown|
|West Brendanbury|   West|
|       Juliaberg|Unknown|
|       New James|Unknown|
+----------------+-------+
only showing top 20 rows

+--------------------+-------------------+-------+------+
|                Name|               City| Region|Salary|
+--------------------+-------------------+-------+------+
|     Charles Johnson|          Allentown|Unknown| 64039|
|       Dylan Camacho|        Anthonyfort|Unknown| 34686|
| Mr. Ryan Bowman Jr.|          Gile

In [None]:
# 14. Average salary per Region, computed on the joined dataset.
from pyspark.sql.functions import avg

(
    joined_df
    .groupBy("Region")
    .agg(avg("Salary").alias("AverageSalary"))
    .show()
)


+-------+-----------------+
| Region|    AverageSalary|
+-------+-----------------+
|Unknown|74605.20170454546|
|  South|71426.63829787234|
|   East|       73085.8125|
|   West|73189.75757575757|
|  North|77350.58333333333|
+-------+-----------------+



In [None]:
# 15. Calculate years of experience for each employee (current date - JoiningDate).
# `datediff` stays imported so it remains available to any cell that uses it.
from pyspark.sql.functions import current_date, datediff, months_between

# Count completed years from calendar months instead of days / 365.
# Dividing a day count by 365 ignores leap days, so an employee a few days
# short of an anniversary was already credited with the extra year.
# months_between follows the calendar, and the int cast drops the fraction.
df = df.withColumn(
    "YearsOfExperience",
    (months_between(current_date(), df.JoiningDate) / 12).cast("int"),
)
df.select("Name", "JoiningDate", "YearsOfExperience").show()


+--------------------+-----------+-----------------+
|                Name|JoiningDate|YearsOfExperience|
+--------------------+-----------+-----------------+
|     Charles Johnson| 2018-07-07|                6|
|       Dylan Camacho| 2015-08-25|                9|
| Mr. Ryan Bowman Jr.| 2025-03-11|                0|
|          Brian Ball| 2015-10-01|                9|
|       Angela Hooper| 2019-08-14|                5|
|Alexander Johnson...| 2016-04-21|                9|
|         Steven Lane| 2021-07-25|                3|
|       Riley Johnson| 2015-08-03|                9|
|    Emily Washington| 2021-11-30|                3|
|     Valerie Fleming| 2019-12-08|                5|
|     Tracy Hughes MD| 2020-06-01|                5|
|    Johnathan Harmon| 2021-03-09|                4|
|       Michael Brown| 2023-10-21|                1|
|       Scott Burnett| 2016-04-25|                9|
|  Christopher Fuller| 2021-04-30|                4|
|         Mary Henson| 2021-08-25|            

In [None]:
# 16. List all employees with more than 5 years of experience.
# YearsOfExperience holds whole years, so this keeps values of 6 and above.
(
    df
    .where(df["YearsOfExperience"] > 5)
    .select("Name", "Age", "Department", "YearsOfExperience")
    .show()
)



+--------------------+---+----------+-----------------+
|                Name|Age|Department|YearsOfExperience|
+--------------------+---+----------+-----------------+
|     Charles Johnson| 52|        HR|                6|
|       Dylan Camacho| 57| Marketing|                9|
|          Brian Ball| 24|     Sales|                9|
|Alexander Johnson...| 45|     Sales|                9|
|       Riley Johnson| 49|        HR|                9|
|       Scott Burnett| 48|     Sales|                9|
|       Brittany Kerr| 58|     Sales|                6|
|         Edwin Burns| 34|     Sales|                9|
|       Mary Reynolds| 25|     Sales|                6|
|           Erin Berg| 42| Marketing|                7|
|         Jason Hines| 59|   Finance|                9|
|Christopher Mcdaniel| 59|        HR|               10|
|      Victoria Kelly| 57|   Finance|                7|
|      Heather Nelson| 48|     Sales|               10|
|         Paul Porter| 53|     Sales|           