In [69]:
from pyspark.sql import SparkSession
import numpy as np
from pyspark.sql import functions as F

In [4]:
spark = SparkSession.builder.appName("deo").getOrCreate()

In [5]:
spark

In [18]:
df = spark.read.csv("/content/drive/MyDrive/Datasets/Copy of Copy of large_employee_dataset.csv", inferSchema=True, header=True)

In [19]:
df.show()

+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|          Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active|        Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|          Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|       Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|       Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|         Russohaven|
|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active|        East Robert|
|      8278|       Riley Johns

# **Basic Exploration**

In [20]:
#1. Show the top 10 rows of the dataset.
df.show(10)

+----------+--------------------+---+----------+------+-----------+--------+------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|        City|
+----------+--------------------+---+----------+------+-----------+--------+------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|   Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active| Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|   Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|  Russohaven|
|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active| East Robert|
|      8278|       Riley Johnson| 49|        HR| 43449| 2015-08-03|Resigned|  New Thomas|
|      852

In [34]:
#2. Count the total number of employees.
df.select("EmployeeID").distinct().count()

500

In [35]:
#3. Display unique departments.
df.select("Department").distinct().show()

+----------+
|Department|
+----------+
|     Sales|
|        HR|
|   Finance|
| Marketing|
|        IT|
+----------+



# **Filtering and Sorting**

In [36]:
# 4. Filter all employees in the "IT" department.

df.filter(df.Department == "IT").select(["Name", "Department"]).show()

+-------------------+----------+
|               Name|Department|
+-------------------+----------+
|        Mary Henson|        IT|
|   Elizabeth Abbott|        IT|
|        Thomas Dunn|        IT|
|        Glenn Mason|        IT|
|     Richard Bailey|        IT|
|      Jacob Jackson|        IT|
|     Nicole Gilmore|        IT|
|         David Wang|        IT|
|       Joseph Clark|        IT|
|      Debra Swanson|        IT|
|      Jeffrey Frank|        IT|
|Christopher Jimenez|        IT|
|     Kevin Harrison|        IT|
|    Andrew Harrison|        IT|
|        Melissa Lee|        IT|
|         Paige Hall|        IT|
|        Casey Olson|        IT|
|      Tony Mcdonald|        IT|
|     Jackie Herring|        IT|
|       Shelly James|        IT|
+-------------------+----------+
only showing top 20 rows



In [42]:
#5. Show employees aged between 30 and 40.
df.filter((df.Age >= 30) & (df.Age <= 40)).show()

+----------+------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|              Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+------------------+---+----------+------+-----------+--------+-------------------+
|      4676|Christopher Fuller| 30|        HR| 63042| 2021-04-30|Resigned|   South Donnaville|
|      4136|     Jerome Torres| 30|   Finance| 68213| 2024-11-30|  Active|North Justinborough|
|      1588|       Edwin Burns| 34|     Sales|108208| 2015-09-14|Resigned|        South David|
|      8074|       Fred Brewer| 30|        HR|100736| 2021-06-06|On Leave|    Port Wendyville|
|      3841|       April Allen| 36|        HR| 98845| 2020-05-20|  Active|      Rachelchester|
|      8212|     Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave| South Veronicastad|
|      3325|       Brian Huynh| 40|   Finance| 59924| 2017-01-02|On Leave|           Johnside|
|      6180|     Robert Cortez| 35| Marketing| 761

In [44]:
# 6. Sort employees by Salary in descending order.
df.sort(df.Salary.desc()).show()

+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|      8860|       Cody Williams| 30|        IT|119978| 2019-03-16|Resigned|         Susanville|
|      4585|      Sandra Webster| 30|        HR|119940| 2022-10-21|On Leave|       Thompsonport|
|      4667|         Ronald Hunt| 58|     Sales|119677| 2019-08-29|Resigned|    Griffithchester|
|      1602|    Deborah Williams| 25|        HR|119397| 2023-09-26|On Leave|    Port Terrimouth|
|      3374|        Amanda Green| 41|        HR|119316| 2021-04-08|Resigned|    West Shelleyton|
|      6329|       Heidi Shaffer| 36|        HR|119165| 2020-01-14|Resigned|          New Alexa|
|      2428|        Mary Stevens| 55|     Sales|119137| 2022-03-06|On Leave|         Travisport|
|      1545|Brittany Christens

# **Aggregation Tasks**

In [46]:
# 7. Get the average salary by department.
df.groupBy("Department").agg(F.avg(df.Salary).alias("Average Salary")).show()

+----------+-----------------+
|Department|   Average Salary|
+----------+-----------------+
|     Sales|77488.54545454546|
|        HR|76091.27450980392|
|   Finance|72834.75630252101|
| Marketing| 71958.1888888889|
|        IT|73116.25555555556|
+----------+-----------------+



In [47]:
# 8. Count of employees by Status.
df.groupBy("Status").agg(F.count(df.EmployeeID).alias("Emp Count")).show()

+--------+---------+
|  Status|Emp Count|
+--------+---------+
|Resigned|      159|
|  Active|      172|
|On Leave|      169|
+--------+---------+



In [50]:
# 9. Highest salary in each city.
df.groupBy("City").agg(F.max(df.Salary).alias("Highest Salary")).sort("Highest Salary", ascending=False).show()

+-------------------+--------------+
|               City|Highest Salary|
+-------------------+--------------+
|         Susanville|        119978|
|       Thompsonport|        119940|
|    Griffithchester|        119677|
|    Port Terrimouth|        119397|
|    West Shelleyton|        119316|
|          New Alexa|        119165|
|         Travisport|        119137|
|        Matthewside|        119038|
|           Toniside|        119009|
|       West Matthew|        118992|
|        Leslieville|        118959|
|          Welchview|        118682|
|    South Elizabeth|        118641|
|     New Howardfort|        118584|
|Lake Michaelchester|        118504|
|        Michaelstad|        118225|
|         South Mark|        118187|
|     East Daisystad|        118172|
|          Kellyfort|        117517|
|         Port Brian|        117237|
+-------------------+--------------+
only showing top 20 rows



# **GroupBy and Analysis**

In [52]:
# 10. Total number of employees who joined each year.
df.withColumn("year", F.year(df.JoiningDate)).groupby("year").agg(F.count("year").alias("Per Year")).show()

+----+--------+
|year|Per Year|
+----+--------+
|2025|      27|
|2018|      52|
|2015|      37|
|2023|      47|
|2022|      49|
|2019|      52|
|2020|      56|
|2016|      49|
|2024|      38|
|2017|      44|
|2021|      49|
+----+--------+



In [53]:
# 11. Department-wise count of employees who are currently "Active".
df.filter(df.Status == "Active").groupBy("Department").agg(F.count("EmployeeID").alias("Active Employees")).show()

+----------+----------------+
|Department|Active Employees|
+----------+----------------+
|     Sales|              32|
|        HR|              37|
|   Finance|              45|
| Marketing|              32|
|        IT|              26|
+----------+----------------+



In [56]:
# 12. Average age of employees per department.
df.groupby("Department").agg(F.avg("Age").alias("Average Age")).show()

+----------+------------------+
|Department|       Average Age|
+----------+------------------+
|     Sales|40.535353535353536|
|        HR| 41.46078431372549|
|   Finance| 39.21008403361345|
| Marketing| 41.82222222222222|
|        IT| 38.68888888888889|
+----------+------------------+



# **Joining**

In [64]:
# 13. Create another dataset with City and Region , and join it.
regions = [
    {"City": "Allentown", "Region": "East"},
    {"City": "Anthonyfort", "Region": "South"},
    {"City": "Gilesstad", "Region": "North"},
    {"City": "Jenniferfurt", "Region": "West"},
    {"City": "Lake Amystad", "Region": "East"}
]
regions_df = spark.createDataFrame(regions)

joined_df = df.join(regions_df, on="City", how="left")
joined_df.show()


+-------------------+----------+--------------------+---+----------+------+-----------+--------+------+
|               City|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|Region|
+-------------------+----------+--------------------+---+----------+------+-----------+--------+------+
|        East Robert|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active|  NULL|
|          Allentown|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|  East|
|         Tonyamouth|      6406|       Patrick Chung| 27|        HR|116423| 2024-07-05|  Active|  NULL|
|       Jenniferfurt|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|  West|
|     North Brittany|      8989|       Scott Burnett| 48|     Sales| 93690| 2016-04-25|Resigned|  NULL|
|          Gilesstad|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave| North|
|          Port Mark|      3326|       Michael Brown| 28|       

In [65]:
# 14. Group salaries by Region after the join.
joined_df.groupBy("Region").agg(F.sum("Salary").alias("Total Salary")).show()

+------+------------+
|Region|Total Salary|
+------+------------+
|  NULL|    36796842|
| South|       34686|
|  East|      172812|
|  West|       87831|
| North|       64541|
+------+------------+



# **Date Operations**

In [76]:
# 15. Calculate years of experience for each employee (current date - JoiningDate).

new_df = df.withColumn("Experience", (F.datediff(F.current_date(), df.JoiningDate)/ 365))
new_df.show()

+----------+--------------------+---+----------+------+-----------+--------+-------------------+-------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|         Experience|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+-------------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|          Allentown|  6.912328767123288|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active|        Anthonyfort|  9.780821917808218|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|          Gilesstad|0.23013698630136986|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|       Jenniferfurt|   9.67945205479452|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|       Lake Amystad|  5.808219178082192|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-

In [77]:
# 16. List all employees with more than 5 years of experience.
new_df.filter(new_df.Experience >= 5).show()

+----------+--------------------+---+----------+------+-----------+--------+-----------------+------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|             City|        Experience|
+----------+--------------------+---+----------+------+-----------+--------+-----------------+------------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|        Allentown| 6.912328767123288|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active|      Anthonyfort| 9.780821917808218|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|     Jenniferfurt|  9.67945205479452|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|     Lake Amystad| 5.808219178082192|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|       Russohaven| 9.123287671232877|
|      8278|       Riley Johnson| 49|        HR| 43449| 2015-08-03|Resigned|       New T