In [1]:
from pyspark.sql import SparkSession
# Build the Spark session used by every cell below. getOrCreate() hands back
# an already-running session when one exists rather than starting another.
spark = (
    SparkSession.builder
    .appName("CourseTrackerAnalysis")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [3]:
from google.colab import drive
# Mount Google Drive so the CSV stored there is reachable under /content/drive.
drive.mount('/content/drive')

# Load the employee CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/large_employee_dataset.csv")
)
df.show()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|          Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active|        Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|          Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|       Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|       Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|         Russohaven

Basic Exploration


In [4]:
# Preview the first ten rows of the employee table.
df.show(n=10)

+----------+--------------------+---+----------+------+-----------+--------+------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|        City|
+----------+--------------------+---+----------+------+-----------+--------+------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|   Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active| Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|   Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|  Russohaven|
|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active| East Robert|
|      8278|       Riley Johnson| 49|        HR| 43449| 2015-08-03|Resigned|  New Thomas|
|      852

In [5]:
# Row count of the full dataset (one row per employee record).
employee_count = df.count()
print("Total Employees:", employee_count)

Total Employees: 500


In [6]:
# List every department that appears in the data, once each.
(
    df.select(df["Department"])
    .distinct()
    .show()
)


+----------+
|Department|
+----------+
|     Sales|
|        HR|
|   Finance|
| Marketing|
|        IT|
+----------+



Filtering & Sorting

In [7]:
# Keep only the employees whose department is IT.
in_it_department = df["Department"] == "IT"
df.where(in_it_department).show()


+----------+-------------------+---+----------+------+-----------+--------+------------------+
|EmployeeID|               Name|Age|Department|Salary|JoiningDate|  Status|              City|
+----------+-------------------+---+----------+------+-----------+--------+------------------+
|      6598|        Mary Henson| 58|        IT| 63951| 2021-08-25|  Active|       Port Tricia|
|      8518|   Elizabeth Abbott| 22|        IT| 91732| 2022-11-05|  Active|       Douglasside|
|      9506|        Thomas Dunn| 45|        IT| 90340| 2020-07-12|On Leave|    Lindseychester|
|      9663|        Glenn Mason| 43|        IT|109189| 2020-03-27|On Leave|      Katelynburgh|
|      2106|     Richard Bailey| 45|        IT| 30950| 2021-06-29|Resigned|        North John|
|      8212|      Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave|South Veronicastad|
|      6354|     Nicole Gilmore| 35|        IT|104202| 2018-05-04|  Active|       East Joseph|
|      5716|         David Wang| 49|        IT| 94

In [8]:
# Employees aged 30 to 40; between() includes both endpoints.
df.where(df["Age"].between(30, 40)).show()


+----------+------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|              Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+------------------+---+----------+------+-----------+--------+-------------------+
|      4676|Christopher Fuller| 30|        HR| 63042| 2021-04-30|Resigned|   South Donnaville|
|      4136|     Jerome Torres| 30|   Finance| 68213| 2024-11-30|  Active|North Justinborough|
|      1588|       Edwin Burns| 34|     Sales|108208| 2015-09-14|Resigned|        South David|
|      8074|       Fred Brewer| 30|        HR|100736| 2021-06-06|On Leave|    Port Wendyville|
|      3841|       April Allen| 36|        HR| 98845| 2020-05-20|  Active|      Rachelchester|
|      8212|     Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave| South Veronicastad|
|      3325|       Brian Huynh| 40|   Finance| 59924| 2017-01-02|On Leave|           Johnside|
|      6180|     Robert Cortez| 35| Marketing| 761

In [9]:
# Highest-paid employees first.
highest_salary_first = df["Salary"].desc()
df.sort(highest_salary_first).show()


+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|      8860|       Cody Williams| 30|        IT|119978| 2019-03-16|Resigned|         Susanville|
|      4585|      Sandra Webster| 30|        HR|119940| 2022-10-21|On Leave|       Thompsonport|
|      4667|         Ronald Hunt| 58|     Sales|119677| 2019-08-29|Resigned|    Griffithchester|
|      1602|    Deborah Williams| 25|        HR|119397| 2023-09-26|On Leave|    Port Terrimouth|
|      3374|        Amanda Green| 41|        HR|119316| 2021-04-08|Resigned|    West Shelleyton|
|      6329|       Heidi Shaffer| 36|        HR|119165| 2020-01-14|Resigned|          New Alexa|
|      2428|        Mary Stevens| 55|     Sales|119137| 2022-03-06|On Leave|         Travisport|
|      1545|Brittany Christens

Aggregation Tasks

In [10]:
from pyspark.sql.functions import avg
# Mean salary per department, reported under the column name AvgSalary.
(
    df.groupBy("Department")
    .agg(avg("Salary").alias("AvgSalary"))
    .show()
)


+----------+-----------------+
|Department|        AvgSalary|
+----------+-----------------+
|     Sales|77488.54545454546|
|        HR|76091.27450980392|
|   Finance|72834.75630252101|
| Marketing| 71958.1888888889|
|        IT|73116.25555555556|
+----------+-----------------+



In [11]:
# Number of employees in each employment status.
(
    df.groupBy("Status")
    .count()
    .show()
)


+--------+-----+
|  Status|count|
+--------+-----+
|Resigned|  159|
|  Active|  172|
|On Leave|  169|
+--------+-----+



In [12]:
# Import Spark's aggregate under an alias. A bare
# `from pyspark.sql.functions import max` rebinds the name `max` for the rest
# of the kernel session and hides Python's built-in max().
from pyspark.sql.functions import max as spark_max

# Highest salary recorded in each city, reported as MaxSalary.
(
    df.groupBy("City")
    .agg(spark_max("Salary").alias("MaxSalary"))
    .show()
)


+----------------+---------+
|            City|MaxSalary|
+----------------+---------+
|   Wilsonchester|    67025|
|     Bradshawton|   111116|
|       Steventon|    32009|
|     Lake Alyssa|    84903|
|      North Lisa|    57898|
|    North Marvin|    66252|
|     Jenniferton|    39907|
|     Buckleyview|    50109|
|     Burtonville|    98492|
|    Johnsonmouth|    48799|
|    South Joseph|    52456|
|  Lindseychester|    90340|
|   North Stephen|    91947|
|Port Nicoleshire|    57537|
|    Jerrychester|    53374|
|  North Jennifer|    82486|
|      Laurenstad|    44608|
|West Brendanbury|    90698|
|       Juliaberg|    50170|
|       New James|    54378|
+----------------+---------+
only showing top 20 rows



GroupBy and Analysis

In [13]:
from pyspark.sql.functions import year
# Derive the calendar year from JoiningDate, then count employees per year.
(
    df.withColumn("JoiningYear", year("JoiningDate"))
    .groupBy("JoiningYear")
    .count()
    .show()
)


+-----------+-----+
|JoiningYear|count|
+-----------+-----+
|       2025|   27|
|       2018|   52|
|       2015|   37|
|       2023|   47|
|       2022|   49|
|       2019|   52|
|       2020|   56|
|       2016|   49|
|       2024|   38|
|       2017|   44|
|       2021|   49|
+-----------+-----+



In [14]:
# Restrict to employees whose status is Active, then count them per department.
is_active = df["Status"] == "Active"
(
    df.where(is_active)
    .groupBy("Department")
    .count()
    .show()
)


+----------+-----+
|Department|count|
+----------+-----+
|     Sales|   32|
|        HR|   37|
|   Finance|   45|
| Marketing|   32|
|        IT|   26|
+----------+-----+



In [15]:
# Mean employee age per department, reported under the column name AvgAge.
(
    df.groupBy("Department")
    .agg(avg("Age").alias("AvgAge"))
    .show()
)


+----------+------------------+
|Department|            AvgAge|
+----------+------------------+
|     Sales|40.535353535353536|
|        HR| 41.46078431372549|
|   Finance| 39.21008403361345|
| Marketing| 41.82222222222222|
|        IT| 38.68888888888889|
+----------+------------------+



Joining (Use another DataFrame for mapping)

In [16]:
# Small lookup table mapping a city name to its region.
region_data = [
    ("New York", "East"),
    ("Los Angeles", "West"),
    ("Chicago", "Central"),
    ("Houston", "South"),
    ("San Francisco", "West"),
]
region_columns = ["City", "Region"]

region_df = spark.createDataFrame(region_data, schema=region_columns)

# Left join on City: every employee row is kept, and Region is filled in only
# where the employee's city appears in the lookup table.
# NOTE(review): in the saved output every displayed row has Region = NULL -
# confirm that the cities listed above actually occur in the employee data.
df_joined = df.join(region_df, on="City", how="left")
df_joined.show()


+-------------------+----------+--------------------+---+----------+------+-----------+--------+------+
|               City|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|Region|
+-------------------+----------+--------------------+---+----------+------+-----------+--------+------+
|        East Robert|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active|  NULL|
|          Allentown|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|  NULL|
|         Tonyamouth|      6406|       Patrick Chung| 27|        HR|116423| 2024-07-05|  Active|  NULL|
|       Jenniferfurt|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|  NULL|
|     North Brittany|      8989|       Scott Burnett| 48|     Sales| 93690| 2016-04-25|Resigned|  NULL|
|          Gilesstad|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|  NULL|
|          Port Mark|      3326|       Michael Brown| 28|       

In [17]:
# Mean salary per region, computed on the joined frame.
(
    df_joined.groupBy("Region")
    .agg(avg("Salary").alias("AvgSalary"))
    .show()
)


+------+---------+
|Region|AvgSalary|
+------+---------+
|  NULL|74313.424|
+------+---------+



Date Operations

In [18]:
from pyspark.sql.functions import datediff, current_date

# Tenure in years: days elapsed between JoiningDate and today, divided by 365
# (an approximation that does not account for leap days).
df_exp = df.withColumn(
    "YearsExperience",
    datediff(current_date(), df["JoiningDate"]) / 365,
)
df_exp.select("EmployeeID", "Name", "YearsExperience").show()


+----------+--------------------+-------------------+
|EmployeeID|                Name|    YearsExperience|
+----------+--------------------+-------------------+
|      4128|     Charles Johnson|  6.912328767123288|
|      6094|       Dylan Camacho|  9.780821917808218|
|      5883| Mr. Ryan Bowman Jr.|0.23013698630136986|
|      9146|          Brian Ball|   9.67945205479452|
|      1918|       Angela Hooper|  5.808219178082192|
|      4600|Alexander Johnson...|  9.123287671232877|
|      6253|         Steven Lane| 3.8602739726027395|
|      8278|       Riley Johnson|  9.841095890410958|
|      8520|    Emily Washington| 3.5095890410958903|
|      1298|     Valerie Fleming|   5.49041095890411|
|      5157|     Tracy Hughes MD|  5.008219178082192|
|      7403|    Johnathan Harmon|  4.238356164383561|
|      3326|       Michael Brown| 1.6191780821917807|
|      8989|       Scott Burnett|  9.112328767123287|
|      4676|  Christopher Fuller|  4.095890410958904|
|      6598|         Mary He

In [19]:
# Employees with more than five years of experience. Filter on the
# YearsExperience column that df_exp already carries instead of re-deriving
# the datediff expression through the parent frame, so the filter cannot drift
# from the column definition.
df_exp.filter(df_exp["YearsExperience"] > 5).show()


+----------+--------------------+---+----------+------+-----------+--------+-----------------+------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|             City|   YearsExperience|
+----------+--------------------+---+----------+------+-----------+--------+-----------------+------------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|        Allentown| 6.912328767123288|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active|      Anthonyfort| 9.780821917808218|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|     Jenniferfurt|  9.67945205479452|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|     Lake Amystad| 5.808219178082192|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|       Russohaven| 9.123287671232877|
|      8278|       Riley Johnson| 49|        HR| 43449| 2015-08-03|Resigned|       New T