In [0]:
from pyspark.sql.functions import col
data = spark.read.format("delta").load("dbfs:/user/hive/warehouse/export")
# 1. **Filtering data**: Filter employees who have a salary greater than 10000
filtered_data = data.filter(col("salary") > 10000)
print("Filtered Data (salary > 10000):")
filtered_data.show(5)

Filtered Data (salary > 10000):
+---+---------+----------+----------+------+-------------------+-----------+------+
| id|firstName|middleName|  lastName|gender|          birthDate|        ssn|salary|
+---+---------+----------+----------+------+-------------------+-----------+------+
|  1|   Pennie|     Carry|Hirschmann|     F|1955-07-02 04:00:00|981-43-9345| 56172|
|  2|       An|     Amira|    Cowper|     F|1992-02-08 05:00:00|978-97-8086| 40203|
|  3|    Quyen|    Marlen|      Dome|     F|1970-10-11 04:00:00|957-57-8246| 53417|
|  4|  Coralie|  Antonina|   Marshal|     F|1990-04-11 04:00:00|963-39-4885| 94727|
|  5|   Terrie|      Wava|     Bonar|     F|1980-01-16 05:00:00|964-49-8051| 79908|
+---+---------+----------+----------+------+-------------------+-----------+------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import year,current_date
# 2. **Adding a new column**: Calculate age from birthdate
# Get the current year and subtract birth year to calculate age
current_year = year(current_date())
data_with_age = filtered_data.withColumn("age", current_year - year(col("birthDate")))
print("Data with Calculated Age:")
data_with_age.show(5)

Data with Calculated Age:
+---+---------+----------+----------+------+-------------------+-----------+------+---+
| id|firstName|middleName|  lastName|gender|          birthDate|        ssn|salary|age|
+---+---------+----------+----------+------+-------------------+-----------+------+---+
|  1|   Pennie|     Carry|Hirschmann|     F|1955-07-02 04:00:00|981-43-9345| 56172| 69|
|  2|       An|     Amira|    Cowper|     F|1992-02-08 05:00:00|978-97-8086| 40203| 32|
|  3|    Quyen|    Marlen|      Dome|     F|1970-10-11 04:00:00|957-57-8246| 53417| 54|
|  4|  Coralie|  Antonina|   Marshal|     F|1990-04-11 04:00:00|963-39-4885| 94727| 34|
|  5|   Terrie|      Wava|     Bonar|     F|1980-01-16 05:00:00|964-49-8051| 79908| 44|
+---+---------+----------+----------+------+-------------------+-----------+------+---+
only showing top 5 rows



In [0]:
# 3. Adding another new column: Calculate salary after a 20% tax deduction
data_with_tax = data_with_age.withColumn("salary_after_tax", col("salary") * 0.8)
print("Data with Salary After Tax Deduction:")
data_with_tax.show(5)

Data with Salary After Tax Deduction:
+---+---------+----------+----------+------+-------------------+-----------+------+---+------------------+
| id|firstName|middleName|  lastName|gender|          birthDate|        ssn|salary|age|  salary_after_tax|
+---+---------+----------+----------+------+-------------------+-----------+------+---+------------------+
|  1|   Pennie|     Carry|Hirschmann|     F|1955-07-02 04:00:00|981-43-9345| 56172| 69|44937.600000000006|
|  2|       An|     Amira|    Cowper|     F|1992-02-08 05:00:00|978-97-8086| 40203| 32|           32162.4|
|  3|    Quyen|    Marlen|      Dome|     F|1970-10-11 04:00:00|957-57-8246| 53417| 54|42733.600000000006|
|  4|  Coralie|  Antonina|   Marshal|     F|1990-04-11 04:00:00|963-39-4885| 94727| 34|           75781.6|
|  5|   Terrie|      Wava|     Bonar|     F|1980-01-16 05:00:00|964-49-8051| 79908| 44|           63926.4|
+---+---------+----------+----------+------+-------------------+-----------+------+---+------------------+

In [0]:
# 4. Renaming columns: Rename 'ssn' to 'social_security_number' and 'salary' to 'annual_salary'
renamed_data = data_with_tax.withColumnRenamed("ssn", "social_security_number").withColumnRenamed("salary", "annual_salary")
print("Data with Renamed Columns:")
renamed_data.show(5)

Data with Renamed Columns:
+---+---------+----------+----------+------+-------------------+----------------------+-------------+---+------------------+
| id|firstName|middleName|  lastName|gender|          birthDate|social_security_number|annual_salary|age|  salary_after_tax|
+---+---------+----------+----------+------+-------------------+----------------------+-------------+---+------------------+
|  1|   Pennie|     Carry|Hirschmann|     F|1955-07-02 04:00:00|           981-43-9345|        56172| 69|44937.600000000006|
|  2|       An|     Amira|    Cowper|     F|1992-02-08 05:00:00|           978-97-8086|        40203| 32|           32162.4|
|  3|    Quyen|    Marlen|      Dome|     F|1970-10-11 04:00:00|           957-57-8246|        53417| 54|42733.600000000006|
|  4|  Coralie|  Antonina|   Marshal|     F|1990-04-11 04:00:00|           963-39-4885|        94727| 34|           75781.6|
|  5|   Terrie|      Wava|     Bonar|     F|1980-01-16 05:00:00|           964-49-8051|        799

In [0]:
from pyspark.sql.functions import when

# 5. Conditional Columns: Create a new column to categorize salary
categorized_data = renamed_data.withColumn(
    "salary_category",
    when(col("annual_salary") > 50000, "High")
    .when(col("annual_salary") > 20000, "Medium")
    .otherwise("Low"))
print("Data with Salary Categories:")
categorized_data.show(5)

Data with Salary Categories:
+---+---------+----------+----------+------+-------------------+----------------------+-------------+---+------------------+---------------+
| id|firstName|middleName|  lastName|gender|          birthDate|social_security_number|annual_salary|age|  salary_after_tax|salary_category|
+---+---------+----------+----------+------+-------------------+----------------------+-------------+---+------------------+---------------+
|  1|   Pennie|     Carry|Hirschmann|     F|1955-07-02 04:00:00|           981-43-9345|        56172| 69|44937.600000000006|           High|
|  2|       An|     Amira|    Cowper|     F|1992-02-08 05:00:00|           978-97-8086|        40203| 32|           32162.4|         Medium|
|  3|    Quyen|    Marlen|      Dome|     F|1970-10-11 04:00:00|           957-57-8246|        53417| 54|42733.600000000006|           High|
|  4|  Coralie|  Antonina|   Marshal|     F|1990-04-11 04:00:00|           963-39-4885|        94727| 34|           75781.6| 

In [0]:
from pyspark.sql.functions import avg

# 6. GroupBy and Aggregation: Calculate the average salary by gender
salary_by_gender = renamed_data.groupBy("gender").agg(
    avg("annual_salary").alias("avg_salary")
)
print("Average Salary by Gender:")
salary_by_gender.show(5)

Average Salary by Gender:
+------+-----------------+
|gender|       avg_salary|
+------+-----------------+
|     F|72907.42685370741|
+------+-----------------+



In [0]:
# 7. Sorting: Sort the data by salary in descending order
sorted_by_salary = renamed_data.orderBy(col("annual_salary").desc())
print("Data Sorted by Salary (Descending):")
sorted_by_salary.show(5)

Data Sorted by Salary (Descending):
+---+---------+----------+----------+------+-------------------+----------------------+-------------+---+------------------+
| id|firstName|middleName|  lastName|gender|          birthDate|social_security_number|annual_salary|age|  salary_after_tax|
+---+---------+----------+----------+------+-------------------+----------------------+-------------+---+------------------+
| 83|  Markita|   Shellie|Baskeyfied|     F|1960-12-05 05:00:00|           993-66-4078|       134393| 64|107514.40000000001|
|483|     Sean|    Emilia|   Bellino|     F|1980-08-17 04:00:00|           969-11-9490|       128957| 44|          103165.6|
|686|   Karima|    Tamica|     Boden|     F|1958-04-12 05:00:00|           959-39-1344|       126968| 66|101574.40000000001|
|326|  Verlene|   Emogene|    Biford|     F|1981-08-16 04:00:00|           945-80-8061|       126588| 43|101270.40000000001|
|948|  Louella|     Tesha|   Cutford|     F|1970-11-16 05:00:00|           936-15-8338|  

In [0]:
# 8. Selecting specific columns: Select only 'id', 'firstName', 'lastName', and 'salary'
selected_columns = renamed_data.select("id", "firstName", "lastName", "annual_salary")
print("Selected Columns:")
selected_columns.show(5)

Selected Columns:
+---+---------+----------+-------------+
| id|firstName|  lastName|annual_salary|
+---+---------+----------+-------------+
|  1|   Pennie|Hirschmann|        56172|
|  2|       An|    Cowper|        40203|
|  3|    Quyen|      Dome|        53417|
|  4|  Coralie|   Marshal|        94727|
|  5|   Terrie|     Bonar|        79908|
+---+---------+----------+-------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import date_diff

# 9. Date Difference: Calculate the number of days since the employee's birthdate
data_with_days_since_birth = renamed_data.withColumn(
    "days_since_birth", date_diff(current_date(), col("birthDate"))
)
print("Data with Days Since Birth:")
data_with_days_since_birth.show(5)


Data with Days Since Birth:
+---+---------+----------+----------+------+-------------------+----------------------+-------------+---+------------------+----------------+
| id|firstName|middleName|  lastName|gender|          birthDate|social_security_number|annual_salary|age|  salary_after_tax|days_since_birth|
+---+---------+----------+----------+------+-------------------+----------------------+-------------+---+------------------+----------------+
|  1|   Pennie|     Carry|Hirschmann|     F|1955-07-02 04:00:00|           981-43-9345|        56172| 69|44937.600000000006|           25362|
|  2|       An|     Amira|    Cowper|     F|1992-02-08 05:00:00|           978-97-8086|        40203| 32|           32162.4|           11992|
|  3|    Quyen|    Marlen|      Dome|     F|1970-10-11 04:00:00|           957-57-8246|        53417| 54|42733.600000000006|           19782|
|  4|  Coralie|  Antonina|   Marshal|     F|1990-04-11 04:00:00|           963-39-4885|        94727| 34|           7578

In [0]:
# 10. Filtering based on age: Filter employees who are older than 30
filtered_by_age = data_with_days_since_birth.filter(col("age") > 30)
print("Filtered Employees Older Than 30:")
filtered_by_age.show(5)

Filtered Employees Older Than 30:
+---+---------+----------+----------+------+-------------------+----------------------+-------------+---+------------------+----------------+
| id|firstName|middleName|  lastName|gender|          birthDate|social_security_number|annual_salary|age|  salary_after_tax|days_since_birth|
+---+---------+----------+----------+------+-------------------+----------------------+-------------+---+------------------+----------------+
|  1|   Pennie|     Carry|Hirschmann|     F|1955-07-02 04:00:00|           981-43-9345|        56172| 69|44937.600000000006|           25362|
|  2|       An|     Amira|    Cowper|     F|1992-02-08 05:00:00|           978-97-8086|        40203| 32|           32162.4|           11992|
|  3|    Quyen|    Marlen|      Dome|     F|1970-10-11 04:00:00|           957-57-8246|        53417| 54|42733.600000000006|           19782|
|  4|  Coralie|  Antonina|   Marshal|     F|1990-04-11 04:00:00|           963-39-4885|        94727| 34|         

In [0]:
# Apply transformations to the data in Notebook 2

# Transformation steps (filter, add columns, etc.) ...

# Final transformed data (filtered_by_age) should be saved to a Delta table in Notebook 2
filtered_by_age.write.format("delta").mode("overwrite").saveAsTable("default.transformed_employees")
