In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("CombiningData").getOrCreate()

# Employee Data
employee_data = [
    ("Ananya", "HR", 50000),
    ("Rahul", "Engineering", 70000),
    ("Priya", "Engineering", 65000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 52000),
    ("Naveen", "Engineering", 72000),
    ("Fatima", "Marketing", 46000)
]
columns_emp = ["Name", "Department", "Salary"]
df_emp = spark.createDataFrame(employee_data, columns_emp)

# Performance Data
performance_data = [
    ("Ananya", 4.5),
    ("Rahul", 4.8),
    ("Priya", 4.1),
    ("Zoya", 3.9),
    ("Karan", 4.2),
    ("Naveen", 4.9),
    ("Fatima", 3.7)
]
columns_perf = ["Name", "Rating"]
df_perf = spark.createDataFrame(performance_data, columns_perf)

# Project Data
project_data = [
    ("Ananya", "HR Portal", 120),
    ("Rahul", "Data Platform", 200),
    ("Priya", "Data Platform", 180),
    ("Zoya", "Campaign Tracker", 100),
    ("Karan", "HR Portal", 130),
    ("Naveen", "ML Pipeline", 220),
    ("Fatima", "Campaign Tracker", 90)
]
columns_proj = ["Name", "Project", "HoursWorked"]
df_proj = spark.createDataFrame(project_data, columns_proj)


#Joins and Advanced Aggregations

In [2]:
# 1. Join employee_data, performance_data, and project_data
df_joined = df_emp.join(df_perf, "Name").join(df_proj, "Name")
df_joined.show()

# 2. Compute total hours worked per department
df_joined.groupBy("Department").agg(sum("HoursWorked").alias("TotalHoursWorked")).show()

# 3. Compute average rating per project
df_joined.groupBy("Project").agg(avg("Rating").alias("AvgRating")).show()


+------+-----------+------+------+----------------+-----------+
|  Name| Department|Salary|Rating|         Project|HoursWorked|
+------+-----------+------+------+----------------+-----------+
|Ananya|         HR| 50000|   4.5|       HR Portal|        120|
| Priya|Engineering| 65000|   4.1|   Data Platform|        180|
| Rahul|Engineering| 70000|   4.8|   Data Platform|        200|
|Naveen|Engineering| 72000|   4.9|     ML Pipeline|        220|
|Fatima|  Marketing| 46000|   3.7|Campaign Tracker|         90|
|  Zoya|  Marketing| 48000|   3.9|Campaign Tracker|        100|
| Karan|         HR| 52000|   4.2|       HR Portal|        130|
+------+-----------+------+------+----------------+-----------+

+-----------+----------------+
| Department|TotalHoursWorked|
+-----------+----------------+
|Engineering|             600|
|         HR|             250|
|  Marketing|             190|
+-----------+----------------+

+----------------+-----------------+
|         Project|        AvgRating|
+--

#  Handling Missing Data (introduce some manually)

In [5]:
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Define schema for performance_data
schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Rating", DoubleType(), True)
])

# 4. Add a row with None rating
new_row = spark.createDataFrame([("John", None)], schema=schema)

# Union with existing df_perf
df_perf_updated = df_perf.union(new_row)
df_perf_updated.show()


# 5. Filter rows with null values
df_perf_updated.filter(col("Rating").isNull()).show()

# 6. Replace null ratings with the department average
df_perf_with_dept = df_perf_updated.join(df_emp, "Name", "left")

avg_dept_rating = df_perf_with_dept.groupBy("Department") \
    .agg(avg("Rating").alias("AvgDeptRating"))

df_filled = df_perf_with_dept.join(avg_dept_rating, "Department", "left") \
    .withColumn("FinalRating", when(col("Rating").isNull(), col("AvgDeptRating")).otherwise(col("Rating"))) \
    .select("Name", "Department", "FinalRating")

df_filled.show()


+------+------+
|  Name|Rating|
+------+------+
|Ananya|   4.5|
| Rahul|   4.8|
| Priya|   4.1|
|  Zoya|   3.9|
| Karan|   4.2|
|Naveen|   4.9|
|Fatima|   3.7|
|  John|  NULL|
+------+------+

+----+------+
|Name|Rating|
+----+------+
|John|  NULL|
+----+------+

+------+-----------+-----------+
|  Name| Department|FinalRating|
+------+-----------+-----------+
|Ananya|         HR|        4.5|
| Priya|Engineering|        4.1|
| Rahul|Engineering|        4.8|
|Naveen|Engineering|        4.9|
|Fatima|  Marketing|        3.7|
|  Zoya|  Marketing|        3.9|
| Karan|         HR|        4.2|
|  John|       NULL|       NULL|
+------+-----------+-----------+



#Built-In Functions and UDF

In [7]:
from pyspark.sql.functions import udf, when, col
from pyspark.sql.types import IntegerType

# 7. Create a column PerformanceCategory
df_cat = df_joined.withColumn("PerformanceCategory",
    when(col("Rating") >= 4.7, "Excellent")
    .when((col("Rating") >= 4.0) & (col("Rating") < 4.7), "Good")
    .otherwise("Average"))
df_cat.select("Name", "Rating", "PerformanceCategory").show()

# 8. Create a UDF to assign bonus: If project hours > 200 → 10,000 else → 5,000
def assign_bonus(hours):
    return 10000 if hours > 200 else 5000

bonus_udf = udf(assign_bonus, IntegerType())

df_bonus = df_joined.withColumn("Bonus", bonus_udf(col("HoursWorked")))
df_bonus.select("Name", "HoursWorked", "Bonus").show()


+------+------+-------------------+
|  Name|Rating|PerformanceCategory|
+------+------+-------------------+
|Ananya|   4.5|               Good|
| Priya|   4.1|               Good|
| Rahul|   4.8|          Excellent|
|Naveen|   4.9|          Excellent|
|Fatima|   3.7|            Average|
|  Zoya|   3.9|            Average|
| Karan|   4.2|               Good|
+------+------+-------------------+

+------+-----------+-----+
|  Name|HoursWorked|Bonus|
+------+-----------+-----+
|Ananya|        120| 5000|
| Priya|        180| 5000|
| Rahul|        200| 5000|
|Naveen|        220|10000|
|Fatima|         90| 5000|
|  Zoya|        100| 5000|
| Karan|        130| 5000|
+------+-----------+-----+



# Date and Time Functions

In [8]:
# 9. Add a column JoinDate with 2021-06-01 and calculate MonthsWorked
df_with_date = df_joined.withColumn("JoinDate", to_date(lit("2021-06-01")))
df_with_date = df_with_date.withColumn("MonthsWorked", months_between(current_date(), col("JoinDate")).cast("int"))
df_with_date.select("Name", "JoinDate", "MonthsWorked").show()

# 10. Calculate how many employees joined before 2022
count_before_2022 = df_with_date.filter(col("JoinDate") < "2022-01-01").select("Name").distinct().count()
print(f"Employees who joined before 2022: {count_before_2022}")


+------+----------+------------+
|  Name|  JoinDate|MonthsWorked|
+------+----------+------------+
|Ananya|2021-06-01|          48|
| Priya|2021-06-01|          48|
| Rahul|2021-06-01|          48|
|Naveen|2021-06-01|          48|
|Fatima|2021-06-01|          48|
|  Zoya|2021-06-01|          48|
| Karan|2021-06-01|          48|
+------+----------+------------+

Employees who joined before 2022: 7


#Unions


In [9]:
# 11. Create another small team DataFrame and union it with employee_data
extra_employees = [
    ("Meena", "HR", 48000),
    ("Raj", "Marketing", 51000)
]
df_extra = spark.createDataFrame(extra_employees, columns_emp)
df_union = df_emp.union(df_extra)
df_union.show()


+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 50000|
| Rahul|Engineering| 70000|
| Priya|Engineering| 65000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 52000|
|Naveen|Engineering| 72000|
|Fatima|  Marketing| 46000|
| Meena|         HR| 48000|
|   Raj|  Marketing| 51000|
+------+-----------+------+



#  Saving Results

In [10]:
# 12. Save the final merged dataset (all 3 joins) as a partitioned Parquet file based on Department
final_df = df_joined.select("Name", "Department", "Salary", "Rating", "Project", "HoursWorked")
final_df.write.partitionBy("Department").parquet("/path/to/output/final_data.parquet", mode="overwrite")
