In [1]:
# Build (or reuse, if one already exists) a SparkSession running in local mode.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("new")
    .getOrCreate()
)

# Last expression of the cell: renders the session summary in the notebook.
spark

In [3]:
# Column names, in the same order as the fields of each tuple below.
employee_columns = ["EmployeeID", "Name", "DepartmentID", "Salary"]

# Sample employee records.
employee_data = [
    (1, "John Doe", 101, 50000),
    (2, "Jane Smith", 102, 60000),
    (3, "Sam Brown", 101, 55000),
    (4, "Emily White", 103, 70000),
    (5, "Chris Green", 102, 65000),
]

# Only names are supplied as the schema, so Spark infers the column types.
employee_df = spark.createDataFrame(employee_data, schema=employee_columns)

In [4]:
# Print the employee rows as a text table (defaults spelled out: up to 20 rows, long cells truncated).
employee_df.show(n=20, truncate=True)

+----------+-----------+------------+------+
|EmployeeID|       Name|DepartmentID|Salary|
+----------+-----------+------------+------+
|         1|   John Doe|         101| 50000|
|         2| Jane Smith|         102| 60000|
|         3|  Sam Brown|         101| 55000|
|         4|Emily White|         103| 70000|
|         5|Chris Green|         102| 65000|
+----------+-----------+------------+------+



In [5]:
# Column names, in the same order as the fields of each tuple below.
department_columns = ["DepartmentID", "DepartmentName"]

# Sample department records; DepartmentID matches the key used in employee_data.
department_data = [
    (101, "HR"),
    (102, "Engineering"),
    (103, "Sales"),
]

# Build the Department DataFrame (column types are inferred) and print it.
department_df = spark.createDataFrame(department_data, schema=department_columns)
department_df.show(n=20, truncate=True)


+------------+--------------+
|DepartmentID|DepartmentName|
+------------+--------------+
|         101|            HR|
|         102|   Engineering|
|         103|         Sales|
+------------+--------------+



In [6]:
# Read the partition count of each DataFrame from its underlying RDD.
num_partitions_employee = employee_df.rdd.getNumPartitions()
num_partitions_department = department_df.rdd.getNumPartitions()

# Report both counts, employee first.
print(f"Number of partitions in employee DataFrame: {num_partitions_employee}")
print(f"Number of partitions in department DataFrame: {num_partitions_department}")


Number of partitions in employee DataFrame: 10
Number of partitions in department DataFrame: 10


In [7]:
# Inner equi-join on the shared key; joining by column name keeps a single
# DepartmentID column in the result.
joinedf = employee_df.join(
    department_df,
    on="DepartmentID",
    how="inner",
)

In [8]:
# Print the joined employee/department rows (defaults spelled out: up to 20 rows, long cells truncated).
joinedf.show(n=20, truncate=True)

+------------+----------+-----------+------+--------------+
|DepartmentID|EmployeeID|       Name|Salary|DepartmentName|
+------------+----------+-----------+------+--------------+
|         101|         1|   John Doe| 50000|            HR|
|         101|         3|  Sam Brown| 55000|            HR|
|         102|         2| Jane Smith| 60000|   Engineering|
|         102|         5|Chris Green| 65000|   Engineering|
|         103|         4|Emily White| 70000|         Sales|
+------------+----------+-----------+------+--------------+



In [9]:
from pyspark.sql import functions as F
from pyspark.sql import Window

# Number the employees of each department from lowest to highest salary.
#
# Salary alone is not a unique sort key: when two employees of the same
# department earn the same amount, row_number() may number them in either
# order, so which one gets row == 1 could change between runs. EmployeeID is
# added as a secondary sort key so the numbering is deterministic.
window_spec = Window.partitionBy("DepartmentID").orderBy("Salary", "EmployeeID")

# Adds a 1-based "row" column; row == 1 is the lowest-paid employee of each department.
row_ = joinedf.withColumn("row", F.row_number().over(window_spec))

In [10]:
# Print the rows together with their per-department row number (defaults spelled out).
row_.show(n=20, truncate=True)

+------------+----------+-----------+------+--------------+---+
|DepartmentID|EmployeeID|       Name|Salary|DepartmentName|row|
+------------+----------+-----------+------+--------------+---+
|         101|         1|   John Doe| 50000|            HR|  1|
|         101|         3|  Sam Brown| 55000|            HR|  2|
|         102|         2| Jane Smith| 60000|   Engineering|  1|
|         102|         5|Chris Green| 65000|   Engineering|  2|
|         103|         4|Emily White| 70000|         Sales|  1|
+------------+----------+-----------+------+--------------+---+



In [11]:
# Keep only the first-numbered row of each department, then print the result.
# The name row_ is rebound here, so later cells see the filtered DataFrame.
row_ = row_.where(F.col("row") == 1)
row_.show(n=20, truncate=True)

+------------+----------+-----------+------+--------------+---+
|DepartmentID|EmployeeID|       Name|Salary|DepartmentName|row|
+------------+----------+-----------+------+--------------+---+
|         101|         1|   John Doe| 50000|            HR|  1|
|         102|         2| Jane Smith| 60000|   Engineering|  1|
|         103|         4|Emily White| 70000|         Sales|  1|
+------------+----------+-----------+------+--------------+---+



In [12]:
# Print the filtered rows again (same DataFrame as displayed by the previous cell).
row_.show(n=20, truncate=True)

+------------+----------+-----------+------+--------------+---+
|DepartmentID|EmployeeID|       Name|Salary|DepartmentName|row|
+------------+----------+-----------+------+--------------+---+
|         101|         1|   John Doe| 50000|            HR|  1|
|         102|         2| Jane Smith| 60000|   Engineering|  1|
|         103|         4|Emily White| 70000|         Sales|  1|
+------------+----------+-----------+------+--------------+---+

