In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,broadcast


In [0]:
# Obtain (or reuse) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName('Pyspark Join')
    .getOrCreate()
)

In [0]:
# Rows for the first demo table, one tuple per row.
table1 = [
    (1, 'Alice'),
    (2, 'Bob'),
    (3, 'Charlie'),
]

In [0]:
# Rows for the second demo table, one tuple per row.
table2 = [
    (1, 25),
    (2, 30),
    (4, 35),
]

In [0]:
# Build the first DataFrame from table1 with columns Id and Name.
df1 = spark.createDataFrame(table1, schema=["Id", "Name"])

In [0]:
# Build the second DataFrame from table2 with columns Id and Age.
df2 = spark.createDataFrame(table2, schema=["Id", "Age"])

In [0]:
# Print both input frames, df1 first, so the join results below can be checked by eye.
for frame in (df1, df2):
    frame.show()

+---+-------+
| Id|   Name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+

+---+---+
| Id|Age|
+---+---+
|  1| 25|
|  2| 30|
|  4| 35|
+---+---+



In [0]:
# Inner Join
# Keeps only rows whose Id exists in both frames. Because the join condition is
# a column expression, the result carries the Id column from each side.
inner_join_df = df1.join(
    other=df2,
    on=df1["Id"] == df2["Id"],
    how="Inner",
)
inner_join_df.show()

+---+-----+---+---+
| Id| Name| Id|Age|
+---+-----+---+---+
|  1|Alice|  1| 25|
|  2|  Bob|  2| 30|
+---+-----+---+---+



In [0]:
# Left Join
# Every df1 row is kept; Age is NULL where df2 has no matching Id.
# The select keeps df1's Id and Name plus df2's Age, so only one Id column remains.
Left_join_df = (
    df1.join(df2, on=df1["Id"] == df2["Id"], how="Left")
    .select(df1["Id"], df1["Name"], df2["Age"])
)
Left_join_df.show()

+---+-------+----+
| Id|   Name| Age|
+---+-------+----+
|  1|  Alice|  25|
|  2|    Bob|  30|
|  3|Charlie|NULL|
+---+-------+----+



In [0]:
# Right Join
# Every df2 row is kept; df1's columns are NULL where df1 has no matching Id.
right_join_df = df1.join(
    other=df2,
    on=df1["Id"] == df2["Id"],
    how="right",
)
right_join_df.show()



+----+-----+---+---+
|  Id| Name| Id|Age|
+----+-----+---+---+
|   1|Alice|  1| 25|
|   2|  Bob|  2| 30|
|NULL| NULL|  4| 35|
+----+-----+---+---+



In [0]:
# LEFT SEMI JOIN
# Returns the df1 rows that have a matching Id in df2; only df1's columns appear.
left_semi_join_df = df1.join(
    other=df2,
    on=df1["Id"] == df2["Id"],
    how="left_semi",
)
left_semi_join_df.show()

+---+-----+
| Id| Name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+



In [0]:
# FULL OUTER JOIN
# Keeps the rows of both frames; columns from the side without a match are NULL.
outer_join_df = df1.join(
    other=df2,
    on=df1["Id"] == df2["Id"],
    how="outer",
)
outer_join_df.show()


+----+-------+----+----+
|  Id|   Name|  Id| Age|
+----+-------+----+----+
|   1|  Alice|   1|  25|
|   2|    Bob|   2|  30|
|   3|Charlie|NULL|NULL|
|NULL|   NULL|   4|  35|
+----+-------+----+----+



In [0]:
# BROADCAST JOIN
# Same inner join as before, with df2 wrapped in broadcast() to mark it as the
# side to broadcast.
broadcast_join_df = df1.join(
    other=broadcast(df2),
    on=df1["Id"] == df2["Id"],
    how="inner",
)
broadcast_join_df.show()

+---+-----+---+---+
| Id| Name| Id|Age|
+---+-----+---+---+
|  1|Alice|  1| 25|
|  2|  Bob|  2| 30|
+---+-----+---+---+



In [0]:
# SHUFFLE HASH JOIN
# A plain inner join leaves the physical strategy to the optimizer, so nothing
# guarantees a shuffle hash join. The "shuffle_hash" join hint (Spark 3.0+)
# explicitly requests it for this join.
# Call shuffle_hash_join_df.explain() to confirm the chosen strategy in the plan.
shuffle_hash_join_df = df1.join(df2.hint("shuffle_hash"), df1.Id == df2.Id, "inner")
shuffle_hash_join_df.show()


+---+-----+---+---+
| Id| Name| Id|Age|
+---+-----+---+---+
|  1|Alice|  1| 25|
|  2|  Bob|  2| 30|
+---+-----+---+---+



In [0]:
# SHUFFLE SORT JOIN (sort-merge join)
# Sorting the inputs beforehand does not make Spark choose a sort-merge join;
# the strategy is selected by the optimizer. The "merge" join hint (Spark 3.0+)
# requests it explicitly. The pre-sorted frames are kept so any later cell that
# refers to df1_sorted / df2_sorted continues to work.
df1_sorted = df1.sort("Id")
df2_sorted = df2.sort("Id")
shuffle_sort_join_df = df1_sorted.join(
    df2_sorted.hint("merge"),
    df1_sorted.Id == df2_sorted.Id,
    "inner",
)
# Joins are lazy: without an action such as show() the join is never executed.
shuffle_sort_join_df.show()

In [0]:
# LEFT ANTI JOIN
# Returns the df1 rows whose Id has no match in df2; only df1's columns appear.
left_anti_join_df = df1.join(
    other=df2,
    on=df1["Id"] == df2["Id"],
    how="left_anti",
)
left_anti_join_df.show()


+---+-------+
| Id|   Name|
+---+-------+
|  3|Charlie|
+---+-------+

