# Create a Spark session

In [6]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: getOrCreate() reuses an already-active
# session when there is one, otherwise it starts a new one with this app name.
spark = (
    SparkSession.builder
    .appName("JoinExample")
    .getOrCreate()
)

# Sample data for DataFrame df1

In [7]:

# Left-hand side of every join below: (id, name) records.
# ids 3 and 6 have no counterpart in df2, so they exercise the "unmatched left row" cases.
columns1 = ["id", "name"]
data1 = [
    (1, "Martin"),
    (2, "Sam"),
    (3, "Leo"),
    (6, "Justin"),
]
df1 = spark.createDataFrame(data1, schema=columns1)


# Sample data for DataFrame df2

In [8]:

# Right-hand side of every join below: (id, age) records.
# ids 4 and 5 have no counterpart in df1, so they exercise the "unmatched right row" cases.
columns2 = ["id", "age"]
data2 = [
    (1, 25),
    (2, 30),
    (4, 22),
    (5, 23),
]
df2 = spark.createDataFrame(data2, schema=columns2)

# Display the DataFrames

In [9]:

# Label the output, then render df1 as a text table.
# n=20 and truncate=True are show()'s defaults, spelled out here for the reader.
print("DataFrame 1:")
df1.show(n=20, truncate=True)


DataFrame 1:
+---+------+
| id|  name|
+---+------+
|  1|Martin|
|  2|   Sam|
|  3|   Leo|
|  6|Justin|
+---+------+



In [10]:

# Label the output, then render df2 as a text table.
# n=20 and truncate=True are show()'s defaults, spelled out here for the reader.
print("DataFrame 2:")
df2.show(n=20, truncate=True)

DataFrame 2:
+---+---+
| id|age|
+---+---+
|  1| 25|
|  2| 30|
|  4| 22|
|  5| 23|
+---+---+



# INNER JOIN

In [11]:
# Inner join: keep only the rows whose id is present in BOTH DataFrames.
# Joining on a column expression (instead of on="id") keeps both id columns
# in the result, which is why "id" appears twice in the output.
inner_join_df = df1.join(
    other=df2,
    on=df1["id"] == df2["id"],
    how="inner",
)
print("INNER JOIN:")
inner_join_df.show()

INNER JOIN:
+---+------+---+---+
| id|  name| id|age|
+---+------+---+---+
|  1|Martin|  1| 25|
|  2|   Sam|  2| 30|
+---+------+---+---+



# LEFT OUTER JOIN

In [12]:

# Left outer join: every row of df1 is kept; df2's columns are NULL
# for the df1 rows that have no matching id in df2.
left_outer_join_df = df1.join(
    other=df2,
    on=df1["id"] == df2["id"],
    how="left_outer",
)
print("LEFT OUTER JOIN:")
left_outer_join_df.show()

LEFT OUTER JOIN:
+---+------+----+----+
| id|  name|  id| age|
+---+------+----+----+
|  1|Martin|   1|  25|
|  2|   Sam|   2|  30|
|  3|   Leo|NULL|NULL|
|  6|Justin|NULL|NULL|
+---+------+----+----+



# RIGHT OUTER JOIN

In [13]:

# Right outer join: every row of df2 is kept; df1's columns are NULL
# for the df2 rows that have no matching id in df1.
right_outer_join_df = df1.join(
    other=df2,
    on=df1["id"] == df2["id"],
    how="right_outer",
)
print("RIGHT OUTER JOIN:")
right_outer_join_df.show()

RIGHT OUTER JOIN:
+----+------+---+---+
|  id|  name| id|age|
+----+------+---+---+
|   1|Martin|  1| 25|
|   2|   Sam|  2| 30|
|NULL|  NULL|  4| 22|
|NULL|  NULL|  5| 23|
+----+------+---+---+



# LEFT ANTI JOIN

In [14]:

# Left anti join: the rows of df1 that have NO matching id in df2.
# Only df1's columns are returned; df2 acts purely as a filter.
left_anti_join_df = df1.join(
    other=df2,
    on=df1["id"] == df2["id"],
    how="left_anti",
)
print("LEFT ANTI JOIN:")
left_anti_join_df.show()


LEFT ANTI JOIN:
+---+------+
| id|  name|
+---+------+
|  3|   Leo|
|  6|Justin|
+---+------+



# LEFT SEMI JOIN

In [15]:

# Left semi join: the rows of df1 that DO have a matching id in df2.
# Only df1's columns are returned; df2 acts purely as a filter.
left_semi_join_df = df1.join(
    other=df2,
    on=df1["id"] == df2["id"],
    how="left_semi",
)
print("LEFT SEMI JOIN:")
left_semi_join_df.show()

LEFT SEMI JOIN:
+---+------+
| id|  name|
+---+------+
|  1|Martin|
|  2|   Sam|
+---+------+



# FULL OUTER JOIN

In [16]:

# Full outer join ("outer" is the join-type name used here): rows from both
# DataFrames are kept, with NULLs filling the side that has no matching id.
full_outer_join_df = df1.join(
    other=df2,
    on=df1["id"] == df2["id"],
    how="outer",
)
print("FULL OUTER JOIN:")
full_outer_join_df.show()



FULL OUTER JOIN:
+----+------+----+----+
|  id|  name|  id| age|
+----+------+----+----+
|   1|Martin|   1|  25|
|   2|   Sam|   2|  30|
|   3|   Leo|NULL|NULL|
|NULL|  NULL|   4|  22|
|NULL|  NULL|   5|  23|
|   6|Justin|NULL|NULL|
+----+------+----+----+



# Stop the Spark session

In [17]:

# Shut down the Spark session now that the examples are done; cells that use
# `spark`, `df1` or `df2` will not work again until the session cell is re-run.
spark.stop()