In [1]:
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException

In [2]:
# Obtain the Spark entry points used by every cell below: `sc` builds the
# RDDs and `ss` turns them into DataFrames. Both calls are `getOrCreate`,
# so re-running this cell does not rebind to a different handle.
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

In [3]:
# Left-hand sample data: one (name, age) tuple per person.
d = sc.parallelize([
    ("Alice", 18),
    ("Bob", 20),
    ("Tom", 40),
])

In [4]:
# Right-hand sample data: one (name, height) tuple per person.
# The second record has a None name, so the join examples below also show
# what happens to a row whose join key is null.
d2 = sc.parallelize([
    ("Bob", 85),
    (None, 80),
])

In [5]:
# Schema for the (name, age) records; both columns are nullable.
schema1 = StructType([
    StructField(name="name", dataType=StringType(), nullable=True),
    StructField(name="age", dataType=IntegerType(), nullable=True),
])

In [6]:
# Schema for the (name, height) records; both columns are nullable, which
# the None name in the height data relies on.
schema2 = StructType([
    StructField(name="name", dataType=StringType(), nullable=True),
    StructField(name="height", dataType=IntegerType(), nullable=True),
])

In [7]:
# Pair each RDD with its explicit schema to get the two DataFrames that
# every join example below operates on.
df = ss.createDataFrame(data=d, schema=schema1)
df2 = ss.createDataFrame(data=d2, schema=schema2)

In [8]:
# Print both input tables so the join results below can be compared
# against them: `df` holds (name, age), `df2` holds (name, height).
df.show()
df2.show()

+-----+---+
| name|age|
+-----+---+
|Alice| 18|
|  Bob| 20|
|  Tom| 40|
+-----+---+

+----+------+
|name|height|
+----+------+
| Bob|    85|
|null|    80|
+----+------+



## Inner Join

In [9]:
# With no join type given, joining on a column name behaves as an inner
# join: only names present in both tables are kept.
(
    df
    .join(df2, on='name')
    .show()
)

+----+---+------+
|name|age|height|
+----+---+------+
| Bob| 20|    85|
+----+---+------+



In [None]:
# Same join, but keyed on an explicit column-equality expression instead
# of a shared column name.
(
    df
    .join(df2, on=(df.name == df2.name))
    .show()
)

In [10]:
# Inner join with the join type spelled out.
(
    df
    .join(df2, on='name', how='inner')
    .show()
)

+----+---+------+
|name|age|height|
+----+---+------+
| Bob| 20|    85|
+----+---+------+



## Outer Join

In [11]:
# Full outer join keyed on an expression: unmatched rows from either side
# are kept, and both `name` columns appear in the result. The null-named
# row of df2 matches nothing and comes through on its own.
(
    df
    .join(df2, on=(df.name == df2.name), how='outer')
    .show()
)

+-----+----+----+------+
| name| age|name|height|
+-----+----+----+------+
|  Tom|  40|null|  null|
| null|null|null|    80|
|  Bob|  20| Bob|    85|
|Alice|  18|null|  null|
+-----+----+----+------+



In [12]:
# Full outer join keyed on the column name: same rows as the expression
# form, but the result carries a single merged `name` column.
(
    df
    .join(df2, on='name', how='outer')
    .show()
)

+-----+----+------+
| name| age|height|
+-----+----+------+
|  Tom|  40|  null|
| null|null|    80|
|  Bob|  20|    85|
|Alice|  18|  null|
+-----+----+------+



## Left Outer Join

In [14]:
# Left outer join: every row of `df` is kept; `height` is null where
# `df2` has no matching name.
(
    df
    .join(df2, on='name', how='left_outer')
    .show()
)

+-----+---+------+
| name|age|height|
+-----+---+------+
|  Tom| 40|  null|
|  Bob| 20|    85|
|Alice| 18|  null|
+-----+---+------+



## Right Outer Join

In [15]:
# Right outer join: every row of `df2` is kept, including the null-named
# one; `age` is null where `df` has no matching name.
(
    df
    .join(df2, on='name', how='right_outer')
    .show()
)

+----+----+------+
|name| age|height|
+----+----+------+
|null|null|    80|
| Bob|  20|    85|
+----+----+------+



## Left-Semi Join

In [16]:
# Left-semi join: keeps the rows of `df` that have a match in `df2`, and
# returns only `df`'s own columns (no `height`).
(
    df
    .join(df2, on='name', how='leftsemi')
    .show()
)

+----+---+
|name|age|
+----+---+
| Bob| 20|
+----+---+



In [17]:
# A left-semi join returns only the left DataFrame's columns, so asking
# for df2.height on its result cannot be resolved and Spark raises an
# AnalysisException. The failure is the point of this cell, so catch and
# print it instead of letting it abort a "Restart & Run All".
try:
    df.join(df2, 'name', 'leftsemi').select(df.name, df2.height).show()
except AnalysisException as exc:
    print("AnalysisException: {}".format(exc))

AnalysisException: 'Resolved attribute(s) height#5 missing from name#0,age#1 in operator !Project [name#0, height#5].;;\n!Project [name#0, height#5]\n+- AnalysisBarrier\n      +- Project [name#0, age#1]\n         +- Join LeftSemi, (name#0 = name#4)\n            :- LogicalRDD [name#0, age#1], false\n            +- LogicalRDD [name#4, height#5], false\n'

In [18]:
# Selecting only columns that belong to `df` (the left side) works after a
# left-semi join, because those are exactly the columns the join returns.
df.join(df2, 'name', 'leftsemi').select(df.name, df.age).show() # OK: no error, both columns come from df

+----+---+
|name|age|
+----+---+
| Bob| 20|
+----+---+

