# JOIN()

In [1]:
import pyspark
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists; otherwise start one named "example-join".
spark = SparkSession.builder.appName("example-join").getOrCreate()

In [2]:
# Employee table: one tuple per employee, in the column order given by empColumns.
empColumns = ["emp_id", "name", "emp_dept_id"]

empData = [
    (1, "Smith", 10),
    (2, "Rose", 20),
    (3, "Williams", 10),
    (4, "Jones", 30),
]

empDF = spark.createDataFrame(data=empData, schema=empColumns)
empDF.show()

+------+--------+-----------+
|emp_id|    name|emp_dept_id|
+------+--------+-----------+
|     1|   Smith|         10|
|     2|    Rose|         20|
|     3|Williams|         10|
|     4|   Jones|         30|
+------+--------+-----------+



In [3]:
# Department table: one tuple per department, in the column order given by deptColumns.
deptColumns = ["dept_name", "dept_id"]

deptData = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]

deptDF = spark.createDataFrame(data=deptData, schema=deptColumns)
deptDF.show()

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+



In [4]:
# Address table: one tuple per address, keyed by emp_id.
# emp_id 5 has an address here but no matching row in the employee data.
addColumns = ["emp_id", "addline1", "city", "state"]

addData = [
    (1, "1523 Main St", "SFO", "CA"),
    (2, "3453 Orange St", "SFO", "NY"),
    (3, "34 Warner St", "Jersey", "NJ"),
    (4, "221 Cavalier St", "Newark", "DE"),
    (5, "789 Walnut St", "Sandiago", "CA"),
]

addDF = spark.createDataFrame(data=addData, schema=addColumns)
addDF.show()

+------+---------------+--------+-----+
|emp_id|       addline1|    city|state|
+------+---------------+--------+-----+
|     1|   1523 Main St|     SFO|   CA|
|     2| 3453 Orange St|     SFO|   NY|
|     3|   34 Warner St|  Jersey|   NJ|
|     4|221 Cavalier St|  Newark|   DE|
|     5|  789 Walnut St|Sandiago|   CA|
+------+---------------+--------+-----+



### PySpark Join Two DataFrames

In [5]:
# DataFrame.join(other, on=None, how=None)
# Here `on` is a Column expression, so both emp_id columns are kept in the result.
# how="inner" is the default join type, spelled out for clarity.
empDF.join(addDF, on=(empDF.emp_id == addDF.emp_id), how="inner").show()

+------+--------+-----------+------+---------------+------+-----+
|emp_id|    name|emp_dept_id|emp_id|       addline1|  city|state|
+------+--------+-----------+------+---------------+------+-----+
|     1|   Smith|         10|     1|   1523 Main St|   SFO|   CA|
|     3|Williams|         10|     3|   34 Warner St|Jersey|   NJ|
|     2|    Rose|         20|     2| 3453 Orange St|   SFO|   NY|
|     4|   Jones|         30|     4|221 Cavalier St|Newark|   DE|
+------+--------+-----------+------+---------------+------+-----+



In [6]:
# Avoid the duplicate key column after the join:
# joining on a column *name* (rather than a Column expression) keeps a single emp_id.
empDF.join(addDF, on="emp_id").show()

+------+--------+-----------+---------------+------+-----+
|emp_id|    name|emp_dept_id|       addline1|  city|state|
+------+--------+-----------+---------------+------+-----+
|     1|   Smith|         10|   1523 Main St|   SFO|   CA|
|     3|Williams|         10|   34 Warner St|Jersey|   NJ|
|     2|    Rose|         20| 3453 Orange St|   SFO|   NY|
|     4|   Jones|         30|221 Cavalier St|Newark|   DE|
+------+--------+-----------+---------------+------+-----+



In [7]:
# Join more than two DataFrames by chaining join() calls:
# employees -> addresses (by emp_id name), then -> departments (by expression).
(
    empDF
    .join(addDF, ["emp_id"])
    .join(deptDF, empDF["emp_dept_id"] == deptDF["dept_id"])
    .show()
)

+------+--------+-----------+---------------+------+-----+---------+-------+
|emp_id|    name|emp_dept_id|       addline1|  city|state|dept_name|dept_id|
+------+--------+-----------+---------------+------+-----+---------+-------+
|     1|   Smith|         10|   1523 Main St|   SFO|   CA|  Finance|     10|
|     3|Williams|         10|   34 Warner St|Jersey|   NJ|  Finance|     10|
|     4|   Jones|         30|221 Cavalier St|Newark|   DE|    Sales|     30|
|     2|    Rose|         20| 3453 Orange St|   SFO|   NY|Marketing|     20|
+------+--------+-----------+---------------+------+-----+---------+-------+



In [8]:
# Supplying the join condition through where():
# each join() is called without a condition and the matching predicate
# is applied immediately afterwards with where().
(
    empDF
    .join(deptDF)
    .where(empDF.emp_dept_id == deptDF.dept_id)
    .join(addDF)
    .where(empDF.emp_id == addDF.emp_id)
    .show()
)

+------+--------+-----------+---------+-------+------+---------------+------+-----+
|emp_id|    name|emp_dept_id|dept_name|dept_id|emp_id|       addline1|  city|state|
+------+--------+-----------+---------+-------+------+---------------+------+-----+
|     1|   Smith|         10|  Finance|     10|     1|   1523 Main St|   SFO|   CA|
|     3|Williams|         10|  Finance|     10|     3|   34 Warner St|Jersey|   NJ|
|     2|    Rose|         20|Marketing|     20|     2| 3453 Orange St|   SFO|   NY|
|     4|   Jones|         30|    Sales|     30|     4|221 Cavalier St|Newark|   DE|
+------+--------+-----------+---------+-------+------+---------------+------+-----+



In [9]:
# Supplying the join condition through filter():
# each join() is called without a condition and the matching predicate
# is applied immediately afterwards with filter().
(
    empDF
    .join(deptDF)
    .filter(empDF.emp_dept_id == deptDF.dept_id)
    .join(addDF)
    .filter(empDF.emp_id == addDF.emp_id)
    .show()
)

+------+--------+-----------+---------+-------+------+---------------+------+-----+
|emp_id|    name|emp_dept_id|dept_name|dept_id|emp_id|       addline1|  city|state|
+------+--------+-----------+---------+-------+------+---------------+------+-----+
|     1|   Smith|         10|  Finance|     10|     1|   1523 Main St|   SFO|   CA|
|     3|Williams|         10|  Finance|     10|     3|   34 Warner St|Jersey|   NJ|
|     2|    Rose|         20|Marketing|     20|     2| 3453 Orange St|   SFO|   NY|
|     4|   Jones|         30|    Sales|     30|     4|221 Cavalier St|Newark|   DE|
+------+--------+-----------+---------+-------+------+---------------+------+-----+



In [10]:
# SQL: register each DataFrame as a temporary view so it can be referenced
# by name inside a spark.sql() query.
empDF.createOrReplaceTempView("emp")
deptDF.createOrReplaceTempView("dept")
addDF.createOrReplaceTempView("add")

# Explicit JOIN ... ON with the standard `=` comparison replaces the implicit
# comma join and the non-standard `==`; tables, predicates and column order
# (e.*, d.*, a.*) are the same as before.
spark.sql("""
    SELECT *
      FROM emp e
      JOIN dept d ON e.emp_dept_id = d.dept_id
      JOIN add a ON e.emp_id = a.emp_id
""").show()

+------+--------+-----------+---------+-------+------+---------------+------+-----+
|emp_id|    name|emp_dept_id|dept_name|dept_id|emp_id|       addline1|  city|state|
+------+--------+-----------+---------+-------+------+---------------+------+-----+
|     1|   Smith|         10|  Finance|     10|     1|   1523 Main St|   SFO|   CA|
|     3|Williams|         10|  Finance|     10|     3|   34 Warner St|Jersey|   NJ|
|     2|    Rose|         20|Marketing|     20|     2| 3453 Orange St|   SFO|   NY|
|     4|   Jones|         30|    Sales|     30|     4|221 Cavalier St|Newark|   DE|
+------+--------+-----------+---------+-------+------+---------------+------+-----+



In [11]:
# Join on several columns at once: combine the per-column equality tests with `&`.
# Each comparison must be wrapped in parentheses because `&` binds tighter than `==`.
df1 = spark.createDataFrame(
    [(1, "A"), (2, "B"), (3, "C")],
    ["A1", "A2"],
)

df2 = spark.createDataFrame(
    [(1, "F"), (2, "B")],
    ["B1", "B2"],
)

df = df1.join(
    df2,
    (df1["A1"] == df2["B1"]) & (df1["A2"] == df2["B2"]),
)
df.show()

+---+---+---+---+
| A1| A2| B1| B2|
+---+---+---+---+
|  2|  B|  2|  B|
+---+---+---+---+

