# JOIN() + UNION()

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import explode, col, isnull

# Obtain the Spark session for this notebook under a fixed application name.
session_builder = SparkSession.builder.appName("example-join-union")
spark = session_builder.getOrCreate()

In [None]:
# Employee table: (emp_id, name, emp_dept_id), one tuple per employee.
empColumns = ["emp_id", "name", "emp_dept_id"]
empData = [
    (1, "Smith", 10),
    (2, "Rose", 20),
    (3, "Williams", 10),
    (4, "Jones", 30),
]

empDF = spark.createDataFrame(data=empData, schema=empColumns)
empDF.show()

In [None]:
# Department table: (dept_name, dept_id), one tuple per department.
deptColumns = ["dept_name", "dept_id"]
deptData = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]

deptDF = spark.createDataFrame(data=deptData, schema=deptColumns)
deptDF.show()

In [None]:
# Address table: (emp_id, addline1, city, state), one tuple per address.
addColumns = ["emp_id", "addline1", "city", "state"]
addData = [
    (1, "1523 Main St", "SFO", "CA"),
    (2, "3453 Orange St", "SFO", "NY"),
    (3, "34 Warner St", "Jersey", "NJ"),
    (4, "221 Cavalier St", "Newark", "DE"),
    (5, "789 Walnut St", "Sandiago", "CA"),
]

addDF = spark.createDataFrame(data=addData, schema=addColumns)
addDF.show()

### PySpark Join Two DataFrames

In [None]:
# DataFrame.join(other, on=None, how=None)
# Only `other` is required; here the join condition is a Column expression.
same_emp_id = empDF.emp_id == addDF.emp_id
empDF.join(addDF, on=same_emp_id).show()

In [None]:
# Pass the join key as a list of column names (instead of a Column expression)
# to drop the duplicate key column after the join.
(
    empDF
    .join(addDF, on=["emp_id"])
    .show()
)

In [None]:
# Join more than two DataFrames by chaining join() calls:
# employee -> address on emp_id, then -> department on the department id.
(
    empDF
    .join(addDF, on=["emp_id"])
    .join(deptDF, on=(empDF.emp_dept_id == deptDF.dept_id))
    .show()
)

In [None]:
# Supply the join conditions through where() after condition-less join() calls.
(
    empDF
    .join(deptDF)
    .where(empDF.emp_dept_id == deptDF.dept_id)
    .join(addDF)
    .where(empDF.emp_id == addDF.emp_id)
    .show()
)

In [None]:
# Supply the join conditions through filter() after condition-less join() calls.
(
    empDF
    .join(deptDF)
    .filter(empDF.emp_dept_id == deptDF.dept_id)
    .join(addDF)
    .filter(empDF.emp_id == addDF.emp_id)
    .show()
)

In [None]:
# Spark SQL: register each DataFrame as a temporary view, then join the views in SQL.
for view_name, frame in (("emp", empDF), ("dept", deptDF), ("add", addDF)):
    frame.createOrReplaceTempView(view_name)

joined_by_sql = spark.sql("""
             select * 
               from emp e, dept d, add a
              where e.emp_dept_id == d.dept_id 
                and e.emp_id == a.emp_id
         """)
joined_by_sql.show()

In [None]:
# Join on several columns at once: combine the per-column equalities with `&`.
df1 = spark.createDataFrame(data=[(1, "A"), (2, "B"), (3, "C")], schema=["A1", "A2"])
df2 = spark.createDataFrame(data=[(1, "F"), (2, "B")], schema=["B1", "B2"])

both_columns_match = (df1.A1 == df2.B1) & (df1.A2 == df2.B2)
df = df1.join(df2, on=both_columns_match)
df.show()

The code block displayed below contains an error. The code block is intended to perform an outer join of DataFrames transactionsDf and itemsDf on columns productId and itemId, respectively. Find the error.

Code block:

transactionsDf.join(itemsDf, [itemsDf.itemId, transactionsDf.productId], "outer")

- The "outer" argument should be eliminated, since "outer" is the default join type.
- The join type needs to be appended to the join() operator, like join().outer() instead of listing it as the last argument inside the join() call.
- The term [itemsDf.itemId, transactionsDf.productId] should be replaced by itemsDf.itemId == transactionsDf.productId.
- The term [itemsDf.itemId, transactionsDf.productId] should be replaced by itemsDf.col("itemId") == transactionsDf.col("productId").
- The "outer" argument should be eliminated from the call and join should be replaced by joinOuter.

In [None]:
# Error: [df1.A1, df2.B1] is a list of bare Columns, not an equality expression.
# The failure is the point of this cell, so catch and display it; an uncaught
# exception would abort "Restart & Run All".
try:
    df1.join(df2, [df1.A1, df2.B1], "outer").show()
except Exception as err:  # exact exception class depends on the PySpark version
    first_line = str(err).partition("\n")[0]
    print("{}: {}".format(type(err).__name__, first_line))

In [None]:
# Correct: the join condition is an equality expression between the two columns.
cond = [df1.A1 == df2.B1]
(
    df1
    .join(df2, on=cond, how="outer")
    .show()
)


In [None]:
# Items table: id, display name, a list of attribute tags, and the supplier.
columns = ["itemId", "itemName", "attributes", "supplier"]
data = [
    (1, 'Thick Coat for Walking in the Snow', ['blue', 'winter', 'cozy'], 'Sports Company Inc.'),
    (2, 'Elegant Outdoors Summer Dress', ['red', 'summer'], 'YetiX'),
    (3, 'Outdoors Backpack', ['green', 'summer'], 'Sports Company Inc.'),
]

itemsDf = spark.createDataFrame(data, columns)

# Show the schema built from the column names, then the rows themselves.
itemsDf.printSchema()
itemsDf.show()

In [None]:
# Transactions table with an explicit schema: six nullable integer columns
# followed by a nullable long column holding the transaction date.
int_fields = ['transactionId', 'predError', 'value', 'storeId', 'productId', 'f']
schema = StructType(
    [StructField(field_name, IntegerType(), True) for field_name in int_fields]
    + [StructField('transactionDate', LongType(), True)]
)

data = [
    (1, 3, 4, 25, 1, None, 1587915332),
    (2, 6, 7, 2, 2, None, 1586815312),
    (3, 3, None, 25, 3, None, 1585824821),
    (4, None, None, 3, 2, None, 1583244275),
    (5, None, None, None, 2, None, 1575285427),
    (6, 3, 2, 25, 2, None, 1572733275),
]

transactionsDf = spark.createDataFrame(data=data, schema=schema)

In [None]:
# Inner join of items and transactions, matching itemId against transactionId.
item_matches_transaction = itemsDf.itemId == transactionsDf.transactionId
itemsDf.join(transactionsDf, on=item_matches_transaction, how="inner").show()

In [None]:
# Each call below is an incorrect way to write the join and raises an error.
# They are run one at a time and the error is printed, so that the first
# failure does not prevent the remaining ones from being demonstrated.
# The errors quoted in the comments are the ones originally observed; the exact
# exception class and wording may differ between PySpark versions.
failing_joins = [
    # AssertionError: how should be basestring
    lambda: itemsDf.join(transactionsDf, "inner", itemsDf.itemId == transactionsDf.transactionId),

    # NameError: name 'itemId' is not defined
    lambda: itemsDf.join(transactionsDf, itemId == transactionId),

    # AnalysisException: USING column `itemsDf.itemId == transactionsDf.transactionId`
    # cannot be resolved on the left side of the join.
    # The left-side columns: [itemId, itemName, attributes, supplier];
    lambda: itemsDf.join(transactionsDf, "itemsDf.itemId == transactionsDf.transactionId", "inner"),

    # Py4JError: An error occurred while calling z:org.apache.spark.sql.functions.col. Trace:
    # py4j.Py4JException: Method col([class org.apache.spark.sql.Column]) does not exist
    lambda: itemsDf.join(transactionsDf, col(itemsDf.itemId) == col(transactionsDf.transactionId)),
]

for attempt in failing_joins:
    try:
        attempt()
    except Exception as err:
        # Only the first line of the message: Spark/Py4J errors can carry long traces.
        first_line = str(err).partition("\n")[0]
        print("{}: {}".format(type(err).__name__, first_line))
    else:
        print("no error raised")

Which of the following code blocks concatenates rows of DataFrames transactionsDf and transactionsNewDf, omitting any duplicates?
>
- `transactionsDf.concat(transactionsNewDf).unique()`
- `transactionsDf.union(transactionsNewDf).distinct()`
- `spark.union(transactionsDf, transactionsNewDf).distinct()`
- `transactionsDf.join(transactionsNewDf, how="union").distinct()`
- `transactionsDf.union(transactionsNewDf).unique()`

In [None]:
# Bind a second name to the same DataFrame object; no data is copied.
transactionsNewDf = transactionsDf

In [None]:
# union() appends the rows of the second DataFrame; distinct() then drops duplicate rows.
(
    transactionsDf
    .union(transactionsNewDf)
    .distinct()
    .show()
)


In [None]:
# Each call below is an incorrect way to concatenate the DataFrames and raises.
# They are run one at a time and the error is printed, so that the first
# failure does not prevent the remaining ones from being demonstrated.
# The errors quoted in the comments are the ones originally observed; the exact
# exception class and wording may differ between PySpark versions.
failing_unions = [
    # AttributeError: 'DataFrame' object has no attribute 'concat'
    lambda: transactionsDf.concat(transactionsNewDf).unique(),

    # AttributeError: 'SparkSession' object has no attribute 'union'
    lambda: spark.union(transactionsDf, transactionsNewDf).distinct(),

    # IllegalArgumentException: Unsupported join type 'union'.
    lambda: transactionsDf.join(transactionsNewDf, how="union").distinct(),

    # AttributeError: 'DataFrame' object has no attribute 'unique'
    lambda: transactionsDf.union(transactionsNewDf).unique(),
]

for attempt in failing_unions:
    try:
        attempt()
    except Exception as err:
        # Only the first line of the message: Spark/Py4J errors can carry long traces.
        first_line = str(err).partition("\n")[0]
        print("{}: {}".format(type(err).__name__, first_line))
    else:
        print("no error raised")

Which of the following code blocks returns a DataFrame that is an inner join of DataFrame itemsDf and DataFrame transactionsDf, on columns itemId and productId, respectively, and in which every itemId appears just once?
>
- `itemsDf.join(transactionsDf, "itemsDf.itemId==transactionsDf.productId").distinct("itemId")`
- `itemsDf.join(transactionsDf, itemsDf.itemId==transactionsDf.productId).dropDuplicates(["itemId"])`
- `itemsDf.join(transactionsDf, itemsDf.itemId==transactionsDf.productId).dropDuplicates("itemId")`
- `itemsDf.join(transactionsDf, itemsDf.itemId==transactionsDf.productId, how="inner").distinct(["itemId"])`
- `itemsDf.join(transactionsDf, "itemsDf.itemId==transactionsDf.productId", how="inner").dropDuplicates(["itemId"])`

In [None]:
# Inner join on itemId == productId, then drop rows that repeat an itemId.
(
    itemsDf
    .join(transactionsDf, on=(itemsDf.itemId == transactionsDf.productId))
    .dropDuplicates(["itemId"])
    .show()
)


In [None]:
# Each call below is an incorrect answer and raises an error.
# They are run one at a time and the error is printed, so that the first
# failure does not prevent the remaining ones from being demonstrated.
# The errors quoted in the comments are the ones originally observed; the exact
# exception class and wording may differ between PySpark versions.
failing_dedup_joins = [
    # AnalysisException: USING column `itemsDf.itemId==transactionsDf.productId` cannot be resolved on the left side of the join. The left-side columns: [itemId, itemName, attributes, supplier]
    lambda: itemsDf.join(transactionsDf, "itemsDf.itemId==transactionsDf.productId").distinct("itemId"),

    # Py4JError: An error occurred while calling z:org.apache.spark.api.python.PythonUtils.toSeq. Trace:
    # py4j.Py4JException: Method toSeq([class java.lang.String]) does not exist
    lambda: itemsDf.join(transactionsDf, itemsDf.itemId==transactionsDf.productId).dropDuplicates("itemId"),

    # TypeError: distinct() takes 1 positional argument but 2 were given
    lambda: itemsDf.join(transactionsDf, itemsDf.itemId==transactionsDf.productId, how="inner").distinct(["itemId"]),

    # AnalysisException: USING column `itemsDf.itemId==transactionsDf.productId` cannot be resolved on the left side of the join. The left-side columns: [itemId, itemName, attributes, supplier]
    lambda: itemsDf.join(transactionsDf, "itemsDf.itemId==transactionsDf.productId", how="inner").dropDuplicates(["itemId"]),
]

for attempt in failing_dedup_joins:
    try:
        attempt()
    except Exception as err:
        # Only the first line of the message: Spark/Py4J errors can carry long traces.
        first_line = str(err).partition("\n")[0]
        print("{}: {}".format(type(err).__name__, first_line))
    else:
        print("no error raised")

In which order should the code blocks shown below be run in order to return the number of records that are not empty in column value in the DataFrame resulting from an inner join of DataFrame transactionsDf and itemsDf on columns productId and itemId, respectively?
>
- `1. .filter(~isnull(col('value')))`
- `2. .count()`
- `3. transactionsDf.join(itemsDf, col("transactionsDf.productId")==col("itemsDf.itemId"))`
- `4. transactionsDf.join(itemsDf, transactionsDf.productId==itemsDf.itemId, how='inner')`
- `5. .filter(col('value').isnotnull())`
- `6. .sum(col('value'))`

In [None]:
# Inner join on productId == itemId, keep rows whose `value` is not null, and count them.
inner_joined = transactionsDf.join(
    itemsDf,
    on=(transactionsDf.productId == itemsDf.itemId),
    how='inner',
)
has_value = ~isnull(col('value'))
inner_joined.filter(has_value).count()

You have the following code block for joining two DataFrames and selecting some columns after the join.
joinType = "inner"
>
- `joinExpr = df1.BatchID == df2.BatchID`
- `df1.join(df2, joinExpr, joinType).select("BatchID", "Year").show()`
>
Choose the correct statement about the above code block.
>
- `The code will apply an inner join to df1 and df2 and show the joined records.`
- `There is a syntax error in this code.`
- `The joinExpr is incorrect.`
- `The code block will fail with the error: Reference 'BatchID' is ambiguous`

In [None]:
# Joining on a Column expression keeps the emp_id column of BOTH DataFrames, so
# selecting "emp_id" by name is expected to fail as an ambiguous reference.
# The failure is the point of this cell, so catch and display it; an uncaught
# exception would abort "Restart & Run All".
joinType = "inner"
joinExpr = empDF.emp_id == addDF.emp_id

try:
    empDF.join(addDF, joinExpr, joinType).select("emp_id", "name").show()
except Exception as err:  # exact exception class depends on the PySpark version
    first_line = str(err).partition("\n")[0]
    print("{}: {}".format(type(err).__name__, first_line))

You are given two DataFrames. The first DataFrame df1 is shown below.
>
- `+------+-----------+------+`
- `| Name | Department|Salary|`
- `+------+-----------+------+`
- `| John | Accounts  | 5000 |`
- `|Sheela|Development| 5500 |`
- `+------+-----------+------+`
>
The second DataFrame df2 is shown below.
>
- `+------+-----------+------+`
- `| Name | Department|Salary|`
- `+------+-----------+------+`
- `| John | Accounts  | 5000 |`
- `|Sheela|Development| 5500 |`
- `+------+-----------+------+`
>
You want to combine these two DataFrames and eliminate any duplicates. You know that the UNION operation in Spark SQL combines two tables and also removes duplicates. How would you do the same with the Spark DataFrame API? Choose the correct option.
>
- `df3 = df1.union(df2)`
- `df3 = df1.union(df2).unique()`
- `df3 = df1.union(df2).distinct()`
- `df3 = df1.union(df2).deleteDuplicates()`

In [37]:
# Two DataFrames built from the same rows, so their union contains duplicates.
columns = ["Name", "Department", "Salary"]
data = [
    ("John", "Accounts", 5000),
    ("John", "Development", 5500),
]

df1 = spark.createDataFrame(data=data, schema=columns)
df2 = spark.createDataFrame(data=data, schema=columns)

In [42]:
# show() prints the rows and returns None, so bind the DataFrame to df3 first
# and display it in a separate step. union() keeps duplicate rows.
df3 = df1.union(df2)
df3.show()


+----+-----------+------+
|Name| Department|Salary|
+----+-----------+------+
|John|   Accounts|  5000|
|John|Development|  5500|
|John|   Accounts|  5000|
|John|Development|  5500|
+----+-----------+------+



In [43]:
# show() prints the rows and returns None, so bind the DataFrame to df3 first
# and display it in a separate step. distinct() removes the duplicate rows
# that union() keeps.
df3 = df1.union(df2).distinct()
df3.show()


+----+-----------+------+
|Name| Department|Salary|
+----+-----------+------+
|John|Development|  5500|
|John|   Accounts|  5000|
+----+-----------+------+



In [None]:
# Neither method exists on a DataFrame, so both calls raise.
# They are run one at a time and the error is printed, so that the first
# failure does not prevent the second one from being demonstrated.
failing_dedup_calls = [
    # AttributeError: 'DataFrame' object has no attribute 'unique'
    lambda: df1.union(df2).unique(),

    # AttributeError: 'DataFrame' object has no attribute 'deleteDuplicates'
    lambda: df1.union(df2).deleteDuplicates(),
]

for attempt in failing_dedup_calls:
    try:
        attempt()
    except Exception as err:
        first_line = str(err).partition("\n")[0]
        print("{}: {}".format(type(err).__name__, first_line))
    else:
        print("no error raised")

In [11]:
# Course batches including the year: (BatchID, Year, CourseName, Students).
columns = ["BatchID", "Year", "CourseName", "Students"]
data = [
    ('X1', '2021', 'Scala', 270),
    ('Y5', '2021', 'Scala', 230),
    ('N3', '2020', 'Scala', 150),
    ('C5', '2020', 'Scala', 100),
    ('D7', '2020', 'Python', 300),
    ('D3', '2021', 'Python', 400),
    ('H2', '2021', 'Python', 500),
]

df1 = spark.createDataFrame(data, columns)

In [12]:
# Course batches without the year: (BatchID, CourseName, Students).
columns = ["BatchID", "CourseName", "Students"]
data = [
    ('X1', 'Scala', 270),
    ('Y5', 'Scala', 230),
    ('N3', 'Scala', 150),
    ('C5', 'Scala', 100),
    ('D7', 'Python', 300),
    ('D3', 'Python', 400),
    ('H2', 'Python', 500),
]

df2 = spark.createDataFrame(data, columns)

In [15]:
# Join on a Column expression and select through df1's own column objects.
joinExpr = df1.BatchID == df2.BatchID
joinType = "inner"
(
    df1
    .join(df2, on=joinExpr, how=joinType)
    .select(df1["BatchID"], df1["Year"])
    .show()
)

+-------+----+
|BatchID|Year|
+-------+----+
|     D7|2020|
|     N3|2020|
|     D3|2021|
|     X1|2021|
|     C5|2020|
|     Y5|2021|
|     H2|2021|
+-------+----+



In [13]:
# Join on the column name (a string) and select the columns by name.
joinExpr = "BatchID"
joinType = "inner"
joined_on_name = df1.join(df2, on=joinExpr, how=joinType)
joined_on_name.select("BatchID", "Year").show()


+-------+----+
|BatchID|Year|
+-------+----+
|     D7|2020|
|     N3|2020|
|     D3|2021|
|     X1|2021|
|     C5|2020|
|     Y5|2021|
|     H2|2021|
+-------+----+



In [14]:
# Same join as above: Column-expression condition, columns selected via df1.
joinType, joinExpr = "inner", df1.BatchID == df2.BatchID
joined_on_expr = df1.join(df2, joinExpr, joinType)
joined_on_expr.select(df1["BatchID"], df1["Year"]).show()


+-------+----+
|BatchID|Year|
+-------+----+
|     D7|2020|
|     N3|2020|
|     D3|2021|
|     X1|2021|
|     C5|2020|
|     Y5|2021|
|     H2|2021|
+-------+----+

