In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

# Build (or reuse) the session that every later cell relies on.
# NOTE(review): the app name below looks like a typo for "perform-joins";
# it is kept as-is here because it is a runtime string — confirm and rename.
spark = (
    SparkSession.builder
    .appName("perfomr-joins")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

# Only report log messages at ERROR level so cell output stays readable.
spark.sparkContext.setLogLevel("ERROR")

In [17]:
def load_credit_card_csv(file_name, data_dir="../data/Credit Card"):
    """Read one CSV file of the credit-card dataset into a DataFrame.

    The same reader options are used for every file: the first line is
    treated as the header, and the literal text ``null`` is read as a
    null value. No schema is supplied and schema inference is not
    enabled in this reader chain.

    Args:
        file_name: Name of the CSV file inside ``data_dir``
            (for example ``"CardBase.csv"``).
        data_dir: Directory holding the dataset, relative to the
            notebook's working directory.

    Returns:
        The DataFrame produced by ``spark.read`` for that file.
    """
    return (spark.read.format("csv")
            .option("header", "true")
            .option("nullValue", "null")
            .load(f"{data_dir}/{file_name}"))


# One DataFrame per source file; all four share the reader settings above.
cards_df = load_credit_card_csv("CardBase.csv")
customers_df = load_credit_card_csv("CustomerBase.csv")
transactions_df = load_credit_card_csv("TransactionBase.csv")
fraud_df = load_credit_card_csv("FraudBase.csv")

                                                                                

In [18]:
# Inner join on the shared Cust_ID column: only cards whose customer
# appears in customers_df (and vice versa) are kept.
customer_cards_df = cards_df.join(customers_df, "Cust_ID", "inner")

customer_cards_df.show()

                                                                                

+-------+-------------------+-----------+------------+---+----------------+----------------------+
|Cust_ID|        Card_Number|Card_Family|Credit_Limit|Age|Customer_Segment|Customer_Vintage_Group|
+-------+-------------------+-----------+------------+---+----------------+----------------------+
|CC55858|2868-5606-5152-5706|       Gold|       27000| 30|         Diamond|                   VG1|
|CC46077|6876-7378-4945-3251|       Gold|       44000| 49|         Diamond|                   VG1|
|CC46484|5556-4557-4566-1540|       Gold|       45000| 49|         Diamond|                   VG1|
|CC59340|5618-9718-9367-2102|       Gold|       14000| 25|         Diamond|                   VG1|
|CC62994|1652-7516-1273-1992|   Platinum|      180000| 48|         Diamond|                   VG1|
|CC43841|7212-8665-7734-5918|   Platinum|       55000| 30|         Diamond|                   VG1|
|CC21312|7837-4036-5999-1672|       Gold|       24000| 45|         Diamond|                   VG1|
|CC90510|6

In [19]:
# Left outer join: every transaction is kept, and Fraud_Flag is null for
# transactions that have no matching Transaction_ID in fraud_df.
joined_transactions_df = transactions_df.join(
    fraud_df,
    "Transaction_ID",
    "left_outer",
)

joined_transactions_df.show()

+--------------+----------------+-------------------+-----------------+-------------------+----------+
|Transaction_ID|Transaction_Date|     Credit_Card_ID|Transaction_Value|Transaction_Segment|Fraud_Flag|
+--------------+----------------+-------------------+-----------------+-------------------+----------+
|  CTID28830551|       24-Apr-16|1629-9566-3285-2123|            23649|              SEG25|      null|
|  CTID45504917|       11-Feb-16|3697-6001-4909-5350|            26726|              SEG16|      null|
|  CTID47312290|        1-Nov-16|5864-4475-3659-1440|            22012|              SEG14|      null|
|  CTID25637718|       28-Jan-16|5991-4421-8476-3804|            37637|              SEG17|      null|
|  CTID66743960|       17-Mar-16|1893-8853-9900-8478|             5113|              SEG14|      null|
|  CTID22308010|       15-May-16|5206-5979-9383-4538|             9551|              SEG13|      null|
|  CTID41917614|       11-Jul-16|5129-6974-6371-2964|            29511|  

In [20]:
# Join condition with two parts:
#   1. the card number must equal the transaction's credit card id, and
#   2. the transaction must carry a non-null Fraud_Flag.
joinExpr = (
    (customer_cards_df["Card_Number"] == joined_transactions_df["Credit_Card_ID"])
    & (joined_transactions_df["Fraud_Flag"].isNotNull())
)

# Inner join, so only card/customer rows with at least one flagged
# transaction appear in the result.
customer_with_fraud_df = customer_cards_df.join(
    joined_transactions_df, joinExpr, "inner"
)
customer_with_fraud_df.show()


                                                                                

+-------+-------------------+-----------+------------+---+----------------+----------------------+--------------+----------------+-------------------+-----------------+-------------------+----------+
|Cust_ID|        Card_Number|Card_Family|Credit_Limit|Age|Customer_Segment|Customer_Vintage_Group|Transaction_ID|Transaction_Date|     Credit_Card_ID|Transaction_Value|Transaction_Segment|Fraud_Flag|
+-------+-------------------+-----------+------------+---+----------------+----------------------+--------------+----------------+-------------------+-----------------+-------------------+----------+
|CC87306|5734-5619-8469-4044|       Gold|       36000| 30|         Diamond|                   VG1|  CTID26555772|       11-Jan-16|5734-5619-8469-4044|              683|              SEG22|         1|
|CC87034|6722-7299-6082-7974|       Gold|       34000| 36|        Platinum|                   VG2|  CTID30763806|       17-Dec-16|6722-7299-6082-7974|            40751|              SEG21|         1|


### Right Outer Join

In [21]:
# Small in-memory frames used by the join demonstrations below.
# df1 carries an Age column that df2 does not have; the two share
# only the names "Charlie" and "Dave".
data1 = [
    ("Alice", "F", 25),
    ("Bob", "M", 30),
    ("Charlie", "M", 35),
    ("Dave", "M", 40),
]
df1 = spark.createDataFrame(data1, schema=["Name", "Gender", "Age"])

data2 = [
    ("Charlie", "M"),
    ("Dave", "M"),
    ("Eve", "F"),
]
df2 = spark.createDataFrame(data2, schema=["Name", "Gender"])

In [22]:
# Right outer join on Name: every row of df2 is kept, and df1's columns
# are null where df1 has no matching Name. Because only Name is a join
# key, the result contains a Gender column from each side.
right_join = df1.join(df2, "Name", "right_outer")
right_join.show()

[Stage 17:>                                                         (0 + 2) / 2]

+-------+------+----+------+
|   Name|Gender| Age|Gender|
+-------+------+----+------+
|Charlie|     M|  35|     M|
|   Dave|     M|  40|     M|
|    Eve|  null|null|     F|
+-------+------+----+------+



                                                                                

### Full outer join

In [23]:
# Full outer join on Name: rows from both frames are kept, with nulls
# filling the columns of whichever side has no matching Name.
full_join = df1.join(df2, "Name", "outer")
full_join.show()

+-------+------+----+------+
|   Name|Gender| Age|Gender|
+-------+------+----+------+
|  Alice|     F|  25|  null|
|    Bob|     M|  30|  null|
|Charlie|     M|  35|     M|
|   Dave|     M|  40|     M|
|    Eve|  null|null|     F|
+-------+------+----+------+



### Cross join

In [24]:
# Cross join: no join key, every row of df1 is paired with every row
# of df2 (4 x 3 = 12 rows for the sample data).
cross_join = df1.crossJoin(other=df2)
cross_join.show()

+-------+------+---+-------+------+
|   Name|Gender|Age|   Name|Gender|
+-------+------+---+-------+------+
|  Alice|     F| 25|Charlie|     M|
|    Bob|     M| 30|Charlie|     M|
|  Alice|     F| 25|   Dave|     M|
|  Alice|     F| 25|    Eve|     F|
|    Bob|     M| 30|   Dave|     M|
|    Bob|     M| 30|    Eve|     F|
|Charlie|     M| 35|Charlie|     M|
|   Dave|     M| 40|Charlie|     M|
|Charlie|     M| 35|   Dave|     M|
|Charlie|     M| 35|    Eve|     F|
|   Dave|     M| 40|   Dave|     M|
|   Dave|     M| 40|    Eve|     F|
+-------+------+---+-------+------+



                                                                                

### Broadcast join

In [25]:
# Inner join on both Name and Gender, with df2 wrapped in broadcast()
# to mark it as the side to broadcast.
broadcast_join = df1.join(
    broadcast(df2),
    on=["Name", "Gender"],
    how="inner",
)
broadcast_join.show()

+-------+------+---+
|   Name|Gender|Age|
+-------+------+---+
|Charlie|     M| 35|
|   Dave|     M| 40|
+-------+------+---+



### Multiple Join Conditions

In [26]:
# Passing a list of column names joins on all of them at once: a row
# matches only when both Name and Gender are equal in df1 and df2.
multi_join = df1.join(df2, ["Name", "Gender"], "inner")
multi_join.show()

+-------+------+---+
|   Name|Gender|Age|
+-------+------+---+
|Charlie|     M| 35|
|   Dave|     M| 40|
+-------+------+---+



In [27]:
# Stop the Spark session created at the top of the notebook; cells that
# use `spark` must not be run after this without re-creating the session.
spark.stop()