In [1]:
import datetime

# Main entry point for DataFrame and SQL functionality.
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

# NOTE(review): this wildcard import rebinds Python built-ins such as max, min,
# sum, round and abs to their Spark column-function counterparts for the rest of
# the notebook. The cells visible here only use `col`; prefer an explicit import
# once it is confirmed that no other cell relies on the wildcard.
from pyspark.sql.functions import *


In [2]:
# Root of the product_info data; the folder to load is this path followed by a
# date formatted as dd-mm-YYYY.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
address = "/home/bluepi/Downloads/Update/product_info/"

# Number of days before today whose folder is loaded.
# NOTE(review): the variable and the printed label say "previous day", yet the
# notebook has always subtracted 10 days. The value is kept at 10 so the same
# folder is read as before; set it to 1 if the real previous day is intended.
LOOKBACK_DAYS = 10

previous_day = (
    datetime.datetime.today() - datetime.timedelta(days=LOOKBACK_DAYS)
).strftime('%d-%m-%Y')
print("Previous Date ---->"+previous_day)

# Folder holding the CSV data for the selected day
new_address = address + previous_day
print("\nNew Address to read the folder ---->"+new_address)

# Load that folder as CSV, taking column names from the header row and letting
# Spark infer the column types.
per_day_data = (
    spark.read.format('csv')
    .options(header=True, inferschema=True)
    .load(new_address)
)

Previous Date ---->03-04-2020

New Address to read the folder ---->/home/bluepi/Downloads/Update/product_info/03-04-2020


In [None]:
# Pick out the per-day records whose record_type is 'I' (presumably "insert").
# NOTE(review): this cell only filters and displays those rows; no union with a
# main table is performed here.
mainTable_I_inserted = per_day_data.where("record_type == 'I' ")
mainTable_I_inserted.show()

In [3]:
# Rows of the per-day data whose record_type is 'U'; built once and reused
# both for the display below and by the following cells.
from_per_day_data_U = per_day_data.filter("record_type == 'U' ")

print("\nTable of Products to be Updated in Main_Table taken from Per_Day_Table \n")
# Show the 'U' rows sorted by ascending product id
from_per_day_data_U.orderBy("p_id", ascending=True).show()


Table of Products to be Updated in Main_Table taken from Per_Day_Table 

+----+-----------+-----+-------------------+-----------+
|p_id|     p_name|price|     date_timestamp|record_type|
+----+-----------+-----+-------------------+-----------+
|  10|   Home Ing|  777|2020-04-02 09:57:44|          U|
|  14|Stringtough|  354|2020-04-02 21:35:48|          U|
|  17|  Gembucket|  853|2020-04-02 15:57:41|          U|
|  34|    Bitchip|  833|2020-04-02 15:40:31|          U|
|  36|      Subin|  244|2020-04-02 21:41:57|          U|
|  37|      Otcom|  260|2020-04-02 21:13:24|          U|
|  49|       Stim|  285|2020-04-02 15:44:40|          U|
|  50|    Matsoft|  730|2020-04-02 03:58:25|          U|
|  56|  Voyatouch|  114|2020-04-02 21:17:30|          U|
|  57|  Voyatouch|  175|2020-04-02 09:13:37|          U|
|  65|   Tampflex|  257|2020-04-02 03:28:48|          U|
|  66|  Gembucket|  212|2020-04-02 15:42:30|          U|
|  66|  Gembucket| 1074|2020-04-02 03:46:21|          U|
|  66|  Gembuc

In [4]:
# For every p_id among the 'U' rows, take the maximum date_timestamp.
# Both resulting columns are renamed with a "_1" suffix so they stay distinct
# from the per-day columns of the same name in the join that follows.
for_join_mT_and_pDD = (
    from_per_day_data_U
    .groupBy("p_id")
    .agg({"date_timestamp": "max"})
    .withColumnRenamed("max(date_timestamp)", "date_timestamp_1")
    .withColumnRenamed("p_id", "p_id_1")
)
for_join_mT_and_pDD.orderBy("p_id_1").show()

+------+-------------------+
|p_id_1|   date_timestamp_1|
+------+-------------------+
|    10|2020-04-02 09:57:44|
|    14|2020-04-02 21:35:48|
|    17|2020-04-02 15:57:41|
|    34|2020-04-02 15:40:31|
|    36|2020-04-02 21:41:57|
|    37|2020-04-02 21:13:24|
|    49|2020-04-02 15:44:40|
|    50|2020-04-02 03:58:25|
|    56|2020-04-02 21:17:30|
|    57|2020-04-02 09:13:37|
|    65|2020-04-02 03:28:48|
|    66|2020-04-02 21:58:57|
|    80|2020-04-02 03:42:30|
|    81|2020-04-02 21:33:16|
|    98|2020-04-02 09:16:28|
|   101|2020-04-02 03:56:54|
+------+-------------------+



In [5]:
# Keep only the latest update per product: inner-join the 'U' rows with the
# per-product maximum timestamp on both the product id and the timestamp, then
# project back to the original five columns.
# NOTE(review): if several 'U' rows of one product share that maximum timestamp,
# all of them are kept — confirm this cannot happen in the input.
joined = (
    from_per_day_data_U
    .join(
        for_join_mT_and_pDD,
        on=(from_per_day_data_U.p_id == for_join_mT_and_pDD.p_id_1)
        & (from_per_day_data_U.date_timestamp == for_join_mT_and_pDD.date_timestamp_1),
        how='inner',
    )
    .select('p_id', 'p_name', 'price', 'date_timestamp', 'record_type')
)
joined.orderBy("p_id").show()

+----+-----------+-----+-------------------+-----------+
|p_id|     p_name|price|     date_timestamp|record_type|
+----+-----------+-----+-------------------+-----------+
|  10|   Home Ing|  777|2020-04-02 09:57:44|          U|
|  14|Stringtough|  354|2020-04-02 21:35:48|          U|
|  17|  Gembucket|  853|2020-04-02 15:57:41|          U|
|  34|    Bitchip|  833|2020-04-02 15:40:31|          U|
|  36|      Subin|  244|2020-04-02 21:41:57|          U|
|  37|      Otcom|  260|2020-04-02 21:13:24|          U|
|  49|       Stim|  285|2020-04-02 15:44:40|          U|
|  50|    Matsoft|  730|2020-04-02 03:58:25|          U|
|  56|  Voyatouch|  114|2020-04-02 21:17:30|          U|
|  57|  Voyatouch|  175|2020-04-02 09:13:37|          U|
|  65|   Tampflex|  257|2020-04-02 03:28:48|          U|
|  66|  Gembucket|  774|2020-04-02 21:58:57|          U|
|  80|     Zathin|  721|2020-04-02 03:42:30|          U|
|  81|      Opela|  243|2020-04-02 21:33:16|          U|
|  98|   Tampflex| 1304|2020-04