## Libraries Used

In [1]:
import datetime

# Main entry point for DataFrame and SQL functionality.
from pyspark.sql import SparkSession
# Start SPARK Session
spark = SparkSession.builder.appName('Basics').getOrCreate()

from pyspark.sql.functions import *


### Read the Main Table

In [2]:
mainTable = spark.read.format('csv').options(
    header=True, inferschema=True).load(
        "/home/bluepi/Downloads/Update/product_info/main table/main table.csv")

# Main Table Schema
mainTable.printSchema()

root
 |-- p_id: integer (nullable = true)
 |-- p_name: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- date_timestamp: timestamp (nullable = true)



### New Main Table with added columns

In [3]:
from pyspark.sql.functions import year, month, dayofyear, hour, minute, second

# Add record type to main table
mainTable_new = mainTable.withColumn('record_type',lit("A"))

# mainTable_new = mainTable_new.select(['p_id', 'p_name', 'price', 'date_timestamp','record_type',
#                               year(mainTable['date_timestamp']).alias("Year"),
#                               month(mainTable['date_timestamp']).alias("Month"),
#                               dayofyear(mainTable['date_timestamp']).alias("DayOfYear"),
#                               hour(mainTable['date_timestamp']).alias("Hour"),
#                               minute(mainTable['date_timestamp']).alias("Minute"),
#                               second(mainTable['date_timestamp']).alias("Second")])
mainTable_new.printSchema()

root
 |-- p_id: integer (nullable = true)
 |-- p_name: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- date_timestamp: timestamp (nullable = true)
 |-- record_type: string (nullable = false)



### Read the Previous Day Folders

In [4]:
# Address to the product_info folder
address = "/home/bluepi/Downloads/Update/product_info/"
previous_day = (datetime.datetime.today() - datetime.timedelta(days=2)).strftime('%d-%m-%Y')
print("Previous Date ---->"+previous_day)

# Address to the Previous Day folder
new_address = address + previous_day
print("\nNew Address to read the folder ---->"+new_address)

# Read the Previous Day folder
per_day_data = spark.read.format('csv') \
          .options( header=True, inferschema=True ) \
          .load(new_address)

Previous Date ---->05-04-2020

New Address to read the folder ---->/home/bluepi/Downloads/Update/product_info/05-04-2020


In [5]:
# per_day_data.show(3)
per_day_data.orderBy(per_day_data.date_timestamp.asc()).show(60)
# per_day_data.printSchema()

+----+-----------+-----+-------------------+-----------+
|p_id|     p_name|price|     date_timestamp|record_type|
+----+-----------+-----+-------------------+-----------+
| 192| Stronghold|  730|2020-04-04 03:22:13|          U|
| 244|  Daltfresh|  169|2020-04-04 03:24:04|          I|
| 171|     Zathin|  700|2020-04-04 03:29:18|          U|
| 193|   Treeflex|  556|2020-04-04 03:30:19|          U|
|   1|        Job|  307|2020-04-04 03:45:40|          U|
|  40|   Treeflex| 1359|2020-04-04 03:49:39|          U|
| 232|     Latlux| 1316|2020-04-04 03:54:29|          I|
| 233|     Tresom| 1690|2020-04-04 03:56:43|          I|
| 153|        Job|  366|2020-04-04 09:13:15|          U|
|  52|   Flowdesk|  737|2020-04-04 09:13:57|          U|
| 235|    Sonsing|  583|2020-04-04 09:14:01|          I|
|  82|     Bamity|  322|2020-04-04 09:16:26|          U|
|   1|        Job| 1360|2020-04-04 09:22:27|          U|
| 163|    Redhold| 1346|2020-04-04 09:37:17|          U|
|  42|   Alphazap|  940|2020-04

### Added new columns

In [6]:
# per_day_data_new = per_day_data.select(['p_id', 'p_name', 'price', 'date_timestamp','record_type',
#                               year(per_day_data['date_timestamp']).alias("Year"),
#                               month(per_day_data['date_timestamp']).alias("Month"),
#                               dayofyear(per_day_data['date_timestamp']).alias("DayOfYear"),
#                               hour(per_day_data['date_timestamp']).alias("Hour"),
#                               minute(per_day_data['date_timestamp']).alias("Minute"),
#                               second(per_day_data['date_timestamp']).alias("Second")])

per_day_data.printSchema()

root
 |-- p_id: integer (nullable = true)
 |-- p_name: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- date_timestamp: timestamp (nullable = true)
 |-- record_type: string (nullable = true)



In [7]:
per_day_data.orderBy(per_day_data.date_timestamp.asc()).show(60)

+----+-----------+-----+-------------------+-----------+
|p_id|     p_name|price|     date_timestamp|record_type|
+----+-----------+-----+-------------------+-----------+
| 192| Stronghold|  730|2020-04-04 03:22:13|          U|
| 244|  Daltfresh|  169|2020-04-04 03:24:04|          I|
| 171|     Zathin|  700|2020-04-04 03:29:18|          U|
| 193|   Treeflex|  556|2020-04-04 03:30:19|          U|
|   1|        Job|  307|2020-04-04 03:45:40|          U|
|  40|   Treeflex| 1359|2020-04-04 03:49:39|          U|
| 232|     Latlux| 1316|2020-04-04 03:54:29|          I|
| 233|     Tresom| 1690|2020-04-04 03:56:43|          I|
| 153|        Job|  366|2020-04-04 09:13:15|          U|
|  52|   Flowdesk|  737|2020-04-04 09:13:57|          U|
| 235|    Sonsing|  583|2020-04-04 09:14:01|          I|
|  82|     Bamity|  322|2020-04-04 09:16:26|          U|
|   1|        Job| 1360|2020-04-04 09:22:27|          U|
| 163|    Redhold| 1346|2020-04-04 09:37:17|          U|
|  42|   Alphazap|  940|2020-04

### Directly append new Inserted products

In [8]:
per_day_data.filter("record_type == 'I' ").orderBy(per_day_data.p_id.asc()).show()

+----+---------+-----+-------------------+-----------+
|p_id|   p_name|price|     date_timestamp|record_type|
+----+---------+-----+-------------------+-----------+
| 203| Transcof| 1701|2020-04-04 21:58:59|          I|
| 219|    Zamit|  833|2020-04-04 21:12:18|          I|
| 232|   Latlux| 1316|2020-04-04 03:54:29|          I|
| 233|   Tresom| 1690|2020-04-04 03:56:43|          I|
| 233|   Tresom| 1690|2020-04-04 09:53:46|          I|
| 233|   Tresom| 1690|2020-04-04 15:51:21|          I|
| 235|  Sonsing|  583|2020-04-04 09:14:01|          I|
| 244|Daltfresh|  169|2020-04-04 03:24:04|          I|
| 244|Daltfresh|  169|2020-04-04 15:47:31|          I|
| 244|Daltfresh|  169|2020-04-04 21:47:38|          I|
| 249|  Andalax| 1857|2020-04-04 15:41:36|          I|
+----+---------+-----+-------------------+-----------+



In [9]:
mainTable_I_inserted = mainTable_new.union(per_day_data.filter("record_type == 'I' "))
mainTable_I_inserted.filter("record_type == 'I' ").show()

+----+---------+-----+-------------------+-----------+
|p_id|   p_name|price|     date_timestamp|record_type|
+----+---------+-----+-------------------+-----------+
| 219|    Zamit|  833|2020-04-04 21:12:18|          I|
| 203| Transcof| 1701|2020-04-04 21:58:59|          I|
| 244|Daltfresh|  169|2020-04-04 21:47:38|          I|
| 235|  Sonsing|  583|2020-04-04 09:14:01|          I|
| 233|   Tresom| 1690|2020-04-04 09:53:46|          I|
| 233|   Tresom| 1690|2020-04-04 03:56:43|          I|
| 244|Daltfresh|  169|2020-04-04 03:24:04|          I|
| 232|   Latlux| 1316|2020-04-04 03:54:29|          I|
| 233|   Tresom| 1690|2020-04-04 15:51:21|          I|
| 244|Daltfresh|  169|2020-04-04 15:47:31|          I|
| 249|  Andalax| 1857|2020-04-04 15:41:36|          I|
+----+---------+-----+-------------------+-----------+



In [10]:
before_insert = str(mainTable_new.count())
after_insert = str(mainTable_I_inserted.count())
total_insert = str(mainTable_I_inserted.filter("record_type == 'I' ").count())
print("Total no. of products before adding ----> "+ str(mainTable_new.count()))
print("Total no. of products after adding----> "+ str(mainTable_I_inserted.count()))
print("Total no. of products with record \"Inserted\"----> "+ str(mainTable_I_inserted.filter("record_type == 'I' ").count()))

Total no. of products before adding ----> 200
Total no. of products after adding----> 211
Total no. of products with record "Inserted"----> 11


### Update the Products

In [11]:
per_day_data.filter("record_type == 'U' ").orderBy(per_day_data.p_id.asc()).show()

+----+-----------+-----+-------------------+-----------+
|p_id|     p_name|price|     date_timestamp|record_type|
+----+-----------+-----+-------------------+-----------+
|   1|        Job|  307|2020-04-04 03:45:40|          U|
|   1|        Job| 1360|2020-04-04 09:22:27|          U|
|  18|     Lotlux|  888|2020-04-04 15:36:23|          U|
|  38|  Gembucket|    6|2020-04-04 21:41:47|          U|
|  40|   Treeflex| 1359|2020-04-04 03:49:39|          U|
|  42|   Alphazap|  940|2020-04-04 09:45:40|          U|
|  52|   Flowdesk|  737|2020-04-04 09:13:57|          U|
|  64|    Flexidy|  870|2020-04-04 09:50:17|          U|
|  82|     Bamity|  322|2020-04-04 09:16:26|          U|
| 100|Stringtough|  826|2020-04-04 21:57:18|          U|
| 113|     Zathin| 1260|2020-04-04 21:22:06|          U|
| 121|      Opela|  334|2020-04-04 21:44:42|          U|
| 129|    Bitwolf| 1475|2020-04-04 21:45:47|          U|
| 153|        Job|  366|2020-04-04 09:13:15|          U|
| 156|     Namfix|  762|2020-04

In [12]:
# mainTable_U_inserted = mainTable_I_inserted.union(per_day_data.filter("record_type == 'U' "))
# mainTable_U_inserted.filter("record_type == 'U' ").show()

from_per_day_data_U = per_day_data.filter("record_type == 'U' ")
from_per_day_data_U.show()

+----+-----------+-----+-------------------+-----------+
|p_id|     p_name|price|     date_timestamp|record_type|
+----+-----------+-----+-------------------+-----------+
| 182|Ventosanzap|  197|2020-04-04 21:13:23|          U|
| 129|    Bitwolf| 1475|2020-04-04 21:45:47|          U|
| 121|      Opela|  334|2020-04-04 21:44:42|          U|
| 113|     Zathin| 1260|2020-04-04 21:22:06|          U|
| 176|     Sonair|  318|2020-04-04 21:11:56|          U|
|  38|  Gembucket|    6|2020-04-04 21:41:47|          U|
| 100|Stringtough|  826|2020-04-04 21:57:18|          U|
|  52|   Flowdesk|  737|2020-04-04 09:13:57|          U|
|  42|   Alphazap|  940|2020-04-04 09:45:40|          U|
| 153|        Job|  366|2020-04-04 09:13:15|          U|
|  64|    Flexidy|  870|2020-04-04 09:50:17|          U|
|   1|        Job| 1360|2020-04-04 09:22:27|          U|
|  82|     Bamity|  322|2020-04-04 09:16:26|          U|
| 163|    Redhold| 1346|2020-04-04 09:37:17|          U|
| 193|   Treeflex|  556|2020-04

In [13]:
from_per_day_data_U_list = from_per_day_data_U.select("p_id").collect()

print("Output of our collect operation----> ",from_per_day_data_U_list[0])
print("get the value of p_id -----> ",from_per_day_data_U_list[0].p_id)

# List comprehension
p_id_list = [ i.p_id for i in from_per_day_data_U_list ]
print("\nList of p_id which we have to update taken from \"per_day_data\"")
print(p_id_list)
total_update = str(len((p_id_list)))
print("Total no. of products with record \"Updated\" ------>"+str(len((p_id_list))))

Output of our collect operation---->  Row(p_id=182)
get the value of p_id ----->  182

List of p_id which we have to update taken from "per_day_data"
[182, 129, 121, 113, 176, 38, 100, 52, 42, 153, 64, 1, 82, 163, 193, 171, 1, 40, 192, 18, 156]
Total no. of products with record "Updated" ------>21


In [14]:
# mainTable_U_inserted.groupBy("p_id").count().filter("count > 1").show()

from_mainTable_U = mainTable_I_inserted.filter( col('p_id').isin(p_id_list))
from_mainTable_U.show()

+----+-----------+-----+-------------------+-----------+
|p_id|     p_name|price|     date_timestamp|record_type|
+----+-----------+-----+-------------------+-----------+
|   1|        Job| 1464|2020-05-02 08:06:42|          A|
|  18|     Lotlux|  565|2020-05-02 13:07:57|          A|
|  38|  Gembucket|  884|2020-02-20 03:31:07|          A|
|  40|   Treeflex|  247|2020-02-25 09:26:43|          A|
|  42|   Alphazap| 1783|2020-02-25 06:12:43|          A|
|  52|   Flowdesk|  564|2020-02-14 19:42:02|          A|
|  64|    Flexidy| 1409|2020-03-02 00:06:07|          A|
|  82|     Bamity|  917|2020-02-27 21:52:09|          A|
| 100|Stringtough|  911|2020-02-28 19:23:32|          A|
| 113|     Zathin|  157|2020-02-27 11:15:09|          A|
| 121|      Opela| 1055|2020-07-02 02:45:32|          A|
| 129|    Bitwolf|  616|2020-02-27 09:43:38|          A|
| 153|        Job| 1688|2020-02-20 11:33:22|          A|
| 156|     Namfix|  915|2020-02-20 17:13:22|          A|
| 163|    Redhold| 1868|2020-02

In [15]:
mT_and_pDD_union = from_mainTable_U.union(from_per_day_data_U)
mT_and_pDD_union.orderBy(mT_and_pDD_union.p_id).show()

+----+-----------+-----+-------------------+-----------+
|p_id|     p_name|price|     date_timestamp|record_type|
+----+-----------+-----+-------------------+-----------+
|   1|        Job| 1360|2020-04-04 09:22:27|          U|
|   1|        Job| 1464|2020-05-02 08:06:42|          A|
|   1|        Job|  307|2020-04-04 03:45:40|          U|
|  18|     Lotlux|  565|2020-05-02 13:07:57|          A|
|  18|     Lotlux|  888|2020-04-04 15:36:23|          U|
|  38|  Gembucket|    6|2020-04-04 21:41:47|          U|
|  38|  Gembucket|  884|2020-02-20 03:31:07|          A|
|  40|   Treeflex| 1359|2020-04-04 03:49:39|          U|
|  40|   Treeflex|  247|2020-02-25 09:26:43|          A|
|  42|   Alphazap| 1783|2020-02-25 06:12:43|          A|
|  42|   Alphazap|  940|2020-04-04 09:45:40|          U|
|  52|   Flowdesk|  564|2020-02-14 19:42:02|          A|
|  52|   Flowdesk|  737|2020-04-04 09:13:57|          U|
|  64|    Flexidy|  870|2020-04-04 09:50:17|          U|
|  64|    Flexidy| 1409|2020-03

In [16]:
for_join_mT_and_pDD = mT_and_pDD_union.groupBy("p_id").agg({"date_timestamp":"max"}).withColumnRenamed("max(date_timestamp)","date_timestamp_1")
for_join_mT_and_pDD = for_join_mT_and_pDD.withColumnRenamed("p_id","p_id_1")
for_join_mT_and_pDD.show()

+------+-------------------+
|p_id_1|   date_timestamp_1|
+------+-------------------+
|   193|2020-04-04 03:30:19|
|   192|2020-04-04 03:22:13|
|     1|2020-05-02 08:06:42|
|    52|2020-04-04 09:13:57|
|   182|2020-05-02 02:01:38|
|    40|2020-04-04 03:49:39|
|   163|2020-04-04 09:37:17|
|    64|2020-04-04 09:50:17|
|   100|2020-04-04 21:57:18|
|   176|2020-04-04 21:11:56|
|   171|2020-04-04 03:29:18|
|   129|2020-04-04 21:45:47|
|    38|2020-04-04 21:41:47|
|    82|2020-04-04 09:16:26|
|   113|2020-04-04 21:22:06|
|   121|2020-07-02 02:45:32|
|   156|2020-04-04 15:49:23|
|    42|2020-04-04 09:45:40|
|   153|2020-04-04 09:13:15|
|    18|2020-05-02 13:07:57|
+------+-------------------+



In [17]:
joined = mT_and_pDD_union.join(for_join_mT_and_pDD, (
    mT_and_pDD_union.p_id == for_join_mT_and_pDD.p_id_1) & (
    mT_and_pDD_union.date_timestamp == for_join_mT_and_pDD.date_timestamp_1) ,'inner')

joined = joined.select(['p_id','p_name','price','date_timestamp','record_type'])
joined.count()

20

In [18]:
mT_and_pDD_union.filter("p_id == 138").show()

+----+------+-----+--------------+-----------+
|p_id|p_name|price|date_timestamp|record_type|
+----+------+-----+--------------+-----------+
+----+------+-----+--------------+-----------+



In [19]:
mainTable_U_updated = mainTable_I_inserted.filter(~col('p_id').isin(p_id_list))
mainTable_U_updated.orderBy("p_id").count()

191

In [20]:
mainTable_U_updated_new = mainTable_U_updated.union(joined)
after_update = str(mainTable_U_updated_new.orderBy("p_id").count())
mainTable_U_updated_new.orderBy("p_id").count()

211

### Drop deleted products

In [27]:
# Drop deleted products

to_be_deleted = per_day_data.filter("record_type == 'D' ").collect()
p_id_list = [ i.p_id for i in to_be_deleted ]

print("\nList of p_id which we have to deleted taken from \"per_day_data\"")
print(p_id_list)
total_deleted = str(len((p_id_list)))
print("Total no. of products with record \"Updated\" ------>"+str(len((p_id_list))))


List of p_id which we have to deleted taken from "per_day_data"
[86, 125, 164, 177, 141, 178]
Total no. of products with record "Updated" ------>6


In [25]:
mainTable_D_deleted = mainTable_U_updated_new.filter(~col('p_id').isin(p_id_list))
mainTable_D_deleted.count()
after_delete = str(mainTable_D_deleted.count())

In [33]:
print("Total no. of products before insert----->"+before_insert)
print("Total no. of products after insert------>"+after_insert)
print("Total no. of inserted  ----------------->"+total_insert)
print("Total no. of products after update ----->"+after_update)
print("Total no. of products to be deleting---->"+total_deleted)
print("Total no. of products after deleting---->"+after_delete)

Total no. of products before insert----->200
Total no. of products after insert------>211
Total no. of inserted  ----------------->11
Total no. of products after update ----->211
Total no. of products to be deleting---->6
Total no. of products after deleting---->205
