In [1]:
import datetime

# Main entry point for DataFrame and SQL functionality.
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [3]:
# import pandas as pd

# # import data
# data = pd.read_csv("/home/bluepi/Downloads/Update/product_info/main table/product_info.csv")

# # convert day column type to datetime
# data['day'] = pd.to_datetime( data['day'], infer_datetime_format=True, yearfirst=True)

# # create new column
# data['date_timestamp'] = pd.to_datetime(data.day.astype(str) + ' ' + data.time)

# # Drop old columns
# data.drop(['day','time'],inplace=True,axis=1)

# # Write to csv
# data.to_csv("/home/bluepi/Downloads/Update/Updated Product/Latest Product/main table.csv", index=False)

In [4]:
# Load the main product table from CSV; the first row holds the column names
# and column types are inferred from the data.
mainTable = (
    spark.read.format('csv')
    .option("header", True)
    .option("inferschema", True)
    .load("/home/bluepi/Downloads/Update/product_info/main table/main table.csv")
)

In [5]:
# Print the column names and the types inferred while reading the CSV.
mainTable.printSchema()

root
 |-- p_id: integer (nullable = true)
 |-- p_name: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- date_timestamp: timestamp (nullable = true)



In [6]:
from pyspark.sql.functions import year, month, dayofyear, hour, minute, second

# Split date_timestamp into its calendar and clock components and preview
# the first three rows. Each (label, function) pair yields one output column.
mainTable.select([
    extract(mainTable['date_timestamp']).alias(label)
    for label, extract in (
        ("Year", year),
        ("Month", month),
        ("DayOfYear", dayofyear),
        ("Hour", hour),
        ("Minute", minute),
        ("Second", second),
    )
]).show(3)

+----+-----+---------+----+------+------+
|Year|Month|DayOfYear|Hour|Minute|Second|
+----+-----+---------+----+------+------+
|2020|    5|      123|   8|     6|    42|
|2020|    2|       33|   8|     2|    22|
|2020|    2|       59|   7|    58|     8|
+----+-----+---------+----+------+------+
only showing top 3 rows



In [7]:
# Show the two earliest timestamps present in the main table.
(
    mainTable
    .select('date_timestamp')
    .orderBy(mainTable['date_timestamp'].asc())
    .show(2)
)

+-------------------+
|     date_timestamp|
+-------------------+
|2020-01-02 01:25:16|
|2020-01-02 05:05:59|
+-------------------+
only showing top 2 rows



In [8]:
# Root folder that holds one sub-folder of change records per day.
address = "/home/bluepi/Downloads/Update/product_info/"

# Folder names are dates formatted as dd-mm-YYYY.
# NOTE(review): despite the name, this looks 8 days back from today, not 1;
# confirm which offset is intended before relying on it.
previous_day = (
    datetime.datetime.now() - datetime.timedelta(days=8)
).strftime('%d-%m-%Y')

# Full path of the dated folder.
new_address = f"{address}{previous_day}"

# Read every CSV in the dated folder, with a header row and inferred types.
per_day_data = (
    spark.read.format('csv')
    .option("header", True)
    .option("inferschema", True)
    .load(new_address)
)

In [33]:
# Preview the five earliest change records of the day.
# Alternatives kept for quick inspection:
#   per_day_data.show(3)
#   per_day_data.printSchema()
(
    per_day_data
    .orderBy(per_day_data['Date_timestamp'].asc())
    .show(5)
)

+----+----------+-----+-------------------+-----------+
|p_id|    p_name|price|     Date_timestamp|record_type|
+----+----------+-----+-------------------+-----------+
|  70|  Aerified| 1149|2020-03-18 00:56:32|          D|
| 248|Y-Solowarm| 1829|2020-03-18 01:00:27|          I|
| 220|    Keylex| 1516|2020-03-18 01:16:02|          I|
| 155|Trippledex|  535|2020-03-18 01:17:27|          U|
| 179|   Pannier| 1673|2020-03-18 01:30:18|          D|
+----+----------+-----+-------------------+-----------+
only showing top 5 rows



In [10]:
# # Create the schema
# from pyspark.sql.types import *

# schema = StructType([StructField("p_id", IntegerType(), True),
#                      StructField("p_name", StringType(), True),
#                      StructField("price", IntegerType(), True),
#                      StructField("Date_timestamp", TimestampType() , True),
#                      StructField("record_type", StringType(), True)
#                      ])
# # Create the latest product table
# # Initially it is empty (no updates yet)
# Latest_Product_Table = spark.createDataFrame([], schema)

In [63]:
# Directly append new Inserted products
# Import only the name used here; a wildcard import of pyspark.sql.functions
# shadows Python built-ins such as sum, min and max.
from pyspark.sql.functions import col

# Keys and timestamps of the day's Insert ('I') records.
per_day_data_1 = per_day_data.filter(
    per_day_data.record_type == 'I').select(['p_id', 'date_timestamp'])

ppd_1 = per_day_data_1.alias('ppd_1')
ppd_2 = per_day_data_1.alias('ppd_2')

# Sanity check: list products that were inserted more than once in the day.
# Both aliases come from the same DataFrame, so attribute access such as
# ppd_1.date_timestamp and ppd_2.date_timestamp refers to the same underlying
# column and a "greater than" comparison between them can never be true.
# Alias-qualified column names keep the two sides of the self-join apart.
per_day_data_2 = ppd_1.join(
    ppd_2, col('ppd_1.p_id') == col('ppd_2.p_id'), 'inner').filter(
        col('ppd_1.date_timestamp') > col('ppd_2.date_timestamp'))

per_day_data_2.show()

# Append the Insert rows to the main table. union() matches columns by
# position, so the rows are first projected onto the main table's column order
# (p_id, p_name, price, timestamp). mainTable_1 is the input of the
# "Drop deleted products" step, so it must be defined here.
mainTable_1 = mainTable.union(
    per_day_data.filter(per_day_data.record_type == 'I')
    .select(['p_id', 'p_name', 'price', 'Date_timestamp']))

+----+--------------+----+--------------+
|p_id|date_timestamp|p_id|date_timestamp|
+----+--------------+----+--------------+
+----+--------------+----+--------------+



In [45]:
# Drop deleted products

# Expose both DataFrames to Spark SQL under fixed view names.
# createOrReplaceTempView supersedes registerTempTable, which has been
# deprecated since Spark 2.0; re-running the cell simply replaces the views.
mainTable_1.createOrReplaceTempView("table_main")
per_day_data.createOrReplaceTempView("table_day")

# Keep only the products that have no Delete ('D') record for the day.
# NULL ids are excluded from the subquery: a single NULL there makes NOT IN
# evaluate to unknown for every row, which would silently empty the result.
mainTable_2 = spark.sql(
    "select * from table_main where p_id not in "
    "( select p_id from table_day where record_type = 'D' and p_id is not null )")

# mainTable_2.count()

In [25]:
# Count the distinct product ids that remain after deleted products are removed.
# Import only the name used here; a wildcard import of pyspark.sql.functions
# shadows Python built-ins such as sum, min and max.
from pyspark.sql.functions import countDistinct
mainTable_2.select(countDistinct(mainTable_2.p_id)).show()

+--------------------+
|count(DISTINCT p_id)|
+--------------------+
|                 206|
+--------------------+



In [32]:
# List every product id that occurs more than once in mainTable_2.
(
    mainTable_2
    .select('p_id')
    .groupBy("p_id")
    .count()
    .where("count > 1")
    .show()
)

+----+-----+
|p_id|count|
+----+-----+
| 206|    2|
| 220|    2|
+----+-----+

