In [1]:
import datetime

# Main entry point for DataFrame and SQL functionality.
from pyspark.sql import SparkSession

In [2]:
# Start SPARK Session
spark = SparkSession.builder.appName('Basics').getOrCreate()

In [3]:
# import pandas as pd

# # import data
# data = pd.read_csv("/home/bluepi/Downloads/Update/product_info/main table/product_info.csv")

# # convert day column type to datetime
# data['day'] = pd.to_datetime( data['day'], infer_datetime_format=True, yearfirst=True)

# # create new column
# data['date_timestamp'] = pd.to_datetime(data.day.astype(str) + ' ' + data.time)

# # Drop old columns
# data.drop(['day','time'],inplace=True,axis=1)

# # Write to csv
# data.to_csv("/home/bluepi/Downloads/Update/Updated Product/Latest Product/main table.csv", index=False)

In [3]:
mainTable = spark.read.format('csv').options(
    header=True, inferschema=True).load(
        "/home/bluepi/Downloads/Update/product_info/main table/main table.csv")

In [4]:
mainTable.printSchema()

root
 |-- p_id: integer (nullable = true)
 |-- p_name: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- date_timestamp: timestamp (nullable = true)



In [5]:
from pyspark.sql.functions import year, month, dayofyear, hour, minute, second

mainTable.select([year(mainTable['date_timestamp']).alias("Year"),
                  month(mainTable['date_timestamp']).alias("Month"),
                  dayofyear(mainTable['date_timestamp']).alias("DayOfYear"),
                  hour(mainTable['date_timestamp']).alias("Hour"),
                  minute(mainTable['date_timestamp']).alias("Minute"),
                  second(mainTable['date_timestamp']).alias("Second")]).show(3)

+----+-----+---------+----+------+------+
|Year|Month|DayOfYear|Hour|Minute|Second|
+----+-----+---------+----+------+------+
|2020|    5|      123|   8|     6|    42|
|2020|    2|       33|   8|     2|    22|
|2020|    2|       59|   7|    58|     8|
+----+-----+---------+----+------+------+
only showing top 3 rows



In [8]:
mainTable.select('date_timestamp').orderBy(mainTable.date_timestamp.asc()).show(6)

+-------------------+
|     date_timestamp|
+-------------------+
|2020-01-02 01:25:16|
|2020-01-02 05:05:59|
|2020-01-02 05:54:01|
|2020-01-02 09:08:05|
|2020-01-02 20:11:17|
|2020-02-02 01:16:34|
+-------------------+
only showing top 6 rows



In [26]:
# Address to the product_info folder
address = "/home/bluepi/Downloads/Update/product_info/"
previous_day = (datetime.datetime.today() - datetime.timedelta(days=6)).strftime('%d-%m-%Y')

# Address to the Previous Day folder
new_address = address + previous_day

# Read the Previous Day folder
per_day_data = spark.read.format('csv') \
          .options( header=True, inferschema=True ) \
          .load(new_address)

In [27]:
# per_day_data.show(3)
per_day_data.orderBy(per_day_data.Date_timestamp.asc()).show(60)
# per_day_data.printSchema()

+----+-----------+-----+-------------------+-----------+
|p_id|     p_name|price|     Date_timestamp|record_type|
+----+-----------+-----+-------------------+-----------+
| 117|Ventosanzap| 1498|2020-03-20 01:01:40|          D|
| 210|Solarbreeze| 1630|2020-03-20 01:09:29|          I|
| 196|     Tresom| 1609|2020-03-20 01:21:52|          D|
| 206|      Alpha| 1182|2020-03-20 01:30:40|          I|
| 226|  Lotstring| 1515|2020-03-20 01:46:50|          I|
|  50|     Bamity| 1567|2020-03-20 01:53:06|          D|
| 132|         It| 1684|2020-03-20 01:58:20|          D|
|  44|       Stim|  482|2020-03-20 06:52:19|          U|
|  66|Toughjoyfax| 1248|2020-03-20 06:54:28|          U|
|   6|    Andalax| 1173|2020-03-20 06:59:30|          U|
|  71|      Opela| 1260|2020-03-20 07:02:18|          U|
| 214|    Bitwolf|  910|2020-03-20 07:37:27|          I|
| 233|     Tresom| 1690|2020-03-20 07:40:11|          I|
| 216|     Lotlux|  613|2020-03-20 07:40:49|          I|
|  55|      Alpha|  468|2020-03

In [10]:
# # Create the schema
# from pyspark.sql.types import *

# schema = StructType([StructField("p_id", IntegerType(), True),
#                      StructField("p_name", StringType(), True),
#                      StructField("price", IntegerType(), True),
#                      StructField("Date_timestamp", TimestampType() , True),
#                      StructField("record_type", StringType(), True)
#                      ])
# # Create latest proct table
# # Initially it is empty no updates
# Latest_Product_Table = spark.createDataFrame([], schema)

In [29]:
# Directly append new Inserted products
from pyspark.sql.functions import *
per_day_data_1 = per_day_data.filter(
    per_day_data.record_type == 'I').select(['p_id', 'date_timestamp'])

ppd_1 = per_day_data_1.alias('ppd_1')
ppd_2 = per_day_data_1.alias('ppd_2')

per_day_data_2 = ppd_1.join(
    ppd_2, ppd_1.p_id == ppd_2.p_id, 'inner').filter(ppd_1.date_timestamp > ppd_2.date_timestamp)

per_day_data_2.show()
# per_day_data_2 = per_day_data_1.select(
#     ['p_id', 'p_name', 'price', 'Date_timestamp'])
# mainTable_1 = mainTable.union(per_day_data_2)

# # mainTable_1.count()

+----+-------------------+----+-------------------+
|p_id|     date_timestamp|p_id|     date_timestamp|
+----+-------------------+----+-------------------+
| 250|2020-03-20 19:42:54| 250|2020-03-20 19:42:54|
| 228|2020-03-20 20:03:41| 228|2020-03-20 13:28:37|
| 228|2020-03-20 20:03:41| 228|2020-03-20 20:03:41|
| 243|2020-03-20 19:36:02| 243|2020-03-20 19:36:02|
| 237|2020-03-20 20:00:52| 237|2020-03-20 20:00:52|
| 224|2020-03-20 19:57:29| 224|2020-03-20 19:57:29|
| 233|2020-03-20 07:40:11| 233|2020-03-20 07:40:11|
| 216|2020-03-20 07:40:49| 216|2020-03-20 07:40:49|
| 214|2020-03-20 07:37:27| 214|2020-03-20 07:37:27|
| 210|2020-03-20 01:09:29| 210|2020-03-20 01:09:29|
| 206|2020-03-20 01:30:40| 206|2020-03-20 01:30:40|
| 226|2020-03-20 01:46:50| 226|2020-03-20 01:46:50|
| 249|2020-03-20 13:29:38| 249|2020-03-20 13:29:38|
| 228|2020-03-20 13:28:37| 228|2020-03-20 13:28:37|
| 228|2020-03-20 13:28:37| 228|2020-03-20 20:03:41|
+----+-------------------+----+-------------------+



In [31]:
# Drop deleted products

# Registers this DataFrame as a temporary table using the given name
mainTable_1.registerTempTable("table_main")
per_day_data.registerTempTable("table_day")

mainTable_2 = spark.sql(
    "select * from table_main where p_id not in ( select p_id from table_day where record_type = 'D' )")

# mainTable_2.count()

NameError: name 'mainTable_1' is not defined

In [25]:
# update products
from pyspark.sql.functions import *
mainTable_2.select( countDistinct(mainTable_2.p_id) ).show()

+--------------------+
|count(DISTINCT p_id)|
+--------------------+
|                 206|
+--------------------+



In [30]:
mainTable_2.select('p_id').groupBy("p_id").count().filter( "count > 1" ).show()

NameError: name 'mainTable_2' is not defined