In [36]:
import datetime

# Main entry point for DataFrame and SQL functionality.
from pyspark.sql import SparkSession

In [37]:
# Create the Spark session (or reuse one that already exists) under the
# application name 'Basics'; every later cell reads data through `spark`.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [40]:
# import pandas as pd

# # import data
# data = pd.read_csv("/home/bluepi/Downloads/Update/product_info/main table/product_info.csv")

# # convert day column type to datetime
# data['day'] = pd.to_datetime( data['day'], infer_datetime_format=True, yearfirst=True)

# # create new column
# data['date_timestamp'] = pd.to_datetime(data.day.astype(str) + ' ' + data.time)

# # Drop old columns
# data.drop(['day','time'],inplace=True,axis=1)

# # Write to csv
# data.to_csv("/home/bluepi/Downloads/Update/Updated Product/Latest Product/main table.csv", index=False)

In [29]:
# Read the main product table from CSV: the first row supplies the column
# names and Spark infers each column's type from the data.
mainTable = (
    spark.read
    .format('csv')
    .options(header=True, inferschema=True)
    .load("/home/bluepi/Downloads/Update/product_info/main table/main table.csv")
)

In [30]:
# Show the column names, types and nullability Spark inferred for the main table.
mainTable.printSchema()

root
 |-- p_id: integer (nullable = true)
 |-- p_name: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- date_timestamp: timestamp (nullable = true)



In [31]:
from pyspark.sql.functions import year, month, dayofyear, hour, minute, second

# Split date_timestamp into its calendar and clock components and preview
# the first three rows.
mainTable.select(
    year(mainTable.date_timestamp).alias("Year"),
    month(mainTable.date_timestamp).alias("Month"),
    dayofyear(mainTable.date_timestamp).alias("DayOfYear"),
    hour(mainTable.date_timestamp).alias("Hour"),
    minute(mainTable.date_timestamp).alias("Minute"),
    second(mainTable.date_timestamp).alias("Second"),
).show(3)

+----+-----+---------+----+------+------+
|Year|Month|DayOfYear|Hour|Minute|Second|
+----+-----+---------+----+------+------+
|2020|    5|      123|   8|     6|    42|
|2020|    2|       33|   8|     2|    22|
|2020|    2|       59|   7|    58|     8|
+----+-----+---------+----+------+------+
only showing top 3 rows



In [32]:
# Show the two earliest timestamps present in the main table.
(
    mainTable
    .select('date_timestamp')
    .orderBy(mainTable['date_timestamp'].asc())
    .show(2)
)

+-------------------+
|     date_timestamp|
+-------------------+
|2020-01-02 01:25:16|
|2020-01-02 05:05:59|
+-------------------+
only showing top 2 rows



In [33]:
# Root of the product_info folder; the per-day folder name is appended to it.
address = "/home/bluepi/Downloads/Update/product_info/"

# Folder name is a date formatted as DD-MM-YYYY.
# NOTE(review): the variable is called `previous_day`, but the offset below is
# 8 days before today, not 1 -- confirm which offset is intended.
previous_day = (
    datetime.datetime.today() - datetime.timedelta(days=8)
).strftime('%d-%m-%Y')

# Full path of the dated folder to load.
new_address = address + previous_day

# Read the CSV data under the dated folder, with a header row and inferred
# column types.
per_day_data = (
    spark.read
    .format('csv')
    .options(header=True, inferschema=True)
    .load(new_address)
)

In [25]:
# per_day_data.show(3)
# List the loaded records from earliest to latest Date_timestamp (up to 60 rows).
(
    per_day_data
    .orderBy(per_day_data['Date_timestamp'].asc())
    .show(60)
)
# per_day_data.printSchema()

+----+-----------+-----+-------------------+-----------+
|p_id|     p_name|price|     Date_timestamp|record_type|
+----+-----------+-----+-------------------+-----------+
| 248| Y-Solowarm| 1829|2020-03-17 00:47:21|          I|
| 143|    Flexidy| 1312|2020-03-17 00:48:59|          U|
|  32|     Latlux|  436|2020-03-17 01:01:39|          U|
| 191|     Keylex| 1075|2020-03-17 01:02:32|          U|
| 220|     Keylex| 1516|2020-03-17 01:05:44|          I|
|  64|    Flexidy|  990|2020-03-17 01:21:40|          U|
| 222|  Lotstring|  297|2020-03-17 01:27:48|          I|
|  88|      Opela|  220|2020-03-17 01:28:35|          U|
|  46|     Y-find|  325|2020-03-17 01:38:25|          U|
| 236|   Aerified| 1955|2020-03-17 01:42:43|          I|
| 209|   Wrapsafe| 1969|2020-03-17 01:49:04|          I|
|  89|     Keylex| 1219|2020-03-17 02:02:57|          U|
|  50|     Bamity|  287|2020-03-17 02:04:27|          U|
|  96|   Overhold|  835|2020-03-17 02:06:11|          U|
| 141|  Lotstring|  273|2020-03

In [34]:
# # Create the schema
# from pyspark.sql.types import *

# schema = StructType([StructField("p_id", IntegerType(), True),
#                      StructField("p_name", StringType(), True),
#                      StructField("price", IntegerType(), True),
#                      StructField("Date_timestamp", TimestampType() , True),
#                      StructField("record_type", StringType(), True)
#                      ])
# # Create the latest product table
# # Initially it is empty (no updates yet)
# Latest_Product_Table = spark.createDataFrame([], schema)

In [35]:
# Keep only the rows whose record_type is 'I' and display them.
(
    per_day_data
    .where(per_day_data['record_type'] == 'I')
    .show()
)

+----+-----------+-----+-------------------+-----------+
|p_id|     p_name|price|     Date_timestamp|record_type|
+----+-----------+-----+-------------------+-----------+
| 222|  Lotstring|  297|2020-03-17 01:27:48|          I|
| 209|   Wrapsafe| 1969|2020-03-17 01:49:04|          I|
| 220|     Keylex| 1516|2020-03-17 01:05:44|          I|
| 248| Y-Solowarm| 1829|2020-03-17 00:47:21|          I|
| 236|   Aerified| 1955|2020-03-17 01:42:43|          I|
| 210|Solarbreeze| 1630|2020-03-17 13:36:23|          I|
| 250|   Alphazap| 1194|2020-03-17 13:40:02|          I|
| 232|     Latlux| 1316|2020-03-17 12:48:25|          I|
| 206|      Alpha| 1182|2020-03-17 13:07:10|          I|
| 243|   Tampflex| 1341|2020-03-17 07:38:44|          I|
| 206|      Alpha| 1182|2020-03-17 20:08:34|          I|
| 204|        Job| 1047|2020-03-17 20:17:16|          I|
| 246|    Zontrax| 1107|2020-03-17 20:14:38|          I|
| 227|    Fix San|  149|2020-03-17 18:59:22|          I|
+----+-----------+-----+-------

In [12]:
from pyspark.sql.functions import (
    countDistinct,
    count,
)

# Count how many distinct product ids (p_id) the main table contains.
mainTable.select(
    countDistinct(mainTable['p_id'])
).show()
# mainTable.count()

+--------------------+
|count(DISTINCT p_id)|
+--------------------+
|                 200|
+--------------------+

