In [10]:
import datetime

# Main entry point for DataFrame and SQL functionality.
from pyspark.sql import SparkSession

In [11]:
# Build (or reuse an already running) Spark session named 'Basics'.
session_builder = SparkSession.builder.appName('Basics')
spark = session_builder.getOrCreate()

In [12]:
import pandas as pd

# Load the raw product export; the steps below read its `day` and `time` columns.
raw_data = pd.read_csv("/home/bluepi/Downloads/Update/product_info/main table/product_info_new.csv")

# Parse `day` as a year-first date. `infer_datetime_format` is deliberately not
# passed: it is deprecated since pandas 2.0 (format inference is the default
# there) and only triggers a warning.
parsed_day = pd.to_datetime(raw_data['day'], yearfirst=True)

# Combine date and time into a single `date_timestamp` column, then drop the
# two source columns. A new frame is built instead of mutating in place, so
# `raw_data` stays available for inspection.
data = (
    raw_data
    .assign(date_timestamp=pd.to_datetime(parsed_day.astype(str) + ' ' + raw_data['time']))
    .drop(columns=['day', 'time'])
)

# Write the result to CSV without the pandas index column.
data.to_csv("/home/bluepi/Downloads/Update/product_info/main table/main table.csv", index=False)

In [13]:
# Load the main-table CSV into Spark, using the first row as the header and
# letting Spark infer the column types.
main_table_path = "/home/bluepi/Downloads/Update/product_info/main table/main table.csv"
csv_reader = spark.read.format('csv').option('header', True).option('inferschema', True)
mainTable = csv_reader.load(main_table_path)

In [14]:
# Print the column names and the types Spark inferred for the main table.
mainTable.printSchema()

root
 |-- p_id: integer (nullable = true)
 |-- p_name: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- date_timestamp: timestamp (nullable = true)



In [6]:
from pyspark.sql.functions import year, month, dayofyear, hour, minute, second

# Split `date_timestamp` into its calendar and clock components and preview
# the first three rows.
ts_col = mainTable['date_timestamp']
ts_parts = [
    year(ts_col).alias("Year"),
    month(ts_col).alias("Month"),
    dayofyear(ts_col).alias("DayOfYear"),
    hour(ts_col).alias("Hour"),
    minute(ts_col).alias("Minute"),
    second(ts_col).alias("Second"),
]
mainTable.select(ts_parts).show(3)

+----+-----+---------+----+------+------+
|Year|Month|DayOfYear|Hour|Minute|Second|
+----+-----+---------+----+------+------+
|2020|    3|       62|   0|    17|    38|
|2020|    5|      124|  20|    40|    18|
|2020|    2|       59|  23|     2|     1|
+----+-----+---------+----+------+------+
only showing top 3 rows



In [7]:
# Show the 20 earliest timestamps in the main table (ascending order).
timestamps_sorted = mainTable.select('date_timestamp').orderBy(mainTable['date_timestamp'].asc())
timestamps_sorted.show(20)

+-------------------+
|     date_timestamp|
+-------------------+
|2020-01-01 01:07:06|
|2020-01-01 06:29:06|
|2020-01-01 15:55:41|
|2020-01-01 20:13:51|
|2020-01-01 22:26:12|
|2020-01-01 23:28:07|
|2020-01-02 11:32:34|
|2020-01-03 02:03:55|
|2020-01-03 09:41:29|
|2020-01-03 18:46:47|
|2020-01-03 20:53:19|
|2020-01-03 23:08:46|
|2020-01-13 02:20:05|
|2020-01-13 11:13:07|
|2020-01-13 20:58:38|
|2020-01-14 00:14:04|
|2020-01-14 00:15:12|
|2020-01-14 15:37:46|
|2020-01-14 21:56:14|
|2020-01-15 03:06:00|
+-------------------+
only showing top 20 rows



In [104]:
# Root product_info folder; the dated folder name (DD-MM-YYYY) is appended below.
address = "/home/bluepi/Downloads/Update/product_info/"

# Number of days before today that the folder to load is dated.
# NOTE(review): the name `previous_day` suggests an offset of 1, but the
# offset actually used is 6 — confirm which is intended before changing it.
DAYS_BACK = 6
previous_day = (datetime.datetime.today() - datetime.timedelta(days=DAYS_BACK)).strftime('%d-%m-%Y')

# Full path to the dated folder
new_address = address + previous_day

# Read the CSV data in the dated folder, using the first row as the header and
# letting Spark infer the column types.
per_day_data = (
    spark.read.format('csv')
    .options(header=True, inferschema=True)
    .load(new_address)
)

In [105]:
# Optional inspection helpers, left disabled (uncomment to preview rows):
# per_day_data.show(3)
# per_day_data.orderBy(per_day_data.Date_timestamp.asc()).show(60)
# Print the column names and types of the per-day data.
per_day_data.printSchema()

root
 |-- p_id: integer (nullable = true)
 |-- p_name: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- Date_timestamp: timestamp (nullable = true)
 |-- record_type: string (nullable = true)



In [100]:
# # Create the schema
# from pyspark.sql.types import *

# schema = StructType([StructField("p_id", IntegerType(), True),
#                      StructField("p_name", StringType(), True),
#                      StructField("price", IntegerType(), True),
#                      StructField("Date_timestamp", TimestampType() , True),
#                      StructField("record_type", StringType(), True)
#                      ])
# # Create the latest product table
# # Initially it is empty (no updates yet)
# Latest_Product_Table = spark.createDataFrame([], schema)

In [110]:
# Show the per-day rows whose record_type is 'I'
# (presumably "insert" records — TODO confirm).
type_i_rows = per_day_data.where(per_day_data['record_type'] == 'I')
type_i_rows.show()

+----+-----------+-----+-------------------+-----------+
|p_id|     p_name|price|     Date_timestamp|record_type|
+----+-----------+-----+-------------------+-----------+
| 929|     Latlux|   68|2020-03-19 16:14:38|          I|
| 417|     Vagram|  782|2020-03-19 15:22:08|          I|
| 279|Solarbreeze| 1368|2020-03-19 15:34:58|          I|
| 725|    Sonsing|  640|2020-03-19 15:49:46|          I|
| 675|     Tresom| 1368|2020-03-19 15:54:15|          I|
| 929|     Latlux|   68|2020-03-19 03:34:46|          I|
| 249|   Aerified|  265|2020-03-19 04:37:18|          I|
| 335|       Rank|  165|2020-03-19 21:44:51|          I|
| 834|       Span|  792|2020-03-19 10:04:51|          I|
| 157|       Span| 1217|2020-03-19 10:33:53|          I|
| 980|     Lotlux|  618|2020-03-19 10:05:23|          I|
| 404|   Domainer| 1173|2020-03-19 10:37:48|          I|
+----+-----------+-----+-------------------+-----------+



In [15]:
from pyspark.sql.functions import countDistinct,count
# Count how many distinct product ids (`p_id`) the main table contains.
distinct_p_id_count = countDistinct(mainTable['p_id'])
mainTable.select(distinct_p_id_count).show()
# mainTable.count()

+--------------------+
|count(DISTINCT p_id)|
+--------------------+
|                 200|
+--------------------+

