In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

In [16]:
# Read every CSV in the "main table" folder (first row is the header, column
# types are inferred) and print it.
(
    spark.read.format("csv")
    .option("header", True)
    .option("inferschema", True)
    .load("/home/bluepi/Downloads/Update/product_info/main table/")
    .show()
)
            

+----+-----------+-----+
|p_id|       name|Price|
+----+-----------+-----+
|   1|   Alphazap|   65|
|   2|     Hatity|   75|
|   3|    Cookley|   38|
|   4|Mat Lam Tam|   93|
|   5|       Stim|   88|
|   6|       Stim|   99|
|   7| Y-Solowarm|   80|
|   8|    Sonsing|   29|
|   9|     Keylex|   24|
|  10| Stronghold|   51|
+----+-----------+-----+



In [23]:
# subprocess lets us run a system command and collect its result.
import subprocess

# List the entries of the product_info folder.
#   capture_output=True -> collect standard output and standard error
#   check=True          -> raise CalledProcessError if `ls` fails (missing folder,
#                          permissions, ...) instead of silently continuing with an
#                          empty listing that makes every later folder check False
p1 = subprocess.run(
    ['ls', "/home/bluepi/Downloads/Update/product_info"],
    capture_output=True,
    check=True,
)

# Decode the captured bytes into one newline-separated string (one entry per line).
l = p1.stdout.decode('utf-8')

In [24]:
# Break the listing into one entry per line to get the folder names.
#   - dated folders are named ddmmyyyy, e.g. 12032020 --> 12th of March, 2020
#   - "main table" is the folder in which the main table is stored
l.split(sep='\n')

['12032020', '13032020', '15032020', '19032020', 'main table', '']

In [25]:
import pandas as pd

# Today's date as a ddmmyyyy string (the naming scheme of the dated folders).
d1 = f"{pd.to_datetime('today'):%d%m%Y}"

In [26]:
# Check whether the folder for the previous calendar day is present.
#
# The date is parsed back from its ddmmyyyy string and one real day is subtracted.
# Integer arithmetic (int(d1) - 1000000) is not date arithmetic: it drops the
# leading zero for days 01-10 (e.g. 05032020 -> "4032020") and produces a
# non-existent date on the first day of a month (01032020 -> "32020").
#
# Membership is tested against the list of folder names, not the raw listing
# string, so a date that is merely a substring of another entry does not match.
(pd.to_datetime(d1, format='%d%m%Y') - pd.Timedelta(days=1)).strftime('%d%m%Y') in l.split('\n')

True

In [27]:
# Store the previous calendar day, formatted ddmmyyyy, in "previous_day".
# Real date arithmetic is used so that month/year boundaries and the leading
# zero of single-digit days are handled correctly (int(d1) - 1000000 is wrong
# for days 01-10 and on the first day of a month).
previous_day = (pd.to_datetime(d1, format='%d%m%Y') - pd.Timedelta(days=1)).strftime('%d%m%Y')

In [28]:
# Root folder that holds the dated update folders and the main table.
address = "/home/bluepi/Downloads/Update/product_info/"

# Folder of the previous day: the root already ends with "/", so the folder
# name is appended directly.
new_address = f"{address}{previous_day}"

In [29]:
# Load all CSV files of the previous-day folder into one DataFrame
# (header row present, column types inferred).
data = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferschema", True)
    .load(new_address)
)

In [30]:
from  pyspark.sql.functions import input_file_name

# Tag every row with the complete path of the file it was read from.
data_1 = data.withColumn(colName="filename", col=input_file_name())

In [31]:
# Preview the rows together with their (truncated) source path.
data_1.show(n=20, truncate=True)

+----+---------------+-----+--------------------+
|p_id|           name|Price|            filename|
+----+---------------+-----+--------------------+
|   1|       Alphazap|  615|file:///home/blue...|
|   5|     Stim_trial| 9911|file:///home/blue...|
|   6| Stim_trial_exp| 1022|file:///home/blue...|
|   7|Y-Solowarm_2020|  344|file:///home/blue...|
|   8|        Sonsing|  556|file:///home/blue...|
|   9|    Keylex_2020| 7786|file:///home/blue...|
|   1|       Alphazap|   99|file:///home/blue...|
|   2|    Hatity_2020|  975|file:///home/blue...|
|   3|        Cookley|  999|file:///home/blue...|
|   4|    Mat Lam Tam|99999|file:///home/blue...|
|   1|       Alphazap| 9999|file:///home/blue...|
|   2|    Hatity_2020|  230|file:///home/blue...|
|   1|       Alphazap|   10|file:///home/blue...|
+----+---------------+-----+--------------------+



In [12]:
# Check what kind of object data_1 is (the recorded output is a Spark DataFrame)
type(data_1)

pyspark.sql.dataframe.DataFrame

In [33]:
# Look at the full source path stored on the first of the first five rows.
data_1.select("filename").take(5)[0]

Row(filename='file:///home/bluepi/Downloads/Update/product_info/19032020/Untitled4.csv')

In [35]:
from pyspark.sql.functions import split

In [36]:
# Keep only the file name (the last path segment) instead of the complete path.
# substring_index(..., "/", -1) returns everything after the final "/", so this
# works for any folder depth; the previous split(...)[9] was tied to the exact
# depth of /home/bluepi/Downloads/Update/product_info/<date>/.
from pyspark.sql.functions import substring_index

d2 = data_1.withColumn("filename_file", substring_index("filename", "/", -1))

In [37]:
# Preview the data with the extracted file name next to the full path.
d2.show(n=20, truncate=True)

+----+---------------+-----+--------------------+-------------+
|p_id|           name|Price|            filename|filename_file|
+----+---------------+-----+--------------------+-------------+
|   1|       Alphazap|  615|file:///home/blue...|Untitled4.csv|
|   5|     Stim_trial| 9911|file:///home/blue...|Untitled4.csv|
|   6| Stim_trial_exp| 1022|file:///home/blue...|Untitled4.csv|
|   7|Y-Solowarm_2020|  344|file:///home/blue...|Untitled4.csv|
|   8|        Sonsing|  556|file:///home/blue...|Untitled4.csv|
|   9|    Keylex_2020| 7786|file:///home/blue...|Untitled4.csv|
|   1|       Alphazap|   99|file:///home/blue...|Untitled3.csv|
|   2|    Hatity_2020|  975|file:///home/blue...|Untitled3.csv|
|   3|        Cookley|  999|file:///home/blue...|Untitled3.csv|
|   4|    Mat Lam Tam|99999|file:///home/blue...|Untitled3.csv|
|   1|       Alphazap| 9999|file:///home/blue...|Untitled2.csv|
|   2|    Hatity_2020|  230|file:///home/blue...|Untitled2.csv|
|   1|       Alphazap|   10|file:///home/blue...|Untitled1.csv|
+----+---------------+-----+--------------------+-------------+

In [38]:
# Drop the full-path column; the bare file name stays in "filename_file"
d3 = d2.drop("filename")

In [39]:
# The dataset now carries only the file name (not the full path) per row.
d3.show(n=20, truncate=True)

+----+---------------+-----+-------------+
|p_id|           name|Price|filename_file|
+----+---------------+-----+-------------+
|   1|       Alphazap|  615|Untitled4.csv|
|   5|     Stim_trial| 9911|Untitled4.csv|
|   6| Stim_trial_exp| 1022|Untitled4.csv|
|   7|Y-Solowarm_2020|  344|Untitled4.csv|
|   8|        Sonsing|  556|Untitled4.csv|
|   9|    Keylex_2020| 7786|Untitled4.csv|
|   1|       Alphazap|   99|Untitled3.csv|
|   2|    Hatity_2020|  975|Untitled3.csv|
|   3|        Cookley|  999|Untitled3.csv|
|   4|    Mat Lam Tam|99999|Untitled3.csv|
|   1|       Alphazap| 9999|Untitled2.csv|
|   2|    Hatity_2020|  230|Untitled2.csv|
|   1|       Alphazap|   10|Untitled1.csv|
+----+---------------+-----+-------------+



In [40]:
# Check what kind of object d3 is (the recorded output is a Spark DataFrame)
type(d3)

pyspark.sql.dataframe.DataFrame

In [41]:
# Register d3 as the temporary view "Each_day_data" so it can be queried through spark.sql
d3.createOrReplaceTempView("Each_day_data")

In [42]:
# Find the latest update of each product: for every p_id keep the row(s) that
# come from its most recent file.
#
# "Most recent" is decided by the last number in the file name, compared
# numerically. A plain max(filename_file) compares strings, which would rank
# "Untitled10.csv" below "Untitled2.csv".
#   - regexp_extract(..., '([0-9]+)[^0-9]*$', 1) picks the last run of digits
#   - nullif(..., '') turns "no digits" into NULL so the cast cannot fail
#   - NULLs sort last; the file name itself is the tie-breaker
#   - rank() (not row_number()) keeps every row of the winning file, as before
# The helper column file_rank is dropped so the result has the same columns as
# Each_day_data.
table = spark.sql("""
    select *
    from (
        select *,
               rank() over (
                   partition by p_id
                   order by cast(nullif(regexp_extract(filename_file, '([0-9]+)[^0-9]*$', 1), '') as bigint) desc nulls last,
                            filename_file desc
               ) as file_rank
        from Each_day_data
    ) ranked
    where file_rank = 1
    order by p_id, filename_file desc
""").drop("file_rank")

In [22]:
# Preview the latest row per product.
table.show(n=20, truncate=True)

+----+---------------+-----+-------------+
|p_id|           name|Price|filename_file|
+----+---------------+-----+-------------+
|   1|       Alphazap|  615|Untitled4.csv|
|   2|    Hatity_2020|  975|Untitled3.csv|
|   3|        Cookley|  999|Untitled3.csv|
|   4|    Mat Lam Tam|99999|Untitled3.csv|
|   5|     Stim_trial| 9911|Untitled4.csv|
|   6| Stim_trial_exp| 1022|Untitled4.csv|
|   7|Y-Solowarm_2020|  344|Untitled4.csv|
|   8|        Sonsing|  556|Untitled4.csv|
|   9|    Keylex_2020| 1786|Untitled4.csv|
+----+---------------+-----+-------------+



In [43]:
# Register the latest-update rows as the temporary view "Table" for the merge query
table.createOrReplaceTempView("Table")

In [44]:
# Updating the MAIN_TABLE: start by loading the current main table
# (header row present, column types inferred).
product_info = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferschema", True)
    .load("/home/bluepi/Downloads/Update/product_info/main table/product_info.csv")
)

In [45]:
# Register the main table as the temporary view "Product_info" so it can be queried through spark.sql
product_info.createOrReplaceTempView("Product_info")

In [26]:
# Current contents of the main table, read back through its temporary view.
spark.table("Product_info").show()

+----+-----------+-----+
|p_id|       name|Price|
+----+-----------+-----+
|   1|   Alphazap|   65|
|   2|     Hatity|   75|
|   3|    Cookley|   38|
|   4|Mat Lam Tam|   93|
|   5|       Stim|   88|
|   6|       Stim|   99|
|   7| Y-Solowarm|   80|
|   8|    Sonsing|   29|
|   9|     Keylex|   24|
|  10| Stronghold|   51|
+----+-----------+-----+



In [27]:
# Query to update the main table:
#   1. keep the main-table rows whose p_id has no update in "Table"
#   2. add the updated rows (p_id, name, Price) from "Table"
#
# NOT EXISTS is used instead of NOT IN: with NOT IN, a single NULL p_id in
# "Table" makes the predicate NULL for every row, so step 1 would silently
# return nothing and all non-updated products would be lost.
# The view name is back-quoted because TABLE is also a SQL keyword.
spark.sql("""
    select m.*
    from Product_info m
    where not exists (select 1 from `Table` t where t.p_id = m.p_id)
    union
    select p_id, name, Price from `Table`
    order by p_id
""").show()

+----+---------------+-----+
|p_id|           name|Price|
+----+---------------+-----+
|   1|       Alphazap|  615|
|   2|    Hatity_2020|  975|
|   3|        Cookley|  999|
|   4|    Mat Lam Tam|99999|
|   5|     Stim_trial| 9911|
|   6| Stim_trial_exp| 1022|
|   7|Y-Solowarm_2020|  344|
|   8|        Sonsing|  556|
|   9|    Keylex_2020| 1786|
|  10|     Stronghold|   51|
+----+---------------+-----+

