In [2]:
import logging
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Historical statistics recalculation.
#
# Aggregates `duration` from the `eventdurationhistorydata` table per
# (operation, details) pair and stores the average, minimum and maximum in
# three separate Hive tables.

_BANNER = "------------------------------------------------------------------------"

# One entry per statistic, listed in processing order. Each entry holds:
#   (DDL for the result table,
#    log line announcing the recalculation,
#    aggregate query,
#    log line announcing the store step,
#    fully qualified result table name)
_STATISTICS = (
    (
        "CREATE TABLE IF NOT EXISTS eventDurationHistoryDataAvg (operation string, details string, avg_duration float) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','",
        "----------Recalculating the averages...",
        "SELECT operation, details, AVG(duration) as avg_duration FROM eventdurationhistorydata GROUP BY operation, details",
        "----------Storing the averages...",
        'default.eventDurationHistoryDataAvg',
    ),
    (
        "CREATE TABLE IF NOT EXISTS eventDurationHistoryDataMin (operation string, details string, min_duration float) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','",
        "----------Recalculating the minimals...",
        "SELECT operation, details, MIN(duration) as min_duration FROM eventdurationhistorydata GROUP BY operation, details",
        "----------Storing the minimals...",
        'default.eventDurationHistoryDataMin',
    ),
    (
        "CREATE TABLE IF NOT EXISTS eventDurationHistoryDataMax (operation string, details string, max_duration float) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','",
        "----------Recalculating the maximals...",
        "SELECT operation, details, MAX(duration) as max_duration FROM eventdurationhistorydata GROUP BY operation, details",
        "----------Storing the maximals...",
        'default.eventDurationHistoryDataMax',
    ),
)

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logging.info(_BANNER)
logging.info("------------Starting the Historical data recalculation process----------")

# Reuse the running session if there is one, otherwise start one on YARN.
_session_builder = SparkSession.builder.appName("AK-StatisticsRecalculation").master('yarn')
spark = _session_builder.getOrCreate()

# Legacy switch; judging by its name it permits creating a managed table on a
# location that already contains files.
spark.conf.set("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation", "true")

# Step 1: make sure every result table exists before anything else happens.
for _ddl, _recalc_msg, _query, _store_msg, _table in _STATISTICS:
    spark.sql(_ddl)

# Step 2: define the three aggregations.
# NOTE(review): the captured run log shows these steps finishing within
# milliseconds while the store steps take seconds, so the actual work
# presumably happens at write time - confirm.
_frames = []
for _ddl, _recalc_msg, _query, _store_msg, _table in _STATISTICS:
    logging.info(_recalc_msg)
    _frames.append(spark.sql(_query))

# Keep the per-statistic DataFrames available under their established names.
df_avg, df_min, df_max = _frames

# Step 3: persist each aggregation, replacing the previous contents.
# NOTE(review): saveAsTable in overwrite mode presumably replaces the table
# definitions created in step 1 (float columns, ',' delimiter) - confirm
# against the DataFrameWriter documentation.
for (_ddl, _recalc_msg, _query, _store_msg, _table), _frame in zip(_STATISTICS, _frames):
    logging.info(_store_msg)
    _writer = _frame.write.mode('overwrite').format('hive')
    _writer.saveAsTable(_table)

# To inspect a result by hand, e.g.:
#   print(spark.sql("SELECT * FROM eventdurationhistorydataAvg").collect())
# (likewise for eventdurationhistorydataMin / eventdurationhistorydataMax).
logging.info("---------------------------Process Finished!!!--------------------------")
logging.info(_BANNER)

2021-03-01 19:13:56,058 - INFO - ------------------------------------------------------------------------
2021-03-01 19:13:56,059 - INFO - ------------Starting the Historical data recalculation process----------
2021-03-01 19:13:56,717 - INFO - ----------Recalculating the averages...
2021-03-01 19:13:56,977 - INFO - ----------Recalculating the minimals...
2021-03-01 19:13:57,000 - INFO - ----------Recalculating the maximals...
2021-03-01 19:13:57,019 - INFO - ----------Storing the averages...
2021-03-01 19:14:25,816 - INFO - ----------Storing the minimals...
2021-03-01 19:14:33,577 - INFO - ----------Storing the maximals...
2021-03-01 19:14:39,655 - INFO - ---------------------------Process Finished!!!--------------------------
2021-03-01 19:14:39,656 - INFO - ------------------------------------------------------------------------
