In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, row_number
spark = SparkSession.builder.master('local[*]').appName('Analytical_ETL').getOrCreate() 

In [2]:
def corrected_parquet_path(date,type):
    basepath='/home/roger/SB/Guided_Capstone'
    filepath='/output/corrected_data/{}/partition={}/'.format(date,type)
    df=spark.read.parquet(basepath+filepath)
    return df

In [3]:
#read parquet files from EOD Corrected data
curr_quote=corrected_parquet_path('2020-08-06','Q')
curr_trade=corrected_parquet_path('2020-08-06','T')
prev_trade=corrected_parquet_path('2020-08-05','T')

In [4]:
#creating temp view
curr_quote.createOrReplaceTempView('tmp_curr_quote')
curr_trade.createOrReplaceTempView('tmp_curr_trade')
prev_trade.createOrReplaceTempView('tmp_prev_trade')

### Calculate Current Day Trade Analytics
#### 30 minute moving average

In [16]:
# uses tmp_curr_trade table
curr_trade_analytics=spark.sql('SELECT symbol,exchange,event_tm,event_seq_nb, trade_pr,AVG(trade_pr) OVER (PARTITION BY exchange,symbol ORDER BY event_tm RANGE BETWEEN INTERVAL 30 MINUTES PRECEDING AND CURRENT ROW) as mov_avg_pr FROM tmp_curr_trade')
curr_trade_analytics.show(5,truncate=False)
# create tmp_trade_analytics temporary table
# curr_trade_analytics.createOrReplaceTempView('tmp_curr_trade_analytics')

+------+--------+-----------------------+------------+--------+-----------------+
|symbol|exchange|event_tm               |event_seq_nb|trade_pr|mov_avg_pr       |
+------+--------+-----------------------+------------+--------+-----------------+
|SYMA  |NASDAQ  |2020-08-06 10:42:21.079|10          |78.93246|78.93245697021484|
|SYMA  |NASDAQ  |2020-08-06 12:00:29.595|20          |77.0967 |77.0967025756836 |
|SYMA  |NASDAQ  |2020-08-06 13:09:29.883|30          |78.31462|78.31462097167969|
|SYMA  |NASDAQ  |2020-08-06 14:27:08.62 |40          |75.84401|75.84400939941406|
|SYMA  |NASDAQ  |2020-08-06 15:39:00.929|50          |77.62613|77.62612915039062|
+------+--------+-----------------------+------------+--------+-----------------+
only showing top 5 rows



In [21]:
# save temp view of current trade analytics as table
curr_trade_analytics.write.saveAsTable('tmp_curr_trade_analytics') 

### Calculate Previous Day Trade Anaylitics
#### 30 minute moving average

In [9]:
# 30MA for previous days trade information
prev_trade_analytics=spark.sql('SELECT symbol,exchange,event_tm,event_seq_nb, trade_pr,AVG(trade_pr) OVER (PARTITION BY exchange,symbol ORDER BY event_tm RANGE BETWEEN INTERVAL 30 MINUTES PRECEDING AND CURRENT ROW) as mov_avg_pr FROM tmp_prev_trade')
prev_trade_analytics.orderBy('exchange','symbol','event_tm').show(truncate=False)
prev_trade_analytics.createOrReplaceTempView('tmp_prev_trade_analytics')

+------+--------+-----------------------+------------+---------+------------------+
|symbol|exchange|event_tm               |event_seq_nb|trade_pr |mov_avg_pr        |
+------+--------+-----------------------+------------+---------+------------------+
|SYMA  |NASDAQ  |2020-08-05 10:38:50.046|10          |77.7757  |77.77570343017578 |
|SYMA  |NASDAQ  |2020-08-05 11:58:33.106|20          |75.715225|75.71522521972656 |
|SYMA  |NASDAQ  |2020-08-05 13:09:24.38 |30          |75.87926 |75.87925720214844 |
|SYMA  |NASDAQ  |2020-08-05 14:22:41.39 |40          |78.324715|78.32471466064453 |
|SYMA  |NASDAQ  |2020-08-05 15:33:58.825|50          |75.72602 |75.72602081298828 |
|SYMA  |NASDAQ  |2020-08-05 16:46:43.764|60          |77.479485|77.47948455810547 |
|SYMA  |NASDAQ  |2020-08-05 17:58:10.324|70          |74.91233 |74.9123306274414  |
|SYMA  |NASDAQ  |2020-08-05 19:13:26.116|80          |77.69288 |77.69287872314453 |
|SYMA  |NASDAQ  |2020-08-05 20:26:46.538|90          |76.15561 |76.155609130

In [19]:
# filtering previous days trade analytics for only closing trade price and closing MA 
prev_close_trade_analytics=spark.sql('SELECT symbol,exchange,event_tm,event_seq_nb,trade_pr, mov_avg_pr FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY exchange, symbol ORDER BY event_tm DESC) as row FROM tmp_prev_trade_analytics) WHERE row=1')
prev_close_trade_analytics.show()
# creating temp view 
# prev_trade_analytics.createOrReplaceTempView('tmp_prev_close_trade_analytics')

+------+--------+--------------------+------------+---------+------------------+
|symbol|exchange|            event_tm|event_seq_nb| trade_pr|        mov_avg_pr|
+------+--------+--------------------+------------+---------+------------------+
|  SYMA|  NASDAQ|2020-08-05 21:40:...|         100| 77.24676| 77.24675750732422|
|  SYMB|  NASDAQ|2020-08-05 21:03:...|         100|35.537262|35.537261962890625|
|  SYMC|  NASDAQ|2020-08-05 21:49:...|         100|158.02032|158.02032470703125|
|  SYMA|    NYSE|2020-08-05 21:30:...|         100| 77.78611|  77.7861099243164|
|  SYMB|    NYSE|2020-08-05 21:27:...|         100|33.956287|  33.9562873840332|
|  SYMC|    NYSE|2020-08-05 21:52:...|         100|160.61949|160.61949157714844|
+------+--------+--------------------+------------+---------+------------------+



In [20]:
# save temp view of previous close trade analytics as table
prev_close_trade_analytics.write.saveAsTable('tmp_prev_close_trade_analytics') 

#### Join Quote with Current Trade Analytics

In [11]:
Updated_quote_df= spark.sql('SELECT q.symbol, q.exchange, q.event_tm, q.event_seq_nb, q.bid_pr, q.bid_size, q.ask_pr, q.ask_size, ma.trade_pr,ma.mov_avg_pr, ROW_NUMBER() OVER (PARTITION BY q.symbol, q.exchange, q.event_tm ORDER BY ma.event_tm DESC) as row_num FROM tmp_curr_quote as q LEFT JOIN tmp2_prev_trade_analytics as ma ON q.symbol = ma.symbol AND q.exchange = ma.exchange AND ma.event_tm < q.event_tm')
Updated_quote_df.createOrReplaceTempView('tmp_updated_quote')
Updated_quote_df.show()

+------+--------+--------------------+------------+---------+--------+---------+--------+--------+-----------------+-------+
|symbol|exchange|            event_tm|event_seq_nb|   bid_pr|bid_size|   ask_pr|ask_size|trade_pr|       mov_avg_pr|row_num|
+------+--------+--------------------+------------+---------+--------+---------+--------+--------+-----------------+-------+
|  SYMA|  NASDAQ|2020-08-06 09:38:...|           1|78.133705|     100|79.825165|     100|77.24676|77.24675750732422|      1|
|  SYMA|  NASDAQ|2020-08-06 09:46:...|           2| 76.52305|     100| 76.57241|     100|77.24676|77.24675750732422|      1|
|  SYMA|  NASDAQ|2020-08-06 09:52:...|           3| 78.74535|     100|  79.0928|     100|77.24676|77.24675750732422|      1|
|  SYMA|  NASDAQ|2020-08-06 09:58:...|           4|75.613625|     100|76.949776|     100|77.24676|77.24675750732422|      1|
|  SYMA|  NASDAQ|2020-08-06 10:07:...|           5| 77.45084|     100|78.725334|     100|77.24676|77.24675750732422|      1|


### Join Updated Quote with previous day close trade analytics and calculate spread

In [25]:
#uses coalese to select non-null trade analytics value in updated quote or previous day trade table
Final_quote=spark.sql('SELECT q.symbol, q.exchange, q.event_tm, q.event_seq_nb, q.bid_pr, q.bid_size, q.ask_pr, q.ask_size, q.bid_pr-c.trade_pr as bid_spread, q.ask_pr-c.trade_pr as ask_spread, coalesce(q.trade_pr,c.trade_pr) as last_trade_pr, coalesce(q.mov_avg_pr, c.mov_avg_pr) as last_mov_avg_pr FROM tmp_updated_quote as q LEFT JOIN tmp2_prev_trade_analytics as c ON q.exchange=c.exchange AND q.symbol=c.symbol where row_num=1')
Final_quote.show()

+------+--------+--------------------+------------+---------+--------+---------+--------+-----------+-----------+-------------+-----------------+
|symbol|exchange|            event_tm|event_seq_nb|   bid_pr|bid_size|   ask_pr|ask_size| bid_spread| ask_spread|last_trade_pr|  last_mov_avg_pr|
+------+--------+--------------------+------------+---------+--------+---------+--------+-----------+-----------+-------------+-----------------+
|  SYMA|  NASDAQ|2020-08-06 09:38:...|           1|78.133705|     100|79.825165|     100| 0.88694763|  2.5784073|     77.24676|77.24675750732422|
|  SYMA|  NASDAQ|2020-08-06 09:46:...|           2| 76.52305|     100| 76.57241|     100| -0.7237091| -0.6743469|     77.24676|77.24675750732422|
|  SYMA|  NASDAQ|2020-08-06 09:52:...|           3| 78.74535|     100|  79.0928|     100|  1.4985962|  1.8460388|     77.24676|77.24675750732422|
|  SYMA|  NASDAQ|2020-08-06 09:58:...|           4|75.613625|     100|76.949776|     100| -1.6331329| -0.2969818|     77.246

### Load analytical data to storage

In [27]:
def load_analytical_data(df,date):
    base_path='/home/roger/SB/Guided_Capstone/output/analytical_data/'
    df.write.parquet(base_path+'{}/'.format(date))
    return 



In [30]:
load_analytical_data(Final_quote,'2020-08-06')