# Data Analysis
### Analyzes updated data and writes output to Blob storage

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [None]:
# Azure Blob storage connection settings.
import os

storageAccountName = 'exchangedata1'
ContainerName = 'source-container'

# SECURITY: the account access key must never be committed to source control.
# It is read from the AZURE_STORAGE_KEY environment variable instead
# (NOTE(review): on Databricks this can presumably be set in the cluster
# configuration and backed by a secret scope — confirm the deployment setup).
# The key that was previously hard-coded in this cell has been exposed and
# should be rotated for the storage account.
storageAccountAccessKey = os.environ.get('AZURE_STORAGE_KEY')
if not storageAccountAccessKey:
    raise RuntimeError(
        'AZURE_STORAGE_KEY is not set; provide the storage account access key '
        'through the environment before running this notebook.'
    )

# Register the key on the Spark session so wasbs:// paths for this account
# can be read and written by the cells below.
spark.conf.set(f'fs.azure.account.key.{storageAccountName}.blob.core.windows.net', storageAccountAccessKey)

In [None]:
import os
# List the entries under /dbfs/output/ as a quick sanity check; being the last
# expression of the cell, the returned list is what the notebook displays.
os.listdir('/dbfs/output/')

In [None]:
def EOD_parquet_path(date,type):
    """Read one partition of the corrected end-of-day data from Blob storage.

    Args:
        date: Name of the date folder under ``/output/EOD_corrected/``
            (this notebook passes strings such as ``'2020-08-06'``).
        type: Value of the ``partition=<type>`` sub-folder (this notebook
            passes ``'Q'`` and ``'T'``).

    Returns:
        The DataFrame returned by ``spark.read.parquet`` for that location.
    """
    container_root = f'wasbs://{ContainerName}@{storageAccountName}.blob.core.windows.net'
    partition_dir = f'/output/EOD_corrected/{date}/partition={type}/'
    return spark.read.parquet(container_root + partition_dir)
 

In [None]:
# Load the corrected EOD partitions used by the analysis:
# 'Q' and 'T' partitions for 2020-08-06, and the 'T' partition for 2020-08-05.
curr_quote = EOD_parquet_path(date='2020-08-06', type='Q')
curr_trade = EOD_parquet_path(date='2020-08-06', type='T')
prev_trade = EOD_parquet_path(date='2020-08-05', type='T')

In [None]:
# Expose each loaded DataFrame to Spark SQL under a temporary view name.
for view_name, frame in (
    ('tmp_curr_quote', curr_quote),
    ('tmp_curr_trade', curr_trade),
    ('tmp_prev_trade', prev_trade),
):
    frame.createOrReplaceTempView(view_name)

### Calculate Current Day Trade Analytics
#### 30-minute moving average and trade price

In [None]:
# Query the tmp_curr_trade view: for every trade, average trade_pr over the
# trades of the same exchange/symbol whose event_tm lies within the preceding
# 30 minutes (current row included). The adjacent literals below concatenate
# into a single SQL statement.
curr_trade_analytics = spark.sql(
    'SELECT symbol,exchange,event_tm,event_seq_nb, trade_pr,'
    'AVG(trade_pr) OVER ('
    'PARTITION BY exchange,symbol ORDER BY event_tm '
    'RANGE BETWEEN INTERVAL 30 MINUTES PRECEDING AND CURRENT ROW'
    ') as mov_avg_pr '
    'FROM tmp_curr_trade'
)
# Preview the first five rows without truncating column values.
curr_trade_analytics.show(n=5, truncate=False)
# No temporary view is registered here; the following cell persists this
# result as a table instead.

In [None]:
# Persist the current-day trade analytics as a table so that later cells can
# query it by name.
# NOTE(review): the table is expected to land under /dbfs/user/hive/warehouse/
# (a later cell lists that folder) — confirm for this workspace.
# mode('overwrite') makes the cell re-runnable: with the default save mode,
# saveAsTable raises once 'tmp_curr_trade_analytics' already exists.
curr_trade_analytics.write.mode('overwrite').saveAsTable('tmp_curr_trade_analytics')

### Calculate Previous Day Trade Analytics
#### 30-minute moving average and trade price

In [None]:
# Same 30-minute moving average as for the current day, computed over the
# previous day's trades (tmp_prev_trade view). The adjacent literals below
# concatenate into a single SQL statement.
prev_trade_analytics = spark.sql(
    'SELECT symbol,exchange,event_tm,event_seq_nb, trade_pr,'
    'AVG(trade_pr) OVER ('
    'PARTITION BY exchange,symbol ORDER BY event_tm '
    'RANGE BETWEEN INTERVAL 30 MINUTES PRECEDING AND CURRENT ROW'
    ') as mov_avg_pr '
    'FROM tmp_prev_trade'
)
# Preview five rows, sorted so the output is easy to eyeball.
(
    prev_trade_analytics
    .orderBy('exchange', 'symbol', 'event_tm')
    .show(5, truncate=False)
)
# Make the result queryable by the closing-trade cell below.
prev_trade_analytics.createOrReplaceTempView('tmp_prev_trade_analytics')

In [None]:
# Keep only the closing row of the previous day for each exchange/symbol:
# the row with the latest event_tm, carrying its trade price and moving average.
# event_seq_nb is added as a tie-breaker so the chosen row is deterministic
# when several trades share the same event_tm; ordering by event_tm alone
# leaves the pick arbitrary in that case.
# NOTE(review): assumes a higher event_seq_nb means a later event — confirm.
prev_close_trade_analytics = spark.sql(
    'SELECT symbol,exchange,event_tm,event_seq_nb,trade_pr, mov_avg_pr '
    'FROM ('
    'SELECT *, ROW_NUMBER() OVER ('
    'PARTITION BY exchange, symbol '
    'ORDER BY event_tm DESC, event_seq_nb DESC'
    ') as row '
    'FROM tmp_prev_trade_analytics'
    ') WHERE row=1'
)
prev_close_trade_analytics.show()
# No temporary view is registered here; the following cell persists this
# result as a table instead.

In [None]:
# Persist the previous-day closing trade analytics as a table so the final
# join can query it by name.
# mode('overwrite') makes the cell re-runnable: with the default save mode,
# saveAsTable raises once 'tmp_prev_close_trade_analytics' already exists.
prev_close_trade_analytics.write.mode('overwrite').saveAsTable('tmp_prev_close_trade_analytics')

In [None]:
# List the entries under /dbfs/user/hive/warehouse/ to check that the tables
# saved above are present (relies on `os` imported in an earlier cell).
os.listdir('/dbfs/user/hive/warehouse/')

#### Join Quote with Current Trade Analytics

In [None]:
# Attach to every current-day quote the trades of the same symbol/exchange
# that happened strictly before it, and rank those trades newest-first;
# row_num = 1 therefore marks the most recent preceding trade (or the single
# NULL-padded row when a quote has no earlier trade, because of the LEFT JOIN).
#
# The window is partitioned per quote, including q.event_seq_nb. Without it,
# quotes sharing symbol/exchange/event_tm fall into one partition, only one of
# them receives row_num = 1, and the others are lost when the next step
# filters on row_num = 1.
# ma.event_seq_nb breaks ties between trades with identical event_tm so the
# selected trade is deterministic.
# NOTE(review): assumes event_seq_nb distinguishes events that share a
# timestamp and increases over time — confirm against the ingestion schema.
Updated_quote_df = spark.sql(
    'SELECT q.symbol, q.exchange, q.event_tm, q.event_seq_nb, '
    'q.bid_pr, q.bid_size, q.ask_pr, q.ask_size, '
    'ma.trade_pr,ma.mov_avg_pr, '
    'ROW_NUMBER() OVER ('
    'PARTITION BY q.symbol, q.exchange, q.event_tm, q.event_seq_nb '
    'ORDER BY ma.event_tm DESC, ma.event_seq_nb DESC'
    ') as row_num '
    'FROM tmp_curr_quote as q '
    'LEFT JOIN tmp_curr_trade_analytics as ma '
    'ON q.symbol = ma.symbol AND q.exchange = ma.exchange '
    'AND ma.event_tm < q.event_tm'
)
Updated_quote_df.show()
# Registered as a view so the final join can filter on row_num.
Updated_quote_df.createOrReplaceTempView('tmp_updated_quote')


### Join updated quotes with previous-day closing trade analytics and calculate spread

In [None]:
# Build the final quote records from the rows of tmp_updated_quote with
# row_num = 1, left-joined to the previous day's closing analytics:
#   * bid_spread / ask_spread: bid and ask price minus the previous close price
#   * last_trade_pr / last_mov_avg_pr: COALESCE picks the value from the
#     updated quote when it is non-null, otherwise the previous-day close value
# The adjacent literals below concatenate into a single SQL statement.
Final_quote = spark.sql(
    'SELECT q.symbol, q.exchange, q.event_tm, q.event_seq_nb, '
    'q.bid_pr, q.bid_size, q.ask_pr, q.ask_size, '
    'q.bid_pr-c.trade_pr as bid_spread, q.ask_pr-c.trade_pr as ask_spread, '
    'coalesce(q.trade_pr,c.trade_pr) as last_trade_pr, '
    'coalesce(q.mov_avg_pr, c.mov_avg_pr) as last_mov_avg_pr '
    'FROM tmp_updated_quote as q '
    'LEFT JOIN tmp_prev_close_trade_analytics as c '
    'ON q.exchange=c.exchange AND q.symbol=c.symbol '
    'where row_num=1'
)
Final_quote.show()

### Load analytical data to Blob storage

In [None]:
def load_analytical_data(df,date,mode=None):
    """Write an analytical DataFrame to Blob storage as parquet.

    The data is written to ``/output/Analytical_data/<date>/`` inside the
    container configured by ``ContainerName`` / ``storageAccountName``.

    Args:
        df: DataFrame to write.
        date: Name of the date folder to write into (this notebook passes
            strings such as ``'2020-08-06'``).
        mode: Optional Spark save mode (for example ``'overwrite'`` or
            ``'append'``). The default ``None`` leaves Spark's default save
            mode in effect, which is the original behaviour and fails when
            the target directory already exists; pass ``'overwrite'`` to
            re-run a load for a date that was already written.

    Returns:
        None.
    """
    blob_path = 'wasbs://{}@{}.blob.core.windows.net'.format(ContainerName, storageAccountName)
    dir_path = '/output/Analytical_data/{}/'.format(date)
    df.write.parquet(blob_path + dir_path, mode=mode)



In [None]:
# Write the final quote analytics for 2020-08-06 to Blob storage.
load_analytical_data(df=Final_quote, date='2020-08-06')