In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, row_number
# Start (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Data_Correction')
    .getOrCreate()
)

### Create Dataframe from parsed parquet files

In [11]:
def parsed_parquet_path(date, type, basepath='/home/roger/SB/Guided_Capstone/output/parsed_data'):
    """Load one partition of the parsed data as a Spark DataFrame.

    Reads the parquet files under ``<basepath>/<date>/partition=<type>/``
    through the notebook-level ``spark`` session.

    Parameters
    ----------
    date : str
        Name of the date directory, e.g. ``'2020-08-05'``.
    type : str
        Partition value, e.g. ``'Q'`` or ``'T'``. The name shadows the
        builtin ``type``; it is kept so existing keyword callers still work.
    basepath : str, optional
        Root directory of the parsed output. Defaults to the location that
        used to be hard-coded here. Trailing slashes are ignored.

    Returns
    -------
    The DataFrame returned by ``spark.read.parquet`` for that directory.
    """
    # Strip trailing slashes so 'root' and 'root/' resolve to the same directory.
    partition_dir = '{}/{}/partition={}/'.format(basepath.rstrip('/'), date, type)
    return spark.read.parquet(partition_dir)

In [12]:
# Load the quote ('Q') and trade ('T') partitions for each trading day.
# 2020-08-05
common_quote_df_85, common_trade_df_85 = (
    parsed_parquet_path('2020-08-05', partition) for partition in ('Q', 'T')
)

# 2020-08-06
common_quote_df_86, common_trade_df_86 = (
    parsed_parquet_path('2020-08-06', partition) for partition in ('Q', 'T')
)

In [13]:
# Preview the 2020-08-05 quote records (only quotes are shown here, not trades),
# then count them; the count is the value displayed for this cell.
common_quote_df_85.show()
common_quote_df_85.count()

+----------+--------+------+--------+--------------------+------------+-------------------+--------+---------+--------+---------+--------+
|  trade_dt|rec_type|symbol|exchange|            event_tm|event_seq_nb|         arrival_tm|trade_pr|   bid_pr|bid_size|   ask_pr|ask_size|
+----------+--------+------+--------+--------------------+------------+-------------------+--------+---------+--------+---------+--------+
|2020-08-05|       Q|  SYMA|  NASDAQ|2020-08-05 09:36:...|           1|2020-08-05 09:30:00|    null| 76.10017|     100|  77.9648|     100|
|2020-08-05|       Q|  SYMA|  NASDAQ|2020-08-05 09:42:...|           2|2020-08-05 09:30:00|    null| 75.44373|     100| 75.94453|     100|
|2020-08-05|       Q|  SYMA|  NASDAQ|2020-08-05 09:48:...|           3|2020-08-05 09:30:00|    null|78.847984|     100| 80.69115|     100|
|2020-08-05|       Q|  SYMA|  NASDAQ|2020-08-05 09:53:...|           4|2020-08-05 09:30:00|    null| 74.98337|     100| 76.16257|     100|
|2020-08-05|       Q|  SYMA

540

### Creating Dataframe with columns specific to Quote or Trade

In [9]:
# The parsed DataFrames carry the columns of both record types, so keep only
# the columns that belong to trades or to quotes respectively.
# 2020-08-05
specific_trade_df_85 = common_trade_df_85.select(
    'trade_dt', 'symbol', 'exchange', 'event_tm',
    'event_seq_nb', 'arrival_tm', 'trade_pr',
)
specific_quote_df_85 = common_quote_df_85.select(
    'trade_dt', 'symbol', 'exchange', 'event_tm',
    'event_seq_nb', 'arrival_tm',
    'bid_pr', 'bid_size', 'ask_pr', 'ask_size',
)


In [10]:
# The parsed DataFrames carry the columns of both record types, so keep only
# the columns that belong to trades or to quotes respectively.
# 2020-08-06
specific_trade_df_86 = common_trade_df_86.select(
    'trade_dt', 'symbol', 'exchange', 'event_tm',
    'event_seq_nb', 'arrival_tm', 'trade_pr',
)
specific_quote_df_86 = common_quote_df_86.select(
    'trade_dt', 'symbol', 'exchange', 'event_tm',
    'event_seq_nb', 'arrival_tm',
    'bid_pr', 'bid_size', 'ask_pr', 'ask_size',
)


### Correcting Data 

In [32]:
def apply_latest_data(df):
    """Keep only the most recently arrived version of each record.

    A record is identified by (trade_dt, symbol, event_tm, event_seq_nb).
    Exchanges may resend a record in a later batch to correct earlier data,
    so rows are numbered within each identifier group by arrival_tm, newest
    first, and only the first row of every group is returned. The helper
    'row_number' column is dropped before returning.

    NOTE(review): 'exchange' is not part of the identifier; confirm that two
    exchanges can never emit the same (symbol, event_tm, event_seq_nb).
    """
    record_keys = ('trade_dt', 'symbol', 'event_tm', 'event_seq_nb')
    newest_first = Window.partitionBy(*record_keys).orderBy(desc('arrival_tm'))

    ranked = df.withColumn('row_number', row_number().over(newest_first))
    latest_only = ranked.where('row_number == 1')
    return latest_only.drop('row_number')

### Verifying that the apply_latest_data function works correctly

In [31]:
# Build one synthetic trade to verify the correction logic. It has the same
# record identifier (trade_dt, symbol, event_tm, event_seq_nb) as an existing
# 2020-08-05 trade, but a later arrival_tm and a different trade_pr.
test_schema = ['trade_dt', 'symbol', 'exchange', 'event_tm', 'event_seq_nb', 'arrival_tm', 'trade_pr']
test_data = [("2020-08-05", "SYMA", "NASDAQ", "2020-08-05 10:38:50.046", 10, "2020-08-05 09:45:00.0", 82.11)]
test_df = spark.createDataFrame(test_data, schema=test_schema)

# Append the synthetic row to the original trades (union matches columns by position).
df_with_testdata = specific_trade_df_85.union(test_df)

# Run the real function on the un-ranked union so the check below exercises
# apply_latest_data itself rather than a copy of its window logic.
corrected_with_testdata = apply_latest_data(df_with_testdata)

# For inspection: number the rows of each identifier group, latest arrival first.
WindowSpec = Window.partitionBy('trade_dt', 'symbol', 'event_tm', 'event_seq_nb').orderBy(desc('arrival_tm'))
df_with_testdata = df_with_testdata.withColumn('row_number', row_number().over(WindowSpec))

# Rows numbered 2 or greater are the superseded versions that the correction drops.
# row_number is an integer column, so compare against an integer.
df_with_testdata.filter(df_with_testdata['row_number'] >= 2).show()

# The test row has event_tm = 2020-08-05 10:38:50.046; the version with the
# later arrival time should carry row number 1.
df_with_testdata.filter(df_with_testdata['event_tm'] == '2020-08-05 10:38:50.046').show(truncate=False)

# Automated check: after correction exactly one row remains for the test
# identifier, and it is the later-arriving synthetic row (trade_pr 82.11).
surviving_rows = corrected_with_testdata.filter(
    (corrected_with_testdata['trade_dt'] == '2020-08-05')
    & (corrected_with_testdata['symbol'] == 'SYMA')
    & (corrected_with_testdata['event_tm'] == '2020-08-05 10:38:50.046')
    & (corrected_with_testdata['event_seq_nb'] == 10)
).collect()
assert len(surviving_rows) == 1, 'expected exactly one surviving row for the test identifier'
assert surviving_rows[0]['trade_pr'] == 82.11, 'apply_latest_data did not keep the latest-arriving record'

+----------+------+--------+--------------------+------------+-------------------+-----------------+----------+
|  trade_dt|symbol|exchange|            event_tm|event_seq_nb|         arrival_tm|         trade_pr|row_number|
+----------+------+--------+--------------------+------------+-------------------+-----------------+----------+
|2020-08-05|  SYMA|  NASDAQ|2020-08-05 10:38:...|          10|2020-08-05 09:30:00|77.77570343017578|         2|
+----------+------+--------+--------------------+------------+-------------------+-----------------+----------+

+----------+------+--------+-----------------------+------------+---------------------+-----------------+----------+
|trade_dt  |symbol|exchange|event_tm               |event_seq_nb|arrival_tm           |trade_pr         |row_number|
+----------+------+--------+-----------------------+------------+---------------------+-----------------+----------+
|2020-08-05|SYMA  |NASDAQ  |2020-08-05 10:38:50.046|10          |2020-08-05 09:45:00.0|8

In [16]:
# Keep only the latest-arriving version of every trade and quote record.
# 2020-08-05
trade_corrected_85, quote_corrected_85 = (
    apply_latest_data(specific_trade_df_85),
    apply_latest_data(specific_quote_df_85),
)

# 2020-08-06
trade_corrected_86, quote_corrected_86 = (
    apply_latest_data(specific_trade_df_86),
    apply_latest_data(specific_quote_df_86),
)

### End-of-day loading of corrected data

In [33]:
# Quote and Trade partitions include NYSE and NASDAQ data in each partition
def corrected_data_partition(df, date, type,
                             base_path='/home/roger/SB/Guided_Capstone/output/corrected_data/',
                             mode='overwrite'):
    """Write a corrected DataFrame to ``<base_path>/<date>/partition=<type>/``.

    Parameters
    ----------
    df : DataFrame
        Data to persist; it is written as parquet through ``df.write``.
    date : str
        Name of the date directory, e.g. ``'2020-08-05'``.
    type : str
        Partition value, e.g. ``'Q'`` or ``'T'``. The name shadows the
        builtin ``type``; it is kept so existing keyword callers still work.
    base_path : str, optional
        Root directory of the corrected output. Defaults to the location that
        used to be hard-coded here. Trailing slashes are ignored.
    mode : str, optional
        Spark save mode. Defaults to ``'overwrite'`` so the load can be
        re-run: with Spark's default mode the write fails as soon as the
        target directory already exists. Pass ``'errorifexists'`` to get the
        previous fail-if-present behaviour.

    Returns
    -------
    None
    """
    target_dir = '{}/{}/partition={}/'.format(base_path.rstrip('/'), date, type)
    # Chain off a single df.write access so the mode applies to this write.
    df.write.mode(mode).parquet(target_dir)


In [34]:
# Persist the corrected trades ('T') and quotes ('Q') for each trading day,
# 2020-08-05 first and then 2020-08-06.
for _frame, _trade_date, _rec_type in (
    (trade_corrected_85, '2020-08-05', 'T'),
    (quote_corrected_85, '2020-08-05', 'Q'),
    (trade_corrected_86, '2020-08-06', 'T'),
    (quote_corrected_86, '2020-08-06', 'Q'),
):
    corrected_data_partition(_frame, _trade_date, _rec_type)