In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import Data

In [32]:
# Load the raw trade extract downloaded from WRDS (a zipped CSV; pandas
# decompresses it on the fly). Point the path at a different pull as needed.
rawdata_df = pd.read_csv(filepath_or_buffer='../data/taq/56bb9b959bbe275b_csv.zip')
# Preview the first five rows to sanity-check the columns
rawdata_df.head(5)

Unnamed: 0,DATE,TIME_M,EX,SYM_ROOT,SYM_SUFFIX,TR_SCOND,SIZE,PRICE,TR_CORR,TR_SEQNUM,TR_ID,TR_SOURCE,TR_RF
0,20200512,9:30:00.719399034,Q,ANGL,,@O X,31699,27.05,0,68153,3,N,
1,20200512,9:30:00.719524063,Q,ANGL,,@ Q,31699,27.05,0,68154,4,N,
2,20200512,9:30:02.952346185,Q,ANGL,,@,600,27.05,0,73839,5,N,
3,20200512,9:30:02.953119411,Q,ANGL,,@,3900,27.05,0,73840,6,N,
4,20200512,9:30:02.953862413,Q,ANGL,,@,500,27.05,0,73842,7,N,


# Process Data

In [33]:
# Keep every trade except those reported with exchange code "D".
# NOTE(review): "D" is presumably the off-exchange (FINRA TRF/ADF) code in TAQ — confirm.
# The explicit .copy() detaches the result from rawdata_df so the column
# assignments in later cells operate on an independent frame.
data_df = rawdata_df.loc[rawdata_df['EX'] != 'D'].copy()

In [34]:
# Build timestamp columns from the raw DATE and TIME_M fields:
#   'DATE TIME' - the two fields joined as text with a single space
#   'datetime'  - full trade timestamp parsed from that text
#   'date'      - calendar day only (parsed from DATE)
data_df['DATE TIME'] = (
    data_df['DATE'].astype(str)
    .str.cat(data_df['TIME_M'].astype(str), sep=' ')
)
data_df['datetime'] = pd.to_datetime(data_df['DATE TIME'])
data_df['date'] = pd.to_datetime(data_df['DATE'].astype(str))

In [35]:
# Add the analysis columns: `ticker` (a copy of SYM_ROOT) and `log_price`
# (natural log of PRICE), leaving the raw columns untouched.
data_df = data_df.assign(
    ticker=data_df['SYM_ROOT'],
    log_price=np.log(data_df['PRICE']),
)

In [36]:
# Resample on a 1 second basis
# Within each (ticker, date) group keep the last observed log price in every
# 1-second bin. label='right' stamps each bin with its right edge, so a trade
# at 09:30:00.7 is reported at 09:30:01.
data_df = (
    data_df.set_index("datetime")
    .groupby(["ticker", "date"])
    .resample('1s', label = 'right')["log_price"]
    .last()
    .reset_index()
)

# Seconds with no trade come back as NaN; carry the previous price forward
# inside the same (ticker, date) group so values never leak across tickers
# or days. GroupBy.ffill() replaces GroupBy.fillna(method="ffill"), which is
# deprecated in recent pandas. The result keeps the original row labels, so
# the assignment aligns on the index even though the rows were sorted first.
data_df["log_price"] = (
    data_df.sort_values(by="datetime")
    .groupby(["ticker", "date"])["log_price"]
    .ffill()
)

In [37]:
# Order rows chronologically, then derive:
#   price  - level price recovered from the (forward-filled) log price
#   return - one-step log-price difference within each (ticker, date) group;
#            the first row of every group has no predecessor and is NaN
data_df = data_df.sort_values("datetime").assign(
    price=lambda df: np.exp(df["log_price"]),
    **{"return": lambda df: df.groupby(["ticker", "date"])["log_price"].diff()},
)

In [38]:
# Preview the first ten rows of the processed panel
data_df.iloc[:10]

Unnamed: 0,ticker,date,datetime,log_price,price,return
0,ANGL,2020-05-12,2020-05-12 09:30:01,3.297687,27.05,
7159870,LQD,2020-05-12,2020-05-12 09:30:01,4.845367,127.15,
1193259,HYG,2020-05-12,2020-05-12 09:30:01,4.381026,79.92,
5966473,JNK,2020-05-12,2020-05-12 09:30:01,4.591578,98.65,
4773217,IGSB,2020-05-12,2020-05-12 09:30:01,3.983413,53.7,
3579960,IGIB,2020-05-12,2020-05-12 09:30:01,4.054217,57.64,
2386659,HYLB,2020-05-12,2020-05-12 09:30:01,3.81925,45.57,
2386660,HYLB,2020-05-12,2020-05-12 09:30:02,3.81925,45.57,0.0
1,ANGL,2020-05-12,2020-05-12 09:30:02,3.297687,27.05,0.0
1193260,HYG,2020-05-12,2020-05-12 09:30:02,4.381401,79.95,0.000375


# Export Data

In [None]:
# Write the processed panel to disk. to_csv is a DataFrame method (pandas has
# no top-level pd.to_csv), so it must be called on data_df itself.
# index=False drops the row labels, which carry no information after the sort.
data_df.to_csv('../data/clean/whatever.csv', index=False)