# Data Cleaning and Preparation

In [29]:
import numpy as np
import datetime
import pandas as pd
import os
from tqdm import tqdm
import concurrent.futures

## Google Trends Data

In [5]:
files = os.listdir("../csv/trends/raw")
files

['politics.csv',
 'option call.csv',
 'treasury.csv',
 'stock-market.csv',
 'loan.csv',
 'rigged economy.csv',
 'gold.csv',
 'stimulus.csv',
 'bonds.csv',
 'election.csv',
 'finance.csv',
 'lawsuit.csv',
 'millionaire.csv',
 'economy.csv',
 'fraud.csv',
 'luxury.csv',
 'venture capital.csv',
 'NYSE.csv',
 'NASDAQ.csv',
 'banking.csv',
 'default.csv',
 'investment banking.csv',
 'debt ceiling.csv',
 'supreme court.csv',
 'bull market.csv',
 'stock futures.csv',
 'wall street bets.csv',
 'insider trading.csv',
 'etf.csv',
 'hedge fund.csv',
 'bear market.csv',
 'silicon valley.csv',
 'immigration.csv',
 'executive order.csv',
 'technology.csv',
 'startup.csv',
 'jobs.csv',
 'prices.csv',
 'bankruptcy.csv',
 'mortgage.csv',
 'innovation.csv',
 'money.csv',
 'government spending.csv',
 'forex.csv',
 'analyst.csv',
 'rent.csv',
 'federal reserve.csv',
 'fed.csv',
 'betting.csv',
 'sports.csv',
 'investor confidence.csv',
 'investment.csv',
 'debt.csv',
 'earnings.csv',
 'bailout.csv',
 'spy

In [31]:
def clean_file(file):
    data = pd.read_csv("../csv/trends/raw/{}".format(file))
    # Compute rolling average 
    data["delta"] = data[file[:-4]].rolling(2).apply(lambda x: (x.iloc[1] - x.iloc[0])/x.iloc[0])
    data.to_csv("../csv/trends/cleaned/{}".format(file))

In [32]:
# Slow, Non Parallelized
# for file in tqdm(files):
#     data = pd.read_csv("../csv/trends/raw/{}".format(file))
#     data["delta"] = data[file[:-4]].rolling(2).apply(lambda x: (x.iloc[1] - x.iloc[0])/x.iloc[0])
#     data.to_csv("../csv/trends/cleaned/{}".format(file))

# Fast, Parallelized, Wow
with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor:
    executor.map(clean_file,
                 files)

## Bloomberg Options Data

In [6]:
data = pd.read_csv("../csv/options/raw/options.csv")
data.head()

Unnamed: 0,Dates,VOLUME_TOTAL_PUT,VOLUME_TOTAL_CALL,PX_VOLUME,PX_MID,PX_ASK,PX_BID,CALL_IMP_VOL_10D,CALL_IMP_VOL_30D,3MO_CALL_IMP_VOL,...,18MO_CALL_IMP_VOL,24MO_CALL_IMP_VOL,3MO_PUT_IMP_VOL,PUT_IMP_VOL_10D,6MO_PUT_IMP_VOL,PUT_IMP_VOL_30D,24MO_PUT_IMP_VOL,PUT_IMP_VOL_60D,18MO_PUT_IMP_VOL,12MO_PUT_IMP_VOL
0,#NAME?,1135715.0,647838.0,130991069.0,202.035,202.05,202.02,20.493,17.199,16.792,...,18.024,18.666,16.792,20.493,17.068,17.199,18.666,16.668,18.024,17.424
1,1/21/15,1420444.0,706260.0,122942707.0,203.095,203.1,203.09,18.862,16.168,15.941,...,17.709,18.363,15.941,18.862,16.535,16.168,18.363,15.577,17.709,17.01
2,1/22/15,1901104.0,1540655.0,174356029.0,206.005,206.01,206.0,14.312,13.865,14.707,...,17.137,17.996,14.707,14.312,15.429,13.865,17.996,14.064,17.137,16.27
3,1/23/15,1731851.0,917202.0,117516753.0,204.815,204.83,204.8,14.651,14.101,14.966,...,17.254,17.834,14.966,14.651,15.685,14.101,17.834,14.368,17.254,16.385
4,1/26/15,832915.0,524220.0,92009711.0,205.735,205.74,205.73,13.385,13.227,14.334,...,17.019,18.017,14.334,13.385,15.333,13.227,18.017,13.58,17.019,16.267


In [7]:
data.dtypes

Dates                 object
VOLUME_TOTAL_PUT     float64
VOLUME_TOTAL_CALL    float64
PX_VOLUME            float64
PX_MID               float64
PX_ASK               float64
PX_BID               float64
CALL_IMP_VOL_10D     float64
CALL_IMP_VOL_30D     float64
3MO_CALL_IMP_VOL     float64
6MO_CALL_IMP_VOL     float64
12MO_CALL_IMP_VOL    float64
18MO_CALL_IMP_VOL    float64
24MO_CALL_IMP_VOL    float64
3MO_PUT_IMP_VOL      float64
PUT_IMP_VOL_10D      float64
6MO_PUT_IMP_VOL      float64
PUT_IMP_VOL_30D      float64
24MO_PUT_IMP_VOL     float64
PUT_IMP_VOL_60D      float64
18MO_PUT_IMP_VOL     float64
12MO_PUT_IMP_VOL     float64
dtype: object

In [16]:
data["Dates"] = pd.to_datetime(data["Dates"], errors="coerce")
data = data[data["Dates"] != pd.NaT]
data.dtypes

Dates                datetime64[ns]
VOLUME_TOTAL_PUT            float64
VOLUME_TOTAL_CALL           float64
PX_VOLUME                   float64
PX_MID                      float64
PX_ASK                      float64
PX_BID                      float64
CALL_IMP_VOL_10D            float64
CALL_IMP_VOL_30D            float64
3MO_CALL_IMP_VOL            float64
6MO_CALL_IMP_VOL            float64
12MO_CALL_IMP_VOL           float64
18MO_CALL_IMP_VOL           float64
24MO_CALL_IMP_VOL           float64
3MO_PUT_IMP_VOL             float64
PUT_IMP_VOL_10D             float64
6MO_PUT_IMP_VOL             float64
PUT_IMP_VOL_30D             float64
24MO_PUT_IMP_VOL            float64
PUT_IMP_VOL_60D             float64
18MO_PUT_IMP_VOL            float64
12MO_PUT_IMP_VOL            float64
dtype: object

In [22]:
data = data.dropna()
data.head()

Unnamed: 0,Dates,VOLUME_TOTAL_PUT,VOLUME_TOTAL_CALL,PX_VOLUME,PX_MID,PX_ASK,PX_BID,CALL_IMP_VOL_10D,CALL_IMP_VOL_30D,3MO_CALL_IMP_VOL,...,18MO_CALL_IMP_VOL,24MO_CALL_IMP_VOL,3MO_PUT_IMP_VOL,PUT_IMP_VOL_10D,6MO_PUT_IMP_VOL,PUT_IMP_VOL_30D,24MO_PUT_IMP_VOL,PUT_IMP_VOL_60D,18MO_PUT_IMP_VOL,12MO_PUT_IMP_VOL
1,2015-01-21,1420444.0,706260.0,122942707.0,203.095,203.1,203.09,18.862,16.168,15.941,...,17.709,18.363,15.941,18.862,16.535,16.168,18.363,15.577,17.709,17.01
2,2015-01-22,1901104.0,1540655.0,174356029.0,206.005,206.01,206.0,14.312,13.865,14.707,...,17.137,17.996,14.707,14.312,15.429,13.865,17.996,14.064,17.137,16.27
3,2015-01-23,1731851.0,917202.0,117516753.0,204.815,204.83,204.8,14.651,14.101,14.966,...,17.254,17.834,14.966,14.651,15.685,14.101,17.834,14.368,17.254,16.385
4,2015-01-26,832915.0,524220.0,92009711.0,205.735,205.74,205.73,13.385,13.227,14.334,...,17.019,18.017,14.334,13.385,15.333,13.227,18.017,13.58,17.019,16.267
5,2015-01-27,1600611.0,852696.0,134044598.0,203.375,203.38,203.37,16.444,15.166,15.384,...,17.361,18.02,15.384,16.444,15.951,15.166,18.02,14.905,17.361,16.578


In [23]:
data.to_csv("../csv/options/cleaned/options.csv")