# Data Cleaning and Preparation

In [5]:
import numpy as np
import datetime
import pandas as pd
import os
from tqdm import tqdm
import concurrent.futures

## Google Trends Data

In [17]:
# List the raw Google Trends CSV exports; each entry is a bare file name
# (no directory), which is what clean_file expects.
files = os.listdir("../csv/trends/raw")
files

['analyst.csv',
 'bailout.csv',
 'banking.csv',
 'bankruptcy.csv',
 'bear market.csv',
 'betting.csv',
 'billionaire.csv',
 'bonds.csv',
 'bull market.csv',
 'debt ceiling.csv',
 'debt.csv',
 'default.csv',
 'democrat.csv',
 'disease.csv',
 'division.csv',
 'earnings.csv',
 'economy.csv',
 'election.csv',
 'etf.csv',
 'executive order.csv',
 'fed.csv',
 'federal reserve.csv',
 'finance.csv',
 'fire.csv',
 'forex.csv',
 'fraud.csv',
 'GDP.csv',
 'gold.csv',
 'government spending.csv',
 'health.csv',
 'hedge fund.csv',
 'immigration.csv',
 'inflation.csv',
 'innovation.csv',
 'insider trading.csv',
 'interest rates.csv',
 'investment banking.csv',
 'investment.csv',
 'investor confidence.csv',
 'jobs.csv',
 'law.csv',
 'lawsuit.csv',
 'loan.csv',
 'luxury.csv',
 'millionaire.csv',
 'money.csv',
 'mortgage.csv',
 'NASDAQ.csv',
 'news.csv',
 'NYSE.csv',
 'option call.csv',
 'option put.csv',
 'politics.csv',
 'prayers.csv',
 'prices.csv',
 'recession.csv',
 'rent.csv',
 'republican.csv',
 

In [18]:
def clean_file(file, raw_dir="../csv/trends/raw", cleaned_dir="../csv/trends/cleaned"):
    """Clean one raw trends CSV and write it to the cleaned directory.

    Reads ``raw_dir/file``, removes the ``isPartial`` column when present,
    keeps only the last row for each duplicated ``date`` value, and writes
    the result (without the pandas index) to ``cleaned_dir/file``.

    Parameters
    ----------
    file : str
        Bare file name of the CSV (e.g. ``"analyst.csv"``).
    raw_dir : str, optional
        Directory holding the raw CSV. Defaults to the notebook's raw folder.
    cleaned_dir : str, optional
        Directory receiving the cleaned CSV; created if it does not exist.

    Raises
    ------
    KeyError
        If the CSV has no ``date`` column.
    """
    data = pd.read_csv(os.path.join(raw_dir, file))
    # errors="ignore": a file without the isPartial column has nothing to
    # drop, so it should not abort the cleaning run.
    data = data.drop(columns="isPartial", errors="ignore")
    # keep="last": when the same timestamp appears more than once, the row
    # that occurs latest in the file wins.
    data = data.drop_duplicates(subset="date", keep="last")
    # exist_ok=True keeps this safe when several worker threads call it at once.
    os.makedirs(cleaned_dir, exist_ok=True)
    data.to_csv(os.path.join(cleaned_dir, file), index=False)

In [29]:
# Slow, Non Parallelized
# for file in tqdm(files):
#     clean_file(file)

In [19]:
# Fast, parallelized: clean every raw file on a pool of worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor:
    # executor.map only re-raises a worker's exception when its result is
    # retrieved from the returned iterator. Consuming the iterator makes a
    # failure in clean_file stop the cell instead of passing unnoticed.
    list(executor.map(clean_file, files))

In [24]:
# Rebind `files` to the contents of the cleaned directory; the cells below
# load these cleaned CSVs by name.
files = os.listdir("../csv/trends/cleaned")
files

['analyst.csv',
 'bailout.csv',
 'banking.csv',
 'bankruptcy.csv',
 'bear market.csv',
 'betting.csv',
 'billionaire.csv',
 'bonds.csv',
 'bull market.csv',
 'debt ceiling.csv',
 'debt.csv',
 'default.csv',
 'democrat.csv',
 'disease.csv',
 'division.csv',
 'earnings.csv',
 'economy.csv',
 'election.csv',
 'etf.csv',
 'executive order.csv',
 'fed.csv',
 'federal reserve.csv',
 'finance.csv',
 'fire.csv',
 'forex.csv',
 'fraud.csv',
 'GDP.csv',
 'gold.csv',
 'government spending.csv',
 'health.csv',
 'hedge fund.csv',
 'immigration.csv',
 'inflation.csv',
 'innovation.csv',
 'insider trading.csv',
 'interest rates.csv',
 'investment banking.csv',
 'investment.csv',
 'investor confidence.csv',
 'jobs.csv',
 'law.csv',
 'lawsuit.csv',
 'loan.csv',
 'luxury.csv',
 'millionaire.csv',
 'money.csv',
 'mortgage.csv',
 'NASDAQ.csv',
 'news.csv',
 'NYSE.csv',
 'option call.csv',
 'option put.csv',
 'politics.csv',
 'prayers.csv',
 'prices.csv',
 'recession.csv',
 'rent.csv',
 'republican.csv',
 

In [25]:
# Load every cleaned CSV into its own DataFrame, in the order given by `files`,
# then display the first one as a sanity check.
data = [
    pd.read_csv("../csv/trends/cleaned/{}".format(name))
    for name in files
]
data[0]

Unnamed: 0,date,analyst
0,2015-01-01 00:00:00,38
1,2015-01-01 01:00:00,38
2,2015-01-01 02:00:00,40
3,2015-01-01 03:00:00,43
4,2015-01-01 04:00:00,42
...,...,...
57189,2021-07-17 20:00:00,43
57190,2021-07-17 21:00:00,44
57191,2021-07-17 22:00:00,43
57192,2021-07-17 23:00:00,46


In [26]:
# Parse each frame's "date" column into datetimes and promote it to the index.
# Note: "date" stops being a column afterwards, so re-running this cell on the
# already-indexed frames raises a KeyError.
for position, frame in enumerate(data):
    frame["date"] = pd.to_datetime(frame["date"])
    data[position] = frame.set_index("date")

In [27]:
# Inspect the first frame's index after "date" was set as the index.
data[0].index

DatetimeIndex(['2015-01-01 00:00:00', '2015-01-01 01:00:00',
               '2015-01-01 02:00:00', '2015-01-01 03:00:00',
               '2015-01-01 04:00:00', '2015-01-01 05:00:00',
               '2015-01-01 06:00:00', '2015-01-01 07:00:00',
               '2015-01-01 08:00:00', '2015-01-01 09:00:00',
               ...
               '2021-07-17 15:00:00', '2021-07-17 16:00:00',
               '2021-07-17 17:00:00', '2021-07-17 18:00:00',
               '2021-07-17 19:00:00', '2021-07-17 20:00:00',
               '2021-07-17 21:00:00', '2021-07-17 22:00:00',
               '2021-07-17 23:00:00', '2021-07-18 00:00:00'],
              dtype='datetime64[ns]', name='date', length=57194, freq=None)

In [28]:
# One boolean per frame: True when that frame's index has no duplicate entries.
[frame.index.is_unique for frame in data]

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [29]:
# Combine all frames side by side (axis=1), aligning rows on the index.
# join="outer" keeps every index value present in any frame; positions a
# frame does not cover are filled with NaN.
full_data = pd.concat(data, ignore_index=False, axis=1, join="outer")
full_data.head()

Unnamed: 0_level_0,analyst,bailout,banking,bankruptcy,bear market,betting,billionaire,bonds,bull market,debt ceiling,...,stocks,supreme court,technology,treasury,unemployment,vaccine,venture capital,wall street bets,wall street,war
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 00:00:00,38.0,47.0,28.0,50,40.0,68.0,28,30,86.0,9.0,...,76.0,48.0,32.0,33.0,45,54.0,21.0,0.0,60.0,76.0
2015-01-01 01:00:00,38.0,60.0,28.0,53,36.0,40.0,36,27,100.0,19.0,...,74.0,56.0,37.0,33.0,40,56.0,15.0,0.0,63.0,79.0
2015-01-01 02:00:00,40.0,69.0,30.0,62,30.0,48.0,29,35,61.0,5.0,...,73.0,56.0,39.0,35.0,38,52.0,99.0,0.0,60.0,80.0
2015-01-01 03:00:00,43.0,33.0,38.0,34,28.0,41.0,33,34,47.0,24.0,...,96.0,53.0,45.0,41.0,37,54.0,21.0,0.0,57.0,79.0
2015-01-01 04:00:00,42.0,31.0,41.0,38,34.0,48.0,33,39,41.0,19.0,...,72.0,54.0,47.0,45.0,33,59.0,20.0,0.0,54.0,74.0


In [31]:
# Replace every missing value in the merged frame with 0.
# NOTE(review): after this, a timestamp that was absent from a source file is
# indistinguishable from a genuine 0 reading — confirm this is intended.
full_data = full_data.fillna(0)
full_data.head()


Unnamed: 0_level_0,analyst,bailout,banking,bankruptcy,bear market,betting,billionaire,bonds,bull market,debt ceiling,...,stocks,supreme court,technology,treasury,unemployment,vaccine,venture capital,wall street bets,wall street,war
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 00:00:00,38.0,47.0,28.0,50,40.0,68.0,28,30,86.0,9.0,...,76.0,48.0,32.0,33.0,45,54.0,21.0,0.0,60.0,76.0
2015-01-01 01:00:00,38.0,60.0,28.0,53,36.0,40.0,36,27,100.0,19.0,...,74.0,56.0,37.0,33.0,40,56.0,15.0,0.0,63.0,79.0
2015-01-01 02:00:00,40.0,69.0,30.0,62,30.0,48.0,29,35,61.0,5.0,...,73.0,56.0,39.0,35.0,38,52.0,99.0,0.0,60.0,80.0
2015-01-01 03:00:00,43.0,33.0,38.0,34,28.0,41.0,33,34,47.0,24.0,...,96.0,53.0,45.0,41.0,37,54.0,21.0,0.0,57.0,79.0
2015-01-01 04:00:00,42.0,31.0,41.0,38,34.0,48.0,33,39,41.0,19.0,...,72.0,54.0,47.0,45.0,33,59.0,20.0,0.0,54.0,74.0


In [33]:
# Persist the merged frame; the index is written as the first CSV column.
# NOTE(review): the path is relative to the current working directory, not
# under ../csv like the other data files — confirm this location is intended.
full_data.to_csv("trends.csv")

## WRDS Options Data

In [1]:
# Paths of the raw options CSVs to clean, built from the label each file name
# starts with.
options_files = [
    "../csv/options/raw/{}options.csv".format(label)
    for label in ("2015-16", "2016-17", "2017-18", "2018-19", "2019-20")
]

In [3]:
def clean_options(file, output_dir="../csv/options/cleaned"):
    """Clean one raw options CSV and save it under ``output_dir``.

    Parses the ``date`` and ``exdate`` columns as datetimes (values that
    cannot be parsed become NaT), drops every row that contains any missing
    value, and writes the result to ``output_dir`` under the input's own
    file name. Progress messages are printed at each stage.

    Parameters
    ----------
    file : str
        Path of the raw CSV to clean.
    output_dir : str, optional
        Directory receiving the cleaned CSV; created if it does not exist.

    Raises
    ------
    KeyError
        If the CSV has no ``date`` or ``exdate`` column.
    """
    print("Reading")
    data = pd.read_csv(file)
    print("Fixing Dates")
    # errors='coerce' turns unparseable dates into NaT so the dropna() below
    # removes those rows instead of the parse raising.
    data["date"] = pd.to_datetime(data["date"], errors='coerce')
    data["exdate"] = pd.to_datetime(data["exdate"], errors='coerce')
    print("Dropping NAs")
    data = data.dropna()
    print("Saving to File")
    # Name the output after the input file. The [-1] must select the last path
    # component; applied to the formatted string it would yield only its final
    # character and every file would be written to the same bogus path.
    os.makedirs(output_dir, exist_ok=True)
    data.to_csv(os.path.join(output_dir, os.path.basename(file)))

In [6]:
# Clean the raw options files one after another, printing a banner before
# and a confirmation after each file.
for file in options_files:
    print("Cleaning {}".format(file))
    clean_options(file)
    print("Done.")


Cleaning ../csv/options/raw/2015-16options.csv
Reading
Fixing Dates
Dropping NAs
Saving to File
Done.
Cleaning ../csv/options/raw/2016-17options.csv
Reading
Fixing Dates
Dropping NAs
Saving to File
Done.
Cleaning ../csv/options/raw/2017-18options.csv
Reading
Fixing Dates
Dropping NAs
Saving to File
Done.
Cleaning ../csv/options/raw/2018-19options.csv
Reading
Fixing Dates
Dropping NAs
Saving to File
Done.
Cleaning ../csv/options/raw/2019-20options.csv
Reading
Fixing Dates
Dropping NAs
Saving to File
Done.
