# Model - Stage1C - Cache

Cache the outputs from stage 1 for better performance.

In [1]:
import pyspark

# Spark settings for this notebook: local mode on all available cores,
# with explicit executor/driver memory and default parallelism.
sparkConfig = {
    'spark.executor.memory': '30g',
    'spark.driver.memory': '60g',
    'spark.master': 'local[*]',
    'spark.default.parallelism': '30',
}

# Build the SparkConf step by step; every setter returns the same object,
# so mutating it in place is equivalent to chaining the calls.
conf = pyspark.SparkConf()
conf.setMaster('local[*]')
conf.setAppName('Model - Stage1C - Cache')
conf.setAll(list(sparkConfig.items()))

# One SparkContext for the whole notebook.
sc = pyspark.SparkContext(conf=conf)

In [2]:
import datetime
import importlib
import json, pickle
import numpy as N
import numpy.random as NR
import matplotlib.pyplot as pyplot
import seaborn
import pandas as P
from pathlib import Path
import shutil
import pyspark.mllib as SM
import pyspark.mllib.feature as SMF

# Select seaborn's 'whitegrid' plot style for this notebook session.
seaborn.set_style('whitegrid')

In [3]:
# Import the project helper module, then force a reload so that edits made to
# common/data.py since the kernel started are picked up without a restart.
import common.data
importlib.reload(common.data)

<module 'common.data' from '/data/common/data.py'>

In [4]:
# --- Locations of inputs and outputs (all relative to the working directory) ---
pathChirps = Path('datasets') / 'Chirps'
pathStocks = Path('datasets') / 'Stocks'
pathAnalyticsChirp = Path('analytics') / 'Chirps'
pathModelEmbedding = Path('models') / 'embedding' / 'word2vec'
pathStage1 = Path('models') / 'stage1'

pathTrainInstances = pathChirps / 'instances_train.tsv'
pathTestInstances = pathChirps / 'instances_test.tsv'

# --- Date windows; the test window starts exactly where training ends ---
startTrain = datetime.date(year=2017, month=1, day=1)
endTrain = datetime.date(year=2019, month=1, day=1)
startTest = endTrain
endTest = datetime.date(year=2019, month=7, day=1)

# --- Settings forwarded to the common.data helpers and saved with the output ---
conf = dict(
    sampleSize=2048,
    nSamplesPerDay=32,
    partitions=32,
)

In [5]:
# Load the training tweets as an RDD, using the partition count from `conf`.
# (For a quick experiment on a subset, append `.sample(False, 0.001)` to the call.)
tweets_train = common.data.create_tweets_rdd(
    pathTrainInstances,
    sc=sc,
    partitions=conf['partitions'],
)
# Peek at the first few records.
tweets_train.take(4)

[(datetime.date(2017, 12, 5), 'to recognize jerusalem as the capital'),
 (datetime.date(2017, 12, 5), 'trump tell abbas'),
 (datetime.date(2017, 12, 5),
  'hdtalk wisconsin to drug test food stamp users'),
 (datetime.date(2017, 12, 5), 'trump shrink the size')]

In [6]:
# Load one example ticker; only its date index is used below, to obtain the
# dates that fall inside the train and test windows.

ticker = "AAPL"
stockDf = P.read_csv(pathStocks / f'{ticker}.csv').set_index('Date')

# Convert the ISO-formatted date strings of the index into datetime.date objects.
stockDf.index = stockDf.index.map(lambda text: datetime.datetime.fromisoformat(text).date())

# Half-open windows: start <= day < end.
trainMask = stockDf.index.map(lambda day: startTrain <= day < endTrain)
testMask = stockDf.index.map(lambda day: startTest <= day < endTest)
idx_train = stockDf.index[trainMask]
idx_test = stockDf.index[testMask]

In [14]:
# Save the settings in `conf` as JSON inside the stage-1 output directory, so the
# parameters used for this run are recorded alongside the exported instances.
# The directory is created first: in notebook order this cell runs before any
# Spark export has created it, and open(..., 'w') does not create parent folders.
pathStage1.mkdir(parents=True, exist_ok=True)
with open(pathStage1 / 'properties.json', 'w') as f:
    json.dump(conf, f)

### Train set

In [7]:
# Export the train instances. Any earlier export is deleted first (presumably
# because saveAsPickleFile will not overwrite an existing directory - confirm).
pOut = pathStage1 / 'instances_train'
if pOut.exists():
    shutil.rmtree(pOut)

# Build the train RDD from the tweets and the train date index, using the
# settings in `conf`, then write it out as a pickle file.
rdd_train = common.data.create_data_rdd(
    tweets_train,
    idx_train,
    sc=sc,
    **conf
)
rdd_train.saveAsPickleFile(str(pOut))

In [8]:
# Inspect the first ten records of the train RDD as a sanity check.
rdd_train.take(10)

[(datetime.date(2017, 12, 15),
  array(['harvey weinstein have issue a statement',
         'beijing be build the infrastructure', 'israel hit gaza', ...,
         'be in top for 2017', 'to sell off units for billion',
         'beijing be build the infrastructure'], dtype='<U79')),
 (datetime.date(2017, 12, 15),
  array(['news woman use bitcoin', 'donald trump spend christmas',
         'bout ukad this whole thing be a mess', ...,
         'blake farenthold wo not seek re-election',
         'house ethics committee revive sexual harassment investigation',
         'donald trump spend christmas'], dtype='<U79')),
 (datetime.date(2017, 12, 15),
  array(['brexit deal will tie uk', 'news woman use bitcoin',
         'bitcoin craze propel coinbase app', ...,
         'sarah sanders shut down acosta',
         'to recognize jerusalem as israeli capital - statement',
         'the final season wo not return until 2019'], dtype='<U79')),
 (datetime.date(2017, 12, 15),
  array(['australia lega

### Test set

In [8]:
# Load the test tweets as an RDD.
# NOTE(review): unlike the train call, no `partitions=` argument is passed here,
# so create_tweets_rdd falls back to its own default - confirm this is intended.
tweets_test = common.data.create_tweets_rdd(pathTestInstances, sc=sc)

In [10]:
# Export the test instances. Any earlier export is deleted first (presumably
# because saveAsPickleFile will not overwrite an existing directory - confirm).
pOut = pathStage1 / 'instances_test'
if pOut.exists():
    shutil.rmtree(pOut)

# Build the test RDD from the tweets and the test date index, then pickle it.
# NOTE(review): leadTime=0 is passed only for the test set; the train call
# relies on create_data_rdd's default - confirm the asymmetry is intended.
rdd_test = common.data.create_data_rdd(
    tweets_test,
    idx_test,
    sc=sc,
    leadTime=0,
    **conf
)
rdd_test.saveAsPickleFile(str(pOut))

In [11]:
# Number of records in the train RDD.
rdd_train.count()

14688

In [12]:
# Number of records in the test RDD.
rdd_test.count()

3968