In [1]:
%load_ext autoreload
%autoreload 2

In [32]:
import json
import logging
import mmh3
import os
import shutil
import string
import textwrap
from datetime import datetime, timedelta

import dask
import dask.multiprocessing
import numpy as np
import pandas as pd

# Make sure you `pip install bt-ai[dev]` for these (or you can use the [prod] dependencies if you like)
from bt_ai.stable.data_input.dataframe import MultiDataFrameLoader, DataFrameTarget
from bt_ai.stable.data_input.redshift import UnloadQuery, UnloadTask, HourlyEventDumpQuery, RawEventDumpQuery, RawEventQueryWithSession
from bt_ai.stable.data_input.resources import ResourcesDb, ResourceDump

# Make sure you `pip install bt-notebook-utils` for these
from notebook_utils.logging import setup_logging
from notebook_utils.luigi import run_luigi_tasks
from notebook_utils.s3 import delete_s3_folder
from notebook_utils.sequences import daterange, hourrange, date_compressed_hourrange, pairwise


# Use this to get multi-processing with out-of-core processing
dask.set_options(get=dask.multiprocessing.get)

<dask.context.set_options at 0x31527af28>

In [33]:
REDSHIFT_CREDENTIALS = json.loads(os.environ.get('REDSHIFT_CREDENTIALS'))

In [34]:
# This section sets up the logging, so running Luigi jobs produces output in this notebook

# Check `notebook_utils.logging.LOGGER_OVERRIDES` for the default logger overrides, 
# or, optionally, pass in your own additional overrides (which can override LOGGER_OVERRIDES)
logging_overrides = {
    'luigi-interface': logging.INFO,
}
setup_logging(level=logging.DEBUG, overrides=logging_overrides)
# Use any name you want for this logger.  
LOG = logging.getLogger('jupyter')

In [35]:
datetime.today().date() - timedelta(days=7)

datetime.date(2018, 5, 10)

In [36]:
# change the site_id and start/end dates to be something sane
site_id = 'wnyt-hubbard-tv'
today=datetime.today()
_7ago=datetime.today() - timedelta(days=7)

end_date=datetime(today.year,today.month, today.day)
start_date = datetime(_7ago.year,_7ago.month, _7ago.day)

# make sure you change this to your own sandbox bucket on S3
s3_data_bucket = 'vladm-sandbox'
s3_data_path = 's3://' + s3_data_bucket
local_data_path = 'output_data'
luigi_planner_uri = 'http://localhost:8082'

model_store_path = local_data_path

In [37]:
# Extract data to from recs.requests table
dump_sub_folder = 'requests'

delete_s3_folder(s3_data_bucket, dump_sub_folder)

query_template = textwrap.dedent('''\
                SELECT 
                  date(event_time) AS request_day,
                  site_id,
                  medium,
                  recset 
                FROM recs.requests 
                WHERE event_time < '{end_date}' 
                  AND event_time >= '{start_date}'
                  AND site_id='{site_id}'
            ''')

s3_unload_path_template = '''s3://{root}/requests/s{start_date}.e{end_date}'''

DATETIME_FORMAT = '%Y%m%dT%H%M%S'
    
daily_requests = [
    UnloadTask(
        redshift_query=UnloadQuery(
            query=query_template.format(
                site_id=site_id,
                start_date=s,
                end_date=e
            ),
            column_names=['request_day', 'site_id', 'medium', 'recset'],
            s3_unload_path=s3_unload_path_template.format(
                root=s3_data_bucket,
                site_id=site_id,
#                 start_date=start_date.strftime(DATETIME_FORMAT),
#                 end_date=end_date.strftime(DATETIME_FORMAT)
                start_date=s.strftime(DATETIME_FORMAT),
                end_date=e.strftime(DATETIME_FORMAT)
            ),
            index_columns=['site_id','recset'],
            date_columns=['request_day']
        ),
        redshift_credentials=REDSHIFT_CREDENTIALS
    )
    for s, e in pairwise(daterange(start_date, end_date))
]

run_luigi_tasks(daily_requests, scheduler_uri=luigi_planner_uri, multiprocess=True, num_processes=8)

LOG.info('Loading dumped data')
requests_loader = MultiDataFrameLoader.create_multi_dataframe_target(
    [task.output() for task in daily_requests], 
    compute=False
)

with requests_loader.open('r') as infile:
    requests_ddf = infile.read()

2018-05-17 10:58:32 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_date_28ev_6f1c1a9b5a   has status   PENDING
2018-05-17 10:58:33 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_date_28ev_a465abd50a   has status   PENDING
2018-05-17 10:58:34 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_date_28ev_cac993e274   has status   PENDING
2018-05-17 10:58:34 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_date_28ev_a07988a911   has status   PENDING
2018-05-17 10:58:35 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_date_28ev_2af65149b7   has status   PENDING
2018-05-17 10:58:36 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_date_28ev_b3d819f8e8   has status   PENDING
2018-05-17 10:58:36 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_date_28e

2018-05-17 10:58:37 INFO     luigi-interface: INFO     bt_ai.stable.workflow.task: [pid 17548] Worker Worker(salt=666392742, workers=8, host=Vladimirs-MacBook-Pro.local, username=vladmalabanan, pid=14192) running   UnloadTask(job_name=NOT_SET, parent_flow=, redshift_query=SELECT+date%28event_time%29+AS+request_day%2C+site_id%2C+medium%2C+recset+FROM+recs.requests+WHERE+event_time+%3C+%272018-05-17+00%3A00%3A00%27+AND+event_time+%3E%3D+%272018-05-16+00%3A00%3A00%27+AND+site_id%3D%27wnyt-hubbard-tv%27%0A::s3%3A%2F%2Fvladm-sandbox%2Frequests%2Fs20180516T000000.e20180517T000000::%09::%5B%22request_day%22%2C+%22site_id%22%2C+%22medium%22%2C+%22recset%22%5D)Starting UnloadTask

2018-05-17 10:58:37 INFO     bt_ai.stable.workflow.task: Starting UnloadTask2018-05-17 10:58:37 
DEBUG    bt_ai.stable.data_input.redshift: UNLOAD ( 'SELECT 
  date(event_time) AS request_day,
  site_id,
  medium,
  recset 
FROM recs.requests 
WHERE event_time < \'2018-05-15 00:00:00\' 
  AND event_time >= \'2018-05-1

2018-05-17 10:59:26 INFO    2018-05-17 10:59:26  luigi-interface: INFO     luigi-interface: [pid 17543] Worker Worker(salt=666392742, workers=8, host=Vladimirs-MacBook-Pro.local, username=vladmalabanan, pid=14192) done      UnloadTask(job_name=NOT_SET, parent_flow=, redshift_query=SELECT+date%28event_time%29+AS+request_day%2C+site_id%2C+medium%2C+recset+FROM+recs.requests+WHERE+event_time+%3C+%272018-05-12+00%3A00%3A00%27+AND+event_time+%3E%3D+%272018-05-11+00%3A00%3A00%27+AND+site_id%3D%27wnyt-hubbard-tv%27%0A::s3%3A%2F%2Fvladm-sandbox%2Frequests%2Fs20180511T000000.e20180512T000000::%09::%5B%22request_day%22%2C+%22site_id%22%2C+%22medium%22%2C+%22recset%22%5D)Informed scheduler that task   UnloadTask_NOT_SET__SELECT_date_28ev_2959e854fc   has status   DONE

2018-05-17 10:59:26 INFO     bt_ai.stable.workflow.task: UnloadTask completed successfully
2018-05-17 10:59:26 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_date_28ev_a465abd50a   has status   

In [8]:
requests_ddf.request_day.unique().compute()

0   2018-05-09
1   2018-05-10
2   2018-05-11
3   2018-05-12
4   2018-05-13
5   2018-05-14
6   2018-05-15
Name: request_day, dtype: datetime64[ns]

In [None]:

dump_sub_folder = 'user_views'
delete_s3_folder(s3_data_bucket, dump_sub_folder)

query_template = textwrap.dedent(
             ''' SELECT
                     userid AS bsin,
                     DATE_TRUNC('day', event_time) AS event_day,
                     COUNT(1) as views
                 FROM boomtrain.events
                 WHERE app = '{site_id}'
                     AND event_time < '{end_date}'
                     AND event_time >= '{start_date}'
                     AND event_type IN ({event_types})
                     AND model IS NOT NULL
                     AND id IS NOT NULL
                 GROUP BY userid, event_day
                 ORDER BY userid, event_day
             ''')
s3_unload_path_template = '''s3://{root}/user_views/{site_id}/{event_type}.s{start_date}.e{end_date}'''
event_types = ['viewed']
# just a convenience definition
DATETIME_FORMAT = '%Y%m%dT%H%M%S'
    
daily_user_views = [
    UnloadTask(
        redshift_query=UnloadQuery(
            query=query_template.format(
                site_id=site_id,
                start_date=s,
                end_date=e,
                event_types=','.join(["'{}'".format(evt) for evt in event_types])
            ),
            column_names=['bsin', 'event_day', 'views'],
            s3_unload_path=s3_unload_path_template.format(
                root=s3_data_bucket,
                site_id=site_id,
                event_type='.'.join([evt for evt in event_types]),
                start_date=start_date.strftime(DATETIME_FORMAT),
                end_date=end_date.strftime(DATETIME_FORMAT)
            ),
            index_columns=['bsin', 'event_day'],
            date_columns=['event_day']
        ),
        redshift_credentials=REDSHIFT_CREDENTIALS
    )
    for s, e in pairwise(daterange(start_date, end_date))
]

run_luigi_tasks(daily_user_views, scheduler_uri=luigi_planner_uri, multiprocess=True, num_processes=8)

LOG.info('Loading dumped data')
user_views_loader = MultiDataFrameLoader.create_multi_dataframe_target(
    [task.output() for task in daily_user_views], 
    compute=True
)

with user_views_loader.open('r') as infile:
    daily_user_views_df = infile.read()

In [None]:
daily_user_views_df.index.levels[1]

In [None]:
# Create tasks to dump hourly event counts from Redshift to S3
hourly_data = [
    UnloadTask(
        redshift_query=HourlyEventDumpQuery(site_id, s, e, s3_data_bucket),
        redshift_credentials=REDSHIFT_CREDENTIALS
    )
    for s, e in pairwise(daterange(start_date, end_date))
]

# execute these tasks
# You need to have a luigid process running on your machine, by running: `luigid --port=8082`
run_luigi_tasks(hourly_data, scheduler_uri=luigi_planner_uri, multiprocess=True, num_processes=8)

LOG.info('Loading dumped data')
# Create a multi-loader to turn a list of targets into a single target across multiple files
event_count_loader = MultiDataFrameLoader.create_multi_dataframe_target(
    [task.output() for task in hourly_data], # list of targets created from list of tasks
    compute=True
)
# Load the data into a pandas DataFrame (by setting compute=True, above; if this is set to False 
#   [either here or in the task's output definition] it will return a dask DataFrame instead)
with event_count_loader.open('r') as infile:
    hourly_event_count_df = infile.read()

In [None]:
hourly_event_count_df.index.levels[1]

In [38]:
# Extract data to from recs.interactions table
dump_sub_folder = 'interactions'

delete_s3_folder(s3_data_bucket, dump_sub_folder)

query_template = textwrap.dedent('''\
                SELECT DISTINCT 
                  site_id,
                  recset,
                  event_type,
                  min(date(event_time)) AS action_day
                FROM recs.interactions
                WHERE event_time < '{end_date}' 
                  AND event_time >= '{start_date}' 
                  AND site_id='{site_id}'
                GROUP BY site_id,
                  recset,
                  event_type
            ''')

s3_unload_path_template = '''s3://{root}/interactions/s{start_date}.e{end_date}'''

DATETIME_FORMAT = '%Y%m%dT%H%M%S'
    
daily_interactions = [
    UnloadTask(
        redshift_query=UnloadQuery(
            query=query_template.format(
                site_id=site_id,
                start_date=s,
                end_date=e
            ),
            column_names=['site_id', 'recset', 'event_type', 'action_day'],
            s3_unload_path=s3_unload_path_template.format(
                root=s3_data_bucket,
                site_id=site_id,
#                 start_date=start_date.strftime(DATETIME_FORMAT),
#                 end_date=end_date.strftime(DATETIME_FORMAT)
                start_date=s.strftime(DATETIME_FORMAT),
                end_date=e.strftime(DATETIME_FORMAT)
            ),
            index_columns=['site_id','recset'],
            date_columns=['action_day']
        ),
        redshift_credentials=REDSHIFT_CREDENTIALS
    )
    for s, e in pairwise(daterange(start_date, end_date))
]

run_luigi_tasks(daily_interactions, scheduler_uri=luigi_planner_uri, multiprocess=True, num_processes=8)

LOG.info('Loading dumped data')
interactions_loader = MultiDataFrameLoader.create_multi_dataframe_target(
    [task.output() for task in daily_interactions], 
    compute=False
)

with interactions_loader.open('r') as infile:
    interactions_ddf = infile.read()

2018-05-17 10:59:29 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_DISTINCT__1391dfce72   has status   PENDING
2018-05-17 10:59:30 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_DISTINCT__196b6b0b03   has status   PENDING
2018-05-17 10:59:30 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_DISTINCT__f973a366f1   has status   PENDING
2018-05-17 10:59:31 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_DISTINCT__89329b8ae7   has status   PENDING
2018-05-17 10:59:32 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_DISTINCT__4d9f182f43   has status   PENDING
2018-05-17 10:59:32 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_DISTINCT__e0db45189a   has status   PENDING
2018-05-17 10:59:33 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_DISTINCT

2018-05-17 10:59:33 INFO     luigi-interface: 2018-05-17 10:59:33 [pid 17635] Worker Worker(salt=328111378, workers=8, host=Vladimirs-MacBook-Pro.local, username=vladmalabanan, pid=14192) running   UnloadTask(job_name=NOT_SET, parent_flow=, redshift_query=SELECT+DISTINCT+site_id%2C+recset%2C+event_type%2C+min%28date%28event_time%29%29+AS+action_day%0AFROM+recs.interactions%0AWHERE+event_time+%3C+%272018-05-16+00%3A00%3A00%27+AND+event_time+%3E%3D+%272018-05-15+00%3A00%3A00%27+AND+site_id%3D%27wnyt-hubbard-tv%27%0AGROUP+BY+site_id%2C+recset%2C+event_type%0A::s3%3A%2F%2Fvladm-sandbox%2Finteractions%2Fs20180515T000000.e20180516T000000::%09::%5B%22site_id%22%2C+%22recset%22%2C+%22event_type%22%2C+%22action_day%22%5D)DEBUG    bt_ai.stable.data_input.redshift: 
UNLOAD ( 'SELECT DISTINCT 
  site_id,
  recset,
  event_type,
  min(date(event_time)) AS action_day
FROM recs.interactions
WHERE event_time < \'2018-05-15 00:00:00\' 
  AND event_time >= \'2018-05-14 00:00:00\' 
  AND site_id=\'wnyt-h

2018-05-17 11:00:54 INFO     luigi-interface: [pid 17634] Worker Worker(salt=328111378, workers=8, host=Vladimirs-MacBook-Pro.local, username=vladmalabanan, pid=14192) done      UnloadTask(job_name=NOT_SET, parent_flow=, redshift_query=SELECT+DISTINCT+site_id%2C+recset%2C+event_type%2C+min%28date%28event_time%29%29+AS+action_day%0AFROM+recs.interactions%0AWHERE+event_time+%3C+%272018-05-15+00%3A00%3A00%27+AND+event_time+%3E%3D+%272018-05-14+00%3A00%3A00%27+AND+site_id%3D%27wnyt-hubbard-tv%27%0AGROUP+BY+site_id%2C+recset%2C+event_type%0A::s3%3A%2F%2Fvladm-sandbox%2Finteractions%2Fs20180514T000000.e20180515T000000::%09::%5B%22site_id%22%2C+%22recset%22%2C+%22event_type%22%2C+%22action_day%22%5D)
2018-05-17 11:00:54 INFO     bt_ai.stable.workflow.task: UnloadTask completed successfully
2018-05-17 11:00:54 INFO     luigi-interface: Informed scheduler that task   UnloadTask_NOT_SET__SELECT_DISTINCT__4d9f182f43   has status   DONE
2018-05-17 11:00:54 INFO     bt_ai.stable.workflow.task: Fini

In [39]:
%%time
merge_ddf=dask.dataframe.merge(requests_ddf, interactions_ddf, on=['recset', 'site_id'], how='left')

CPU times: user 16.7 ms, sys: 2.85 ms, total: 19.6 ms
Wall time: 18.8 ms


In [12]:
%%time
merge_ddf2=dask.dataframe.merge(requests_ddf, interactions_ddf, left_index=True, right_index=True, how='left').compute()

CPU times: user 23.6 s, sys: 1.1 s, total: 24.7 s
Wall time: 28.3 s


In [23]:
merge_ddf.head(2)


Unnamed: 0,request_day,site_id,medium,recset,event_type,action_day
0,2018-05-09,wide-open-eats,web,66d6f878-531e-11e8-b93e-0ebad3a1a55c,bt_rec_view,2018-05-09
1,2018-05-09,wide-open-eats,web,a3d158ae-531e-11e8-a73b-129a2db092fa,bt_rec_view,2018-05-09


In [17]:
merge_ddf2.head(3)

Unnamed: 0,request_day,site_id_x,medium,recset_x,site_id_y,recset_y,event_type,action_day
0,2018-05-09,wnyt-hubbard-tv,web,07fe0604-531c-11e8-bd27-129a2db092fa,wnyt-hubbard-tv,1b94232e-5641-11e8-88dc-0e7f8c15c77e,bt_rec_view,2018-05-13
0,2018-05-09,wnyt-hubbard-tv,web,07fe0604-531c-11e8-bd27-129a2db092fa,wnyt-hubbard-tv,244ed7ee-5578-11e8-a520-0a4158a2ffca,bt_rec_view,2018-05-12
0,2018-05-09,wnyt-hubbard-tv,web,07fe0604-531c-11e8-bd27-129a2db092fa,wnyt-hubbard-tv,7305b824-531d-11e8-b1e0-0e02e382ecf6,bt_rec_view,2018-05-09


In [40]:
%%time
merge_ddf=merge_ddf.fillna("No Action")

CPU times: user 3.56 ms, sys: 363 µs, total: 3.92 ms
Wall time: 3.84 ms


In [41]:
%%time
aggregate_ddf=merge_ddf.groupby(['site_id', 'request_day', 'action_day', 'medium', 'event_type']).recset.nunique()

CPU times: user 19.6 ms, sys: 6.15 ms, total: 25.7 ms
Wall time: 24.6 ms


In [42]:
%%time
test_df = pd.DataFrame(dask.compute(aggregate_ddf)[0])

CPU times: user 25 s, sys: 1.16 s, total: 26.2 s
Wall time: 28.5 s


In [43]:
test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,recset
site_id,request_day,action_day,medium,event_type,Unnamed: 5_level_1
wnyt-hubbard-tv,2018-05-10,2018-05-10 00:00:00,email,bt_rec_click,26
wnyt-hubbard-tv,2018-05-10,2018-05-10 00:00:00,email,bt_rec_view,202
wnyt-hubbard-tv,2018-05-10,2018-05-10 00:00:00,web,bt_rec_click,1145
wnyt-hubbard-tv,2018-05-10,2018-05-10 00:00:00,web,bt_rec_view,22457
wnyt-hubbard-tv,2018-05-10,2018-05-11 00:00:00,email,bt_rec_click,4
wnyt-hubbard-tv,2018-05-10,2018-05-11 00:00:00,email,bt_rec_view,101
wnyt-hubbard-tv,2018-05-10,2018-05-11 00:00:00,web,bt_rec_view,97
wnyt-hubbard-tv,2018-05-10,2018-05-12 00:00:00,email,bt_rec_view,11
wnyt-hubbard-tv,2018-05-10,2018-05-12 00:00:00,web,bt_rec_click,3
wnyt-hubbard-tv,2018-05-10,2018-05-12 00:00:00,web,bt_rec_view,16


pandas.core.frame.DataFrame