In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import json
import logging
import os
import shutil
import string
import textwrap
from datetime import datetime, timedelta

import dask
import dask.dataframe as dd
import dask.multiprocessing
import mmh3
import numpy as np
import pandas as pd

# Make sure you `pip install bt-ai[dev]` for these (or you can use the [prod] dependencies if you like)
from bt_ai.stable.data_input.dataframe import MultiDataFrameLoader, DataFrameTarget
from bt_ai.stable.data_input.redshift import UnloadQuery, UnloadTask, HourlyEventDumpQuery, RawEventDumpQuery, RawEventQueryWithSession
from bt_ai.stable.data_input.resources import ResourcesDb, ResourceDump

# Make sure you `pip install bt-notebook-utils` for these
from notebook_utils.logging import setup_logging
from notebook_utils.luigi import run_luigi_tasks
from notebook_utils.s3 import delete_s3_folder
from notebook_utils.sequences import daterange, hourrange, date_compressed_hourrange, pairwise


# Run dask graphs on the multiprocessing scheduler, to get multi-processing
# together with out-of-core processing.
if hasattr(dask, 'config'):
    # dask >= 0.18 selects the scheduler through dask.config; `set_options`
    # and its `get=` keyword were deprecated there and removed later.
    dask.config.set(scheduler='processes')
else:
    # Older dask releases only offer the original API.
    dask.set_options(get=dask.multiprocessing.get)

In [None]:
# Redshift credentials are read as a JSON document from the environment, so no
# secrets are stored in the notebook itself.
_raw_redshift_credentials = os.environ.get('REDSHIFT_CREDENTIALS')
if not _raw_redshift_credentials:
    # Stop here with an actionable message: json.loads(None) would otherwise
    # raise an opaque TypeError that does not mention the missing variable.
    raise RuntimeError(
        'The REDSHIFT_CREDENTIALS environment variable is not set; export it '
        'as a JSON document before starting the notebook kernel.'
    )
REDSHIFT_CREDENTIALS = json.loads(_raw_redshift_credentials)

In [None]:
# Configure logging so that Luigi jobs started from this notebook print their
# output here.
#
# `notebook_utils.logging.LOGGER_OVERRIDES` lists the default logger overrides;
# entries supplied below are optional additions and can override those defaults.
logging_overrides = {}
logging_overrides['luigi-interface'] = logging.INFO

setup_logging(overrides=logging_overrides, level=logging.DEBUG)

# The logger name is arbitrary -- use any name you want.
LOG = logging.getLogger('jupyter')

In [None]:
# Reporting window: adjust start/end dates to something sane for your analysis.
# (Filtering by site is currently disabled; the queries cover every site.)
#site_id = 'wnyt-hubbard-tv'

# Read the clock exactly once and derive both bounds from that single instant,
# so a run that straddles midnight still yields a window of exactly seven days.
today = datetime.today()
_7ago = today - timedelta(days=7)

# Truncate both bounds to midnight.
end_date = datetime(today.year, today.month, today.day)
start_date = datetime(_7ago.year, _7ago.month, _7ago.day)

# make sure you change this to your own sandbox bucket on S3
s3_data_bucket = 'vladm-sandbox'
s3_data_path = 's3://' + s3_data_bucket
local_data_path = 'output_data'
luigi_planner_uri = 'http://localhost:8082'

model_store_path = local_data_path

## Extract data from the requests and interactions tables

In [None]:
# Extract request counts from recs.requests, joined to recs.interactions, with
# one Redshift unload per consecutive pair of dates in the reporting window.
dump_sub_folder = 'requests-interactions'

# Uncomment to discard earlier dumps and force a fresh unload:
# delete_s3_folder(s3_data_bucket, dump_sub_folder)

# CTE `a`: one row per request in the window.
# CTE `b`: earliest interaction day per (site_id, recset, event_type) in the window.
# The join matches on site_id as well as recset: `b` is grouped per site, and the
# dask merge later in this notebook uses the same two keys.
# NOTE(review): both CTEs are limited to the same [start_date, end_date) window,
# so an interaction that happens after the window closes is never matched to its
# request -- confirm this is intended.
query_template = textwrap.dedent('''\
                WITH
                    a AS (
                        SELECT
                          date(event_time) AS request_day,
                          site_id,
                          medium,
                          recset
                        FROM
                          recs.requests
                        WHERE
                          event_time < '{end_date}'
                          AND event_time >= '{start_date}')
                    ,b AS (
                        SELECT DISTINCT
                          site_id,
                          recset,
                          event_type,
                          min(date(event_time)) AS action_day
                        FROM
                          recs.interactions
                        WHERE
                          event_time < '{end_date}'
                          AND event_time >= '{start_date}'
                        GROUP BY
                          site_id, recset, event_type)
                SELECT
                  a.request_day,
                  a.site_id,
                  b.event_type,
                  b.action_day,
                  a.medium,
                  count(DISTINCT a.recset) as count
                FROM
                  a
                LEFT JOIN
                  b
                ON
                  a.recset = b.recset
                  AND a.site_id = b.site_id
                GROUP BY
                  event_type, request_day, action_day, a.site_id, a.medium
                ORDER BY
                  event_type, request_day, action_day, a.site_id, a.medium
            ''')

# Build the unload path from `dump_sub_folder` so the folder name is defined in
# exactly one place (the resulting template string is unchanged).
s3_unload_path_template = 's3://{root}/' + dump_sub_folder + '/s{start_date}.e{end_date}'

# Timestamp format used for the window bounds embedded in the S3 path.
DATETIME_FORMAT = '%Y%m%dT%H%M%S'

# One unload task per (window_start, window_end) pair of consecutive dates.
daily_requests = [
    UnloadTask(
        redshift_query=UnloadQuery(
            query=query_template.format(
                start_date=window_start,
                end_date=window_end
            ),
            column_names=['request_day', 'site_id', 'event_type', 'action_day', 'medium', 'count'],
            s3_unload_path=s3_unload_path_template.format(
                root=s3_data_bucket,
                start_date=window_start.strftime(DATETIME_FORMAT),
                end_date=window_end.strftime(DATETIME_FORMAT)
            ),
            index_columns=['request_day', 'site_id', 'action_day'],
            date_columns=['request_day', 'action_day']
        ),
        redshift_credentials=REDSHIFT_CREDENTIALS
    )
    for window_start, window_end in pairwise(daterange(start_date, end_date))
]

run_luigi_tasks(daily_requests, scheduler_uri=luigi_planner_uri, multiprocess=True, num_processes=8)

LOG.info('Loading dumped data')
# Combine the per-window dump targets into a single loader.
# compute=False presumably keeps the result lazy -- TODO confirm against bt_ai.
requests_loader = MultiDataFrameLoader.create_multi_dataframe_target(
    [task.output() for task in daily_requests],
    compute=False
)

with requests_loader.open('r') as infile:
    requests_ddf = infile.read()

## Extract data from the interactions table

In [None]:
%%time
# Extract data from the recs.interactions table, with one Redshift unload per
# consecutive pair of dates in the reporting window.
dump_sub_folder = 'interactions'

# To discard earlier dumps first, uncomment:
# delete_s3_folder(s3_data_bucket, dump_sub_folder)

# Earliest interaction day per (site_id, recset, event_type) inside the window.
query_template = textwrap.dedent('''\
                SELECT DISTINCT 
                  site_id,
                  recset,
                  event_type,
                  min(date(event_time)) AS action_day
                FROM recs.interactions
                WHERE event_time < '{end_date}' 
                  AND event_time >= '{start_date}' 
                GROUP BY site_id,
                  recset,
                  event_type
            ''')

s3_unload_path_template = 's3://{root}/interactions/s{start_date}.e{end_date}'

# Timestamp format used for the window bounds embedded in the S3 path.
DATETIME_FORMAT = '%Y%m%dT%H%M%S'

daily_interactions = []
for window_start, window_end in pairwise(daterange(start_date, end_date)):
    # Query and unload destination for this single window.
    window_query = UnloadQuery(
        query=query_template.format(start_date=window_start, end_date=window_end),
        column_names=['site_id', 'recset', 'event_type', 'action_day'],
        s3_unload_path=s3_unload_path_template.format(
            root=s3_data_bucket,
            start_date=window_start.strftime(DATETIME_FORMAT),
            end_date=window_end.strftime(DATETIME_FORMAT)
        ),
        index_columns=['recset', 'site_id'],
        date_columns=['action_day']
    )
    daily_interactions.append(
        UnloadTask(redshift_query=window_query, redshift_credentials=REDSHIFT_CREDENTIALS)
    )

run_luigi_tasks(
    daily_interactions,
    scheduler_uri=luigi_planner_uri,
    multiprocess=True,
    num_processes=8
)

LOG.info('Loading dumped data')
# Combine the per-window dump targets into a single loader.
interaction_targets = [task.output() for task in daily_interactions]
interactions_loader = MultiDataFrameLoader.create_multi_dataframe_target(
    interaction_targets,
    compute=False
)

with interactions_loader.open('r') as dump_reader:
    interactions_ddf = dump_reader.read()

## Merge the requests and interactions tables

In [None]:
%%time
# Left-join interactions onto requests so that requests without any recorded
# interaction are kept. `dd` is the explicitly imported dask.dataframe module;
# `import dask` alone does not load that submodule.
# NOTE(review): the requests unload declares the columns request_day, site_id,
# event_type, action_day, medium and count -- no `recset` -- and its SQL already
# joins against recs.interactions. This merge therefore looks stale; confirm
# whether this cell should still run against the current requests dump.
merge_ddf = dd.merge(requests_ddf, interactions_ddf, on=['recset', 'site_id'], how='left')

In [None]:
%%time
# Replace every missing value with the "No Action" label -- presumably so that
# rows without a matching interaction are not dropped by the groupby that follows.
merge_ddf = merge_ddf.fillna(value="No Action")

## Aggregate: count requests by request day, event type, and event day

In [None]:
%%time
# Count the distinct recsets within each combination of the grouping keys.
group_keys = ['site_id', 'request_day', 'action_day', 'medium', 'event_type']
aggregate_ddf = merge_ddf.groupby(group_keys)['recset'].nunique()

## Convert to a pandas DataFrame

In [None]:
%%time
# dask.compute returns a tuple with one result per argument; unpack the single
# result and wrap it in a pandas DataFrame.
(aggregate_result,) = dask.compute(aggregate_ddf)
final_df = pd.DataFrame(aggregate_result)

In [None]:
# Display the aggregated distinct-recset counts.
final_df