In [None]:
%pip install -qq umap-learn 

In [None]:
import pydata_google_auth
import pandas_gbq
import pandas as pd

In [None]:
# Google Cloud project id handed to the BigQuery read further down.
PROJECT_ID = 'uw-data-platform-prod'

# User credentials restricted to the BigQuery OAuth scope.
auth = pydata_google_auth.get_user_credentials(
    scopes=[
        "https://www.googleapis.com/auth/bigquery",
    ],
)

In [None]:
# Query listing every distinct partner position id in the daily master table.
# Written as adjacent literals so the trailing spaces and newlines are explicit.
ids_q = (
    "SELECT DISTINCT partner_position_id FROM \n"
    "`uw-partner-prod.metrics_analysis.partner_position_master_data_daily` \n"
)

In [None]:
observed_features = ['jtc_applications_last_3m', 'articles_unique_days_viewed_last_1m',
                     'prospects_unique_days_viewed_last_1m', 'learning_plans_started_last_1m',
                    'days_logged_into_portal_last_3m', "customers_gathered_last_1m"]

_id = ['partner_position_id', 'snapshot_date', 'avg_monthly_ri_earned_last_1y']

In [None]:
## The input CSV can be fetched from
## https://drive.google.com/file/d/1VWyqF_lzN3ABcsyrkTF-HayiUO3F_Gsr/view?usp=sharing
## and is expected one directory above the notebook.
population = pd.read_csv(filepath_or_buffer='../dashboard_experiment.csv')

In [None]:
# Query template over the daily snapshot table; `{}` receives the column list.
# Rows are restricted to a fixed snapshot_date window and ordered by the first two selected columns.
q_pattern = """\
SELECT {} FROM `uw-partner-prod.metrics_analysis.partner_position_master_data_daily`  
WHERE snapshot_date BETWEEN '2021-03-09' AND '2021-04-09'
ORDER BY 1, 2
"""
# Identifier columns first, then the observed features, joined with ' ,'.
features_query = q_pattern.format(' ,'.join([*_id, *observed_features]))


In [None]:
# Run the feature query on BigQuery and load the result as a DataFrame.
# Calls pandas-gbq directly (already imported as `pandas_gbq`): the `pd.read_gbq` wrapper
# is deprecated in recent pandas releases in favour of `pandas_gbq.read_gbq`.
resp_df = pandas_gbq.read_gbq(
    features_query,
    project_id=PROJECT_ID,
    credentials=auth,
    use_bqstorage_api=True,  # download through the BigQuery Storage API
)

In [None]:
# Label every partner with its experiment arm; the default is the string 'None'.
# Flags are applied in order, so a row flagged both test and control ends up 'Control'.
population['experiment_group'] = 'None'
for flag_column, group_name in (('is_test', 'Test'), ('is_control', 'Control')):
    population.loc[population[flag_column], 'experiment_group'] = group_name

In [None]:
# Inner join on partner_position_id: keep only snapshots whose partner appears in
# the experiment population, and attach the experiment flags to each snapshot row.
data_df = (
    resp_df
    .set_index('partner_position_id')
    .join(
        population.set_index('partner_position_id')[['is_test', 'is_control', 'experiment_group']],
        how='inner',
    )
)

In [None]:
# Boolean flag: True where at least one customer was gathered in the last month.
data_df['has_gathered_last_1m'] = data_df.customers_gathered_last_1m > 0

In [None]:
# Snapshot date, RI earnings and the activity metrics, in query order.
columns = [
    'snapshot_date',
    'avg_monthly_ri_earned_last_1y',
    'jtc_applications_last_3m',
    'articles_unique_days_viewed_last_1m',
    'prospects_unique_days_viewed_last_1m',
    'learning_plans_started_last_1m',
    'days_logged_into_portal_last_3m',
    'customers_gathered_last_1m',
]

In [None]:
# Cast the RI column to a plain 64-bit float column.
# NOTE(review): presumably BigQuery returns it as a non-float (e.g. decimal/object) dtype; confirm.
data_df['avg_monthly_ri_earned_last_1y'] = (
    data_df['avg_monthly_ri_earned_last_1y'].astype('float64')
)


In [None]:
from sklearn.cluster import DBSCAN
import umap
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Test-group rows only, with the columns up to and including customers_gathered_last_1m
# (label slice, so the experiment flags and derived columns to its right are excluded).
X = data_df.loc[data_df['is_test'], :'customers_gathered_last_1m']

In [None]:
import seaborn as sns
# Global seaborn/matplotlib look: white style, notebook context, 14x10 default figure size.
sns.set(
    style='white',
    context='notebook',
    rc={'figure.figsize': (14, 10)},
)


In [None]:
def draw_umap(n_neighbors=10, min_dist=0.3, n_components=2, metric='euclidean', title='', eps=.8,
              data=None, random_state=39):
    """Embed one day of partner features with UMAP, cluster the embedding with DBSCAN, and plot it.

    Only rows whose ``snapshot_date`` equals the earliest date in the input are used,
    restricted to the columns from ``avg_monthly_ri_earned_last_1y`` onward; rows with
    missing values are dropped before fitting.

    Parameters
    ----------
    n_neighbors, min_dist, n_components, metric
        Forwarded unchanged to ``umap.UMAP``.
    title : str
        Title drawn above the scatter plot.
    eps : float
        ``eps`` passed to ``DBSCAN`` (run on the embedding with ``min_samples=10``).
    data : DataFrame, optional
        Frame to embed. Defaults to the notebook-level ``X`` so existing calls keep working.
    random_state : int or None
        Seed forwarded to ``umap.UMAP``. The default (39) matches the previously
        hard-coded value; pass ``None`` for an unseeded run.

    Returns
    -------
    (DataFrame, umap.UMAP, DBSCAN)
        The feature frame with ``component_1``, ``component_2`` and ``cluster`` columns
        added, the fitted reducer, and the fitted clusterer.

    Raises
    ------
    ValueError
        If ``n_components`` is smaller than 2 (the plot needs two embedding columns).
    """
    # Fail before the expensive fit instead of with an IndexError on u[:, 1] afterwards.
    if n_components < 2:
        raise ValueError("n_components must be >= 2: the first two components are plotted")

    frame = X if data is None else data

    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric, n_jobs=-1,
        random_state=random_state
    )

    # Earliest snapshot only, feature columns only, complete rows only.
    first_day = frame.snapshot_date == frame.snapshot_date.min()
    X_ = frame.loc[first_day, "avg_monthly_ri_earned_last_1y":].dropna().copy()

    u = reducer.fit_transform(X_)
    db = DBSCAN(eps=eps, min_samples=10, n_jobs=-1)
    cluster_ids = db.fit_predict(u)

    # Only the first two embedding dimensions are stored and plotted, even if more were requested.
    X_['component_1'] = u[:, 0]
    X_['component_2'] = u[:, 1]
    X_['cluster'] = cluster_ids

    fig = plt.figure()
    ax = fig.add_subplot(111)
    scatter = ax.scatter(x='component_1', y='component_2', c='cluster', data=X_, cmap='Accent', s=50)
    # One legend entry per distinct cluster id found in the colour mapping.
    legend1 = ax.legend(*scatter.legend_elements(), title="Cluster")
    ax.add_artist(legend1)
    plt.title(title, fontsize=18)
    return X_, reducer, db

Warning! The next cell is for demonstration purposes only: the actual clusters were found without the fixed random seed, after a careful parameter search, and on a specific date. The results are loaded back in the cells that follow.

In [None]:
# Demonstration run: tight neighbourhoods, spread-out embedding, wide DBSCAN radius.
df_, fit, db = draw_umap(
    n_neighbors=5,
    min_dist=0.5,
    eps=1.5,
)

In [None]:
# Load the clustered frame with pandas' CSV reader (`read.csv` is R syntax and is not
# defined in Python, so the cell raised NameError).
# NOTE(review): `df_` is reassigned from the joblib file in the next cell; confirm this
# CSV copy is still needed and that '../cluster' is the intended path.
df_ = pd.read_csv('../cluster')

In [None]:
import joblib

# This file can be downloaded from 
# https://drive.google.com/open?id=10mrRmB14J8NZCAU_n-ErBGd92fHQ8MrQ&authuser=mbucci%40uw.co.uk&usp=drive_fs
#
# The stored object is subscripted with 'dataframe', i.e. it is expected to be a mapping
# holding the clustered frame under that key.
# SECURITY: joblib files are pickle-based, so loading one can execute arbitrary code.
# Only load a copy obtained from the trusted location above.

df_ = joblib.load('clustered_df.jbl')['dataframe']

In [None]:
# Profiles of the raw cluster ids (keys of the mapping below):
#   0: general low activity, possibly dormant
#   1: highest RI, higher customers gathered
#   2: higher article views, higher portal logins, higher customers gathered
#   3: low activity, better than 0
#   4: higher article views, highest prospect views, highest n. logins (low customers)
#   5: highest JTC applications (low customers)
#   6: medium-high JTC applications, very low portal activity (low customers)

# Raw cluster id -> ordered cluster id; -1 is kept as -1.
cluster_dict = {
    -1: -1,
    0: 0,
    1: 6,
    2: 5,
    3: 1,
    4: 2,
    5: 4,
    6: 3,
}
df_['ord_clustered'] = df_['cluster'].map(cluster_dict)

In [None]:
# Per-ordered-cluster summary statistics, transposed so clusters are the columns.
(
    df_.loc[:, "avg_monthly_ri_earned_last_1y":]
    .groupby('ord_clustered')
    .agg(['mean', 'median', 'max', 'min'])
    .T
)

In [None]:
import plotly.graph_objects as go

In [None]:
from scipy.stats import mode
# Sort keys: rows are ordered by partner, then by snapshot date, before the rolling window.
idxs = ['partner_position_id', 'snapshot_date']
# Per partner, over a trailing window of up to 7 rows (min_periods=1), take the most
# frequent ord_clustered value (first entry of value_counts()).
# The trailing reset_index() exposes the pre-sort row number as a `level_1` column.
# NOTE(review): `[::7]` strides over the concatenated result of ALL partners, not within
# each partner, so it only lines up with per-partner weeks if every partner contributes a
# multiple of 7 rows. Verify against the loaded data.
# NOTE(review): the first kept row of each partner is a window of a single row, so the
# first "week" reflects only that partner's first snapshot. Confirm this is intended.
rolling_mode = df_.reset_index().sort_values(idxs).groupby(
    'partner_position_id')['ord_clustered'].rolling(7, 1).agg(lambda x:x.value_counts().index[0])[::7].reset_index()

In [None]:
# Position within a 28-row cycle, derived from the original row number.
# NOTE(review): assumes each partner occupies 28 consecutive rows in df_; confirm.
rolling_mode['day'] = rolling_mode['level_1'] % 28

In [None]:
# Node label of the form '<day>C<cluster>', e.g. '7C3'.
rolling_mode['label'] = (
    rolling_mode['day'].astype(str)
    + 'C'
    + rolling_mode['ord_clustered'].astype(int).astype(str)
)

In [None]:
# Wide table: one row per partner, one column per day, cells hold the node label.
pivoted = (
    rolling_mode
    .sort_values(['partner_position_id', 'day'])
    .pivot(
        index='partner_position_id',
        columns='day',
        values='label',
    )
)

In [None]:
# For every pair of consecutive day columns, count how many partners move from each
# source label to each target label.
dfs = []
day_pairs = zip(pivoted.columns[:-1], pivoted.columns[1:])
for src_day, dst_day in day_pairs:
    transitions = pivoted.groupby([src_day, dst_day]).size().reset_index(name='count')
    transitions.columns = ['source', 'target', 'value']
    dfs.append(transitions)

In [None]:
# Stack the per-step transition tables into one frame with a fresh 0..n-1 index.
sankey_data = pd.concat(dfs, ignore_index=True)

In [None]:
# Distinct node labels in sorted order; a label's position in this list is its Sankey node index.
sankey_labels = sorted(set(sankey_data.source) | set(sankey_data.target))
# Label -> node index lookup, built once so each link costs one dict access instead of a
# linear `list.index` scan.
label_to_index = {label: position for position, label in enumerate(sankey_labels)}
sankey_sources = sankey_data.source.map(label_to_index)
sankey_targets = sankey_data.target.map(label_to_index)

In [None]:
# Colour per ordered-cluster id; keys match the text after 'C' in a node label.
col_dicts = {
    '-1': 'grey',
    '0': 'rgba(210, 34, 45, 0.8)',
    '6': 'rgba(0, 112, 0, 0.8)',
    '5': 'rgba(0, 112, 0, 0.8)',
    '1': 'rgba(255, 150, 0, 0.8)',
    '2': 'rgba(255, 230, 0, 0.8)',
    '4': 'rgba(150, 200, 50, 0.8)',
    '3': 'rgba(255, 230, 0, 0.8)',
}

# One colour per node, in node order (None when the cluster id has no entry).
node_col = [
    col_dicts.get(cluster_key)
    for cluster_key in (node_label.split('C')[-1] for node_label in sankey_labels)
]

In [None]:
# Full descriptions, shown on the day-0 nodes.
labels_dict = {
    '-1': ' TBD',
    '0': '0 - Low activity, possibly dormant',
    '1': '1 - Low activity',
    '2': '2 - High Portal activity, low gathering',
    '4': '4 - High JTC activity, low gathering',
    '3': '3 - Medium JTC activity, low portal activity',
    '5': '5 - High Portal activity, good gathering',
    '6': '6 - High RI, good gathering',
}

# Compact descriptions, shown on every later node.
short_labels_dict = {
    '-1': ' TBD',
    '0': '0 - Dormant',
    '6': '6 - +RI, +Gathering',
    '5': '5 - +Portal, +Gathering',
    '1': '1 - -Portal',
    '2': '2 - +Portal, -Gathering',
    '4': '4 - ++JTC, -Gathering',
    '3': '3 - +JTC, -Portal, -Gathering',
}

# Split each '<day>C<cluster>' label once; pick the dictionary by day, the entry by cluster.
long_labels = [
    (labels_dict if parts[0] == '0' else short_labels_dict).get(parts[-1])
    for parts in (node_label.split('C') for node_label in sankey_labels)
]

In [None]:
import plotly.graph_objects as go

# Node appearance: labels and colours are positional, matching the node index order.
sankey_nodes = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=long_labels,
    color=node_col,
)

# Links reference nodes by index; each link takes its target node's colour with the
# "0.8" alpha text swapped for "0.2".
sankey_links = dict(
    source=sankey_sources,
    target=sankey_targets,
    value=sankey_data['value'],
    color=[node_col[i].replace("0.8", "0.2") for i in sankey_targets],
)

fig = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])
fig.update_layout(title_text="Basic Flow analysis", font_size=10)
fig.show()