In [1]:
import os
import sys
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
from dotenv import load_dotenv
import pandas as pd
import datetime
from dateutil import parser as date_parser
import json
from pprint import pprint
from functions import *

In [3]:
# Environment: let python-dotenv populate the process environment, then read
# the Azure Storage connection string (None when the variable is not set).
load_dotenv()
connect_str = os.environ.get('AZURE_STORAGE_CONNECTION_STRING')

In [6]:
# Snapshot date is pinned to a fixed day instead of being read from the clock.
today = datetime.datetime(2022, 5, 17)
# datetime_string is not defined in this notebook — presumably it comes from
# `from functions import *`; "/" is passed as its second argument.
today_str = datetime_string(today, "/")

In [None]:
# Instantiate blob service client.
# Fail fast with a clear message when the connection string is missing, rather
# than letting the SDK raise something less obvious.
if not connect_str:
    raise RuntimeError(
        "AZURE_STORAGE_CONNECTION_STRING is not set; cannot create BlobServiceClient"
    )

try:
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
except Exception as e:
    print(f'Unable to connect to BlobServiceClient: {e}')
    # Re-raise: swallowing the error would leave `blob_service_client` unbound
    # and make the following cells fail with an unrelated NameError.
    raise

In [None]:
# Fetch users, accounts, and Chargebee subscriptions and merge them into a
# single table (`df`).
# get_users_accounts_subscriptions_data is not defined in this notebook —
# presumably provided by `from functions import *`. It receives the blob
# service client and the snapshot date string.
df = get_users_accounts_subscriptions_data(blob_service_client, today_str)

In [None]:
# Inspect the user creation-date column.
# NOTE: only the last expression of a cell is rendered, so this first preview
# produces no output when the cell is run as a whole.
df["user|dateCreated.$date"].head()
# Rows that actually carry a creation date. `!= None` compares element-wise and
# is True for every row (missing values included), so it filtered nothing;
# notna() is the null-aware test.
df[df["user|dateCreated.$date"].notna()].head()

In [None]:
# Last 60 days
# threshold_date = today - datetime.timedelta(days=60)

In [None]:
# Filter on users created at least 60 days ago
# df = df[df["user|dateCreated.$date"] <= threshold_date]

In [None]:
# Preview the first rows of the merged users/accounts/subscriptions table.
df.head()

In [None]:
# (rows, columns) of the merged table before any further joins.
df.shape

In [None]:
# Read the CSDA Salesforce export from ../shared_data.
csda_path = os.path.join(os.pardir, "shared_data", "csda_salesforce_export.csv")

csda_df = (
    pd.read_csv(csda_path)
    # Lower-case each header and join its whitespace-separated words with "_".
    .rename(columns=lambda column: "_".join(column.lower().split()))
    # Prefix every column so its origin stays obvious after the join.
    .add_prefix("salesforce|")
)

In [None]:
# Attach the CSDA (Salesforce) columns to the main dataframe. This is a left
# join, so every row of df is kept and unmatched rows get NaN in the
# salesforce| columns.
df = pd.merge(
    df,
    csda_df,
    left_on="account|_id.$oid",
    right_on="salesforce|chargebee_id",
    how="left",
)

In [None]:
# Get HubSpot data.
# get_hubspot_data is not defined in this notebook — presumably provided by
# `from functions import *`. The result is later joined on "hubspot|email".
hubspot_df = get_hubspot_data()

In [None]:
# Enrich users with their HubSpot record, matched on e-mail address. This is a
# left join, so users without a HubSpot match are kept.
df = pd.merge(
    df,
    hubspot_df,
    left_on="user|email",
    right_on="hubspot|email",
    how="left",
)

In [None]:
# Add CSDA flag: rows with no Salesforce team member get the "-" placeholder,
# and a row counts as CSDA exactly when its team member is not that placeholder.
team_member = df["salesforce|team_member_name"].fillna("-")
df["salesforce|team_member_name"] = team_member
df["salesforce|is_csda"] = team_member.ne("-")

In [None]:
# Source column -> output column, for the columns that are kept.
# The key order also defines the column order of the resulting frame.
column_name_remapping = {
    "account|_id.$oid": "account.id",
    "account|name": "account.name",
    "account|dateCreated.$date": "account.date_created",
    "chargebee|status": "chargebee.status",
    "user|_id.$oid": "user.id",
    "user|language": "user.language",
    "user|dateCreated.$date": "user.date_created",
    "user|email": "user.email",
    "user|auth0Cache.usermetadata.lastName": "user.last_name",
    "user|auth0Cache.usermetadata.firstName": "user.first_name",
    "account|users_count": "account.users_count",
    "hubspot|jobtitle": "hubspot.job_title",
    "hubspot|industry": "hubspot.industry",
    "hubspot|numberofemployees": "hubspot.number_of_employees",
    "salesforce|is_csda": "salesforce.is_csda",
    "hubspot|country": "hubspot.country"
}

# Keep only the mapped columns, then give them their output names.
df = df[list(column_name_remapping)].rename(columns=column_name_remapping)

In [None]:
# Translate numeric language codes into language names.
# The position of each name in the list is its code (0 = English, ...).
language_code_mapping = dict(
    enumerate(
        [
            "English",
            "German",
            "Chinese",
            "Bulgarian",
            "Spanish",
            "French",
            "Portuguese",
        ]
    )
)

# Only the "user.language" column is affected by the replacement.
df = df.replace(to_replace={"user.language": language_code_mapping})

In [None]:
# Preview the frame after column selection, renaming, and language decoding.
df.head()

In [None]:
# Fetch user_roles data for the snapshot date.
# get_user_roles_table is not defined in this notebook — presumably provided
# by `from functions import *`.
user_roles_df = get_user_roles_table(blob_service_client, today_str)
# Preview the first rows.
user_roles_df.head()

In [None]:
# Fetch roles data for the snapshot date.
# get_roles_table is not defined in this notebook — presumably provided by
# `from functions import *`.
roles_df = get_roles_table(blob_service_client, today_str)
# Preview the first rows.
roles_df.head()

In [None]:
# Resolve each user-role assignment to its role record. This is a left join,
# so assignments whose role id has no match in roles_df are kept.
user_roles_df = pd.merge(
    user_roles_df,
    roles_df,
    left_on="user_roles.roleId.$oid",
    right_on="roles._id.$oid",
    how="left",
)

user_roles_df.head()

In [None]:
# Build the per-user roles table from the joined user_roles/roles frame.
# get_roles_by_user is not defined in this notebook — presumably provided by
# `from functions import *`. Later cells rely on the result having "roles" and
# "user_roles.userId.$oid" columns.
unique_user_roles_df = get_roles_by_user(user_roles_df)
# Preview the first rows.
unique_user_roles_df.head()

In [None]:
# Derive a "new_role" value for every row by applying assign_role to its
# "roles" entry. assign_role is not defined in this notebook — presumably
# provided by `from functions import *`. A later cell treats the value "other"
# as the catch-all bucket.
unique_user_roles_df["new_role"] = unique_user_roles_df["roles"].map(assign_role)
# Preview the first rows.
unique_user_roles_df.head()

In [None]:
# Number of rows per assigned role, as a two-column frame: new_role, count.
new_role_frequency = (
    unique_user_roles_df.groupby("new_role")
    .size()
    .reset_index(name="count")
)

new_role_frequency.head(20)

In [None]:
# Model fit: share of rows whose role landed outside the catch-all "other"
# bucket.
role_counts = new_role_frequency["count"]
is_catchall = new_role_frequency["new_role"] == "other"

total_usage = role_counts.sum()
catchall_usage = role_counts[is_catchall].sum()

print("Model Acceptance Rate:", (total_usage - catchall_usage) / total_usage * 100, "%")

In [None]:
# Attach each user's role information to the main frame. This is a left join,
# so users with no entry in unique_user_roles_df are kept with NaN role columns.
# The result is stored in `_df`; `df` itself is left unchanged.
_df = pd.merge(
    df,
    unique_user_roles_df,
    left_on="user.id",
    right_on="user_roles.userId.$oid",
    how="left",
)

_df.head()

In [None]:
# Users without a role entry come out of the left join with NaN in "roles";
# replace that with an empty string.
_df = _df.assign(roles=_df["roles"].fillna(""))
# Disabled experiment, kept for reference:
# df["user.is_okr_champion"] = df["roles"].map(lambda role: True if "champion" in (_role := role.lower()) or ("okr" in _role and "champion" in _role) else False)

In [None]:
# Are there any users with no role?
# `_df` comes from a left join, so users missing from the roles table have NaN
# in "new_role", not an empty string. Check for both, otherwise those users are
# silently missed.
_df[_df["new_role"].isna() | _df["new_role"].eq("")]

In [None]:
# Drop unnecessary columns
#
# Open question: should all ids be dropped?
#   account.id
#   user.id
#
#   user_roles._id.$oid
#   user_roles.userId.$oid
#   user_roles.roleId.$oid
#   user_roles.accountId.$oid
#   roles._id.$oid
#   roles.accountId.$oid
#
# NOTE(review): the roles join result is stored in `_df`, not `df`, so when the
# cells run top to bottom `df` holds none of these columns and this step leaves
# it unchanged. Confirm whether `_df` was the intended target.
role_id_columns = [
    # "account.id",
    # "user.id",
    "user_roles._id.$oid",
    "user_roles.userId.$oid",
    "user_roles.roleId.$oid",
    "user_roles.accountId.$oid",
    "roles._id.$oid",
    "roles.accountId.$oid",
]

# errors="ignore": labels that are not present are skipped instead of raising.
df = df.drop(columns=role_id_columns, errors="ignore")

In [None]:
# Column dtypes and non-null counts before the date columns are converted.
df.info()

In [None]:
# Convert both creation-date columns to pandas datetimes (nanosecond precision).
for date_column in ("user.date_created", "account.date_created"):
    df[date_column] = df[date_column].astype("datetime64[ns]")

In [None]:
# Re-check dtypes after the datetime conversion.
df.info()

In [None]:
# Preview the frame with the converted date columns.
df.head()

In [None]:
# Render both creation dates as "YYYY/MM/DD" strings for the output file.
for date_column in ("account.date_created", "user.date_created"):
    df[date_column] = df[date_column].dt.strftime("%Y/%m/%d")

In [None]:
# (rows, columns) of the main frame after all joins and clean-up.
df.shape

In [None]:
# Give the user key a friendlier name for the exported file.
unique_user_roles_df = unique_user_roles_df.rename(
    columns={"user_roles.userId.$oid": "user_id"}
)

In [8]:
# The file name carries both the snapshot date and the date the export was
# produced, each formatted by datetime_string with "-" as second argument.
snapshot_stamp = datetime_string(today, '-')
export_stamp = datetime_string(datetime.datetime.now(), '-')

output_path = os.path.join(
    "output",
    f"unique_user_roles_{snapshot_stamp} as of {export_stamp}.csv",
)


In [None]:
# Save data to csv file.
# to_csv does not create missing parent directories, so make sure the target
# folder exists first (a fresh checkout may have no "output" directory).
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

unique_user_roles_df.to_csv(output_path, index=False)

In [4]:
# Upload output to data lake.
# DataLake is not defined in this notebook — presumably provided by
# `from functions import *`. It is constructed from the same Azure Storage
# connection string used for the blob service client.
dl = DataLake(connect_str)

In [9]:
# Destination path for the upload.
lake_path = "Processed/user_roles.csv"
# Local CSV written by the save step.
file_path = output_path
# NOTE(review): argument order (lake path first, local file second) is taken
# from the variable names — confirm against DataLake.upload_file.
dl.upload_file(lake_path, file_path)

True