In [1]:
import os
import sys
from random import sample
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
from dotenv import load_dotenv
import pandas as pd
import datetime
from dateutil import parser as date_parser
import json
from pprint import pprint
from functions import (
    get_invalid_json,
    get_json_blob_as_df,
    json_csv,
    datetime_string,
    epoch_to_datetime,
    export_tag_frequency,
    get_hubspot_data,
    get_accounts_data,
    get_chargebee_subscriptions,
    get_users_data,
    get_users_accounts_subscriptions_data,
    get_task_created,
    get_task_modified,
    get_task_deleted
)

In [2]:
# Environment vars
load_dotenv()
connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')

In [3]:
# Hardcoded date
today = datetime.datetime(year=2022, month=4, day=7)
today_str = datetime_string(today, "/")

In [4]:
# Instantiate blob service client
try:
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
except Exception as e:
    print(f'Unable to connect to BlobServiceClient: {e}')

In [5]:
# Fetching users, accounts, and chargebee subscriptions.
# Merging all tables into one.
df = get_users_accounts_subscriptions_data(blob_service_client, today_str)

Fetching user data...
<class 'pandas.core.frame.DataFrame'>
Int64Index: 88329 entries, 3 to 90338
Data columns (total 10 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   _id.$oid                           88329 non-null  object 
 1   accountId.$oid                     88329 non-null  object 
 2   language                           88289 non-null  float64
 3   dateCreated.$date                  88329 non-null  int64  
 4   email                              88329 non-null  object 
 5   auth0Cache.usermetadata.lastName   88248 non-null  object 
 6   auth0Cache.usermetadata.firstName  88256 non-null  object 
 7   subscriptionType                   88329 non-null  object 
 8   isPrimary                          88329 non-null  bool   
 9   auth0Cache.usermetadata.demo       161 non-null    object 
dtypes: bool(1), float64(1), int64(1), object(7)
memory usage: 6.8+ MB
None
Fetching account data...


In [6]:
df["user|dateCreated.$date"].head()

0    2018-06-07 14:13:32.294000
1           2018-06-04 07:45:09
2    2018-06-11 08:44:48.498000
3    2018-06-12 12:20:29.635000
4    2019-06-25 15:40:06.882000
Name: user|dateCreated.$date, dtype: object

In [7]:
# Last 180 days
threshold_date = today - datetime.timedelta(days=180)

In [8]:
# Filter on users created at least x days ago
df = df[df["user|dateCreated.$date"] <= threshold_date]

In [9]:
# Number of unique users existing for at least 60 days
df.shape[0]

25534

In [10]:
# Sample 250 users, but no more than 1 user per account
sample_accounts = sample(df["account|_id.$oid"].tolist(), 250)
sample_users = sample(df["user|_id.$oid"].tolist(), 250)

In [11]:
df = df[df["user|_id.$oid"].isin(sample_users)]

In [12]:
df.head()
df.shape

(250, 53)

In [13]:
# Read in CSDA data
csda_df = pd.read_csv(os.path.join(os.pardir, "shared_data", "csda_salesforce_export.csv"))

# Rename columns
csda_df.columns = [
    "_".join(column.lower().split())
    for column in csda_df.columns
]

# Add prefix for clarity after join
csda_df = csda_df.add_prefix("salesforce|")

In [14]:
# Left join CSDA table with main dataframe (df)
df = df.merge(
    csda_df,
    how="left",
    left_on="account|_id.$oid",
    right_on="salesforce|chargebee_id"
)

In [15]:
# Get Hubspot data
hubspot_df = get_hubspot_data()

Fetching Hubspot data...
Hubspot contacts...
Hubspot companies...


In [16]:
# Merge users with HubSpot data
df = df.merge(
    hubspot_df,
    how="left",
    left_on="user|email",
    right_on="hubspot|email"
)

In [17]:
# Add CSDA column
df["salesforce|team_member_name"] = df["salesforce|team_member_name"].fillna("-")
df["salesforce|is_csda"] = df["salesforce|team_member_name"].map(
    lambda x: True if x != "-" else False
)

In [18]:
# Rename columns intended to be kept
column_name_remapping = {
    "account|_id.$oid": "account.id",
    "account|name": "account.name",
    "account|dateCreated.$date": "account.date_created",
    "chargebee|status": "chargebee.status",
    "user|_id.$oid": "user.id",
    "user|language": "user.language",
    "user|dateCreated.$date": "user.date_created",
    "user|email": "user.email",
    "user|auth0Cache.usermetadata.lastName": "user.last_name",
    "user|auth0Cache.usermetadata.firstName": "user.first_name",
    "account|users_count": "account.users_count",
    "hubspot|jobtitle": "hubspot.job_title",
    "hubspot|industry": "hubspot.industry",
    "hubspot|numberofemployees": "hubspot.number_of_employees",
    "salesforce|is_csda": "salesforce.is_csda",
    "hubspot|country": "hubspot.country"
}

# Keep only the keys of mapping columns
df = df[column_name_remapping.keys()]

# Change names of columns based on mapping
df = df.rename(columns=column_name_remapping)

In [19]:
# Change language code to categorical
language_code_mapping = {
    0: "English",
    1: "German",
    2: "Chinese",
    3: "Bulgarian",
    4: "Spanish",
    5: "French",
    6: "Portuguese"
}

df = df.replace({"user.language": language_code_mapping})

In [20]:
df.head()
df.shape

(252, 16)

In [21]:
# Fetch user_roles data
table_name = "user_roles"

data_eu = get_invalid_json(
    blob_service_client, 
    "researchanalyticsinsights", 
    f"Unprocessed/Gtmhub MongoDB EU/{today_str}/{table_name}.json"
)

data_us = get_invalid_json(
    blob_service_client, 
    "researchanalyticsinsights", 
    f"Unprocessed/Gtmhub MongoDB US/{today_str}/{table_name}.json"
)
user_roles_df = pd.DataFrame(json_csv(data_eu + data_us))
user_roles_df = user_roles_df.add_prefix("user_roles.")

user_roles_df.head()

Unnamed: 0,user_roles._id.$oid,user_roles.userId.$oid,user_roles.roleId.$oid,user_roles.accountId.$oid
0,574e9259ed915d0006b985e5,574e9259ed915d0006b985e4,573d93d9ed915d00052efb6b,574e9259ed915d0006b985e3
1,573e222fed915d0005cc2d07,573e222fed915d0005cc2d06,573d93d9ed915d00052efb6b,573e222fed915d0005cc2d05
2,573d93d9ed915d00052efb6c,573d93d9ed915d00052efb6a,573d93d9ed915d00052efb6b,573d93d9ed915d00052efb69
3,5746e3c8ed915d0005cc319c,5746e3c8ed915d0005cc319b,573d93d9ed915d00052efb6b,5746e3c8ed915d0005cc319a
4,573f32bded915d0005cc2e5d,573f32bded915d0005cc2e5c,573d93d9ed915d00052efb6b,573f32bded915d0005cc2e5b


In [22]:
# Fetch roles data
table_name = "roles"

data_eu = get_invalid_json(
    blob_service_client, 
    "researchanalyticsinsights", 
    f"Unprocessed/Gtmhub MongoDB EU/{today_str}/{table_name}.json"
)

data_us = get_invalid_json(
    blob_service_client, 
    "researchanalyticsinsights", 
    f"Unprocessed/Gtmhub MongoDB US/{today_str}/{table_name}.json"
)

roles_df = pd.DataFrame(json_csv(data_eu + data_us))
roles_df = roles_df.add_prefix("roles.")

roles_df.head()

Unnamed: 0,roles._id.$oid,roles.name,roles.accountId.$oid
0,573dbb61ed915d0005cc2c4d,user,
1,5a9f9cbee5274a0007acfcf9,Company B,57fb5f7bed915d0006582898
2,5b86ad85df457100079c04e4,Admin II,57fb5f7bed915d0006582898
3,5b8d2accf9159100080416a2,Air (bo's experiment),573dbb12ed915d0005cc2c46
4,58f632d3ed915d0005e9ef6c,test role,58822288ed915d0005afa6ee


In [23]:
# Join user_roles with roles
user_roles_df = user_roles_df.merge(
    roles_df,
    how="left",
    left_on="user_roles.roleId.$oid",
    right_on="roles._id.$oid"
)

user_roles_df.head()

Unnamed: 0,user_roles._id.$oid,user_roles.userId.$oid,user_roles.roleId.$oid,user_roles.accountId.$oid,roles._id.$oid,roles.name,roles.accountId.$oid
0,574e9259ed915d0006b985e5,574e9259ed915d0006b985e4,573d93d9ed915d00052efb6b,574e9259ed915d0006b985e3,573d93d9ed915d00052efb6b,admin,
1,574e9259ed915d0006b985e5,574e9259ed915d0006b985e4,573d93d9ed915d00052efb6b,574e9259ed915d0006b985e3,573d93d9ed915d00052efb6b,admin,
2,573e222fed915d0005cc2d07,573e222fed915d0005cc2d06,573d93d9ed915d00052efb6b,573e222fed915d0005cc2d05,573d93d9ed915d00052efb6b,admin,
3,573e222fed915d0005cc2d07,573e222fed915d0005cc2d06,573d93d9ed915d00052efb6b,573e222fed915d0005cc2d05,573d93d9ed915d00052efb6b,admin,
4,573d93d9ed915d00052efb6c,573d93d9ed915d00052efb6a,573d93d9ed915d00052efb6b,573d93d9ed915d00052efb69,573d93d9ed915d00052efb6b,admin,


In [24]:
# Join df with user_roles_df
df = df.merge(
    user_roles_df,
    how="left",
    left_on="user.id",
    right_on="user_roles.userId.$oid"
)

df.head()
df.shape

(1323, 23)

In [25]:
df[~df["roles.name"].isnull()].head()

Unnamed: 0,account.id,account.name,account.date_created,chargebee.status,user.id,user.language,user.date_created,user.email,user.last_name,user.first_name,...,hubspot.number_of_employees,salesforce.is_csda,hubspot.country,user_roles._id.$oid,user_roles.userId.$oid,user_roles.roleId.$oid,user_roles.accountId.$oid,roles._id.$oid,roles.name,roles.accountId.$oid
0,5bbb6f55192103000756bf10,Petzl,2018-10-08 14:53:09.532,active,5c8f5f64505e8a0001b3b83a,English,2019-03-18 09:05:40.716000,goudot@petzl.com,Oudot,Guillaume,...,1000,True,,5c94f6e17d317f00016038f8,5c8f5f64505e8a0001b3b83a,573d93d9ed915d00052efb6b,5bbb6f55192103000756bf10,573d93d9ed915d00052efb6b,admin,
1,5bbb6f55192103000756bf10,Petzl,2018-10-08 14:53:09.532,active,5c8f5f64505e8a0001b3b83a,English,2019-03-18 09:05:40.716000,goudot@petzl.com,Oudot,Guillaume,...,1000,True,,5c94f6e17d317f00016038f8,5c8f5f64505e8a0001b3b83a,573d93d9ed915d00052efb6b,5bbb6f55192103000756bf10,573d93d9ed915d00052efb6b,admin,
2,5bbb6f55192103000756bf10,Petzl,2018-10-08 14:53:09.532,active,5c8f5f64505e8a0001b3b83a,English,2019-03-18 09:05:40.716000,goudot@petzl.com,Oudot,Guillaume,...,1000,True,,5d3efeef8e708400013621c1,5c8f5f64505e8a0001b3b83a,5bbdafc121a89c000768257d,5bbb6f55192103000756bf10,5bbdafc121a89c000768257d,Communication & Marketing_141d4e53fa4f0846_Mem...,5bbb6f55192103000756bf10
3,5bbb6f55192103000756bf10,Petzl,2018-10-08 14:53:09.532,active,5c8f5f64505e8a0001b3b83a,English,2019-03-18 09:05:40.716000,goudot@petzl.com,Oudot,Guillaume,...,1000,True,,619237b6af2494000134892e,5c8f5f64505e8a0001b3b83a,61487553bb49e10001e6ab23,5bbb6f55192103000756bf10,61487553bb49e10001e6ab23,Communication Managers_19d3df5956ed339c_Member...,5bbb6f55192103000756bf10
4,5bbb6f55192103000756bf10,Petzl,2018-10-08 14:53:09.532,active,5c8f5f64505e8a0001b3b83a,English,2019-03-18 09:05:40.716000,goudot@petzl.com,Oudot,Guillaume,...,1000,True,,619238c8e8a38700017e0a67,5c8f5f64505e8a0001b3b83a,618129904ce36700019ca37e,5bbb6f55192103000756bf10,618129904ce36700019ca37e,Task Force CSR/DEI_60094eb93e1e33e4_MembersRole,5bbb6f55192103000756bf10


In [26]:
df["roles.name"] = df["roles.name"].fillna("")
# df["user.is_okr_champion"] = df["roles.name"].map(lambda role: True if "champion" in (_role := role.lower()) or ("okr" in _role and "champion" in _role) else False)

In [27]:
# Drop unnecessary columns
"""
Should I drop all id's?
account.id
user.id

user_roles._id.$oid
user_roles.userId.$oid
user_roles.roleId.$oid
user_roles.accountId.$oid
roles._id.$oid
roles.accountId.$oid

"""

df = df[
    df.columns[
        ~df.columns.isin(
            [
                "account.id",
                "user.id",
                "user_roles._id.$oid",
                "user_roles.userId.$oid",
                "user_roles.roleId.$oid",
                "user_roles.accountId.$oid",
                "roles._id.$oid",
                "roles.accountId.$oid"
            ]
        )
    ]
]

In [28]:
# Format datetime fields as string for output file
# df["account.date_created"] = df["account.date_created"].dt.strftime("%Y/%m/%d")
# df["user.date_created"] = df["user.date_created"].dt.strftime("%Y/%m/%d")

In [29]:
# Save data to csv file
df.to_csv(f"sample_users_180+days_data_{datetime_string(today, '-')}.csv", index=False)