In [1]:
import os
import sys
from random import sample
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
from dotenv import load_dotenv
import pandas as pd
import datetime
from dateutil import parser as date_parser
import json
from pprint import pprint
from functions_2 import *

In [2]:
# Environment vars
load_dotenv()
connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')

In [3]:
# Hardcoded date
today = datetime.datetime(year=2022, month=4, day=7)
today_str = datetime_string(today, "/")

In [4]:
# Instantiate blob service client
try:
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
except Exception as e:
    print(f'Unable to connect to BlobServiceClient: {e}')

In [5]:
# Fetching users, accounts, and chargebee subscriptions.
# Merging all tables into one.
df = get_users_accounts_subscriptions_data(blob_service_client, today_str)

Fetching user data...
<class 'pandas.core.frame.DataFrame'>
Int64Index: 88329 entries, 3 to 90338
Data columns (total 10 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   _id.$oid                           88329 non-null  object 
 1   accountId.$oid                     88329 non-null  object 
 2   language                           88289 non-null  float64
 3   dateCreated.$date                  88329 non-null  int64  
 4   email                              88329 non-null  object 
 5   auth0Cache.usermetadata.lastName   88248 non-null  object 
 6   auth0Cache.usermetadata.firstName  88256 non-null  object 
 7   subscriptionType                   88329 non-null  object 
 8   isPrimary                          88329 non-null  bool   
 9   auth0Cache.usermetadata.demo       161 non-null    object 
dtypes: bool(1), float64(1), int64(1), object(7)
memory usage: 6.8+ MB
None
Fetching account data...


In [6]:
df["user|dateCreated.$date"].head()
df[df["user|dateCreated.$date"] != None].head()

Unnamed: 0,account|_id.$oid,account|name,account|domain,account|isActive,account|type,account|trialEnds.$date,account|dateCreated.$date,account|ownerId.$oid,account|edition,account|subscriptionId,...,user|language,user|dateCreated.$date,user|email,user|auth0Cache.usermetadata.lastName,user|auth0Cache.usermetadata.firstName,user|subscriptionType,user|isPrimary,user|auth0Cache.usermetadata.demo,account|id,account|users_count
0,5b14ee04854ff50007e2a077,Azavista,azavista,True,1.0,1528897421000,2018-06-04 07:45:08.977,5b14ee05854ff50007e2a078,gtmhub-summit,1mk51ZXQtzEmDUIfF,...,0.0,2018-06-07 14:13:32.294000,jp@azavista.com,van der Kuijl,JP,regular,True,,5b14ee04854ff50007e2a077,2
1,5b14ee04854ff50007e2a077,Azavista,azavista,True,1.0,1528897421000,2018-06-04 07:45:08.977,5b14ee05854ff50007e2a078,gtmhub-summit,1mk51ZXQtzEmDUIfF,...,0.0,2018-06-04 07:45:09,gabriel.thomaidis@azavista.com,Thomaidis,Gabriel,regular,True,,5b14ee04854ff50007e2a077,2
2,5b1e3680d8c5b500075ecfd3,wetransform GmbH,wetransform,True,1.0,1529311488473,2018-06-11 08:44:48.473,5b1e3680d8c5b500075ecfd4,gtmhub-summit,1mkVvueQuePTWUOmB,...,0.0,2018-06-11 08:44:48.498000,tr@wetransform.to,Reitz,Thorsten,regular,True,,5b1e3680d8c5b500075ecfd3,4
3,5b1e3680d8c5b500075ecfd3,wetransform GmbH,wetransform,True,1.0,1529311488473,2018-06-11 08:44:48.473,5b1e3680d8c5b500075ecfd4,gtmhub-summit,1mkVvueQuePTWUOmB,...,0.0,2018-06-12 12:20:29.635000,st@wetransform.to,Templer,Simon,regular,True,,5b1e3680d8c5b500075ecfd3,4
4,5b1e3680d8c5b500075ecfd3,wetransform GmbH,wetransform,True,1.0,1529311488473,2018-06-11 08:44:48.473,5b1e3680d8c5b500075ecfd4,gtmhub-summit,1mkVvueQuePTWUOmB,...,0.0,2019-06-25 15:40:06.882000,ch@wetransform.to,Hönn,Christopher,regular,True,,5b1e3680d8c5b500075ecfd3,4


In [7]:
# Last 60 days
threshold_date = today - datetime.timedelta(days=60)

In [8]:
# Filter on users created at least 60 days ago
df = df[df["user|dateCreated.$date"] <= threshold_date]

In [9]:
# Number of unique users existing for at least 60 days
df.shape[0]

35074

In [10]:
# Sample 250 users, but no more than 1 user per account

# Sample 250 unique accounts
sample_accounts = sample(df["account|_id.$oid"].unique().tolist(), 250)
# df = df[df["account|_id.$oid"].isin(sample_accounts)]

# Choose 1 user for each of the accounts sampled
sample_users = [
    sample(
        df[df["account|_id.$oid"] == acc]["user|_id.$oid"].tolist(), 
        1
    )[0]
    for acc in sample_accounts # sample(df["user|_id.$oid"].tolist(), 250)
]

len(sample_users)

250

In [11]:
sample_users[:10]

['60a7bedb58321a00010e969c',
 '5f3ceb72fc68560001f9543d',
 '61e55f0b0d823f000192d84f',
 '5f185b989d29880001ba0140',
 '60d0b5045535ee00013124bd',
 '609072517453d2000154bfc2',
 '5dc47cb0ee825700017381f9',
 '610bdb2a61c125000103f96b',
 '5eba5f58575a9e0001afd057',
 '5df7bc05b874bc00018dfdac']

In [12]:
df = df[df["user|_id.$oid"].isin(sample_users)]

In [13]:
df.head()

Unnamed: 0,account|_id.$oid,account|name,account|domain,account|isActive,account|type,account|trialEnds.$date,account|dateCreated.$date,account|ownerId.$oid,account|edition,account|subscriptionId,...,user|language,user|dateCreated.$date,user|email,user|auth0Cache.usermetadata.lastName,user|auth0Cache.usermetadata.firstName,user|subscriptionType,user|isPrimary,user|auth0Cache.usermetadata.demo,account|id,account|users_count
1015,5dbe9552fc535d0001015133,Advery,advery,True,1.0,1573375954305,2019-11-03 08:52:34.305,5dbe9552fc535d0001015134,gtmhub-start-v3,2sUBzx2uRgkVU1I1lWb,...,1.0,2021-10-06 20:24:18.590000,falk.kreuger@brueggli.ch,,falk.kreuger,regular,True,,5dbe9552fc535d0001015133,49
1725,5836fdcbed915d0007090826,Prime Holding,prime,True,1.0,0,2016-11-24 00:00:00.000,58939d52ed915d0005d0fb68,enterprise,AzZdQbSagtvqqMHZ,...,0.0,2018-11-05 12:01:12.621000,bilyana.baycheva@sbnd.net,Baycheva,Biliana,regular,True,,5836fdcbed915d0007090826,1
2502,5da59040f862dc0001f2ae06,Mews,mews,True,1.0,1572511612000,2019-10-15 09:24:16.957,5da59041f862dc0001f2ae07,gtmhub-summit,2skdb1bJRexXTqJEYY,...,0.0,2019-12-16 17:16:53.040000,craig.bonafont@mews.com,Bonafont,Craig,regular,True,,5da59040f862dc0001f2ae06,44
2671,5b4c5eacf85c070007d02b68,Solitea,sbs,True,1.0,1532334329000,2018-07-16 09:00:28.255,5b4c5eacf85c070007d02b69,gtmhub-start-v3,1mkVvrbQxx7oWuHGB,...,0.0,2022-02-02 12:41:31.436000,marian.videka@mainstream.cz,Videka,marian.videka,regular,True,,5b4c5eacf85c070007d02b68,96
2841,5df25fa696e9cd00013ae7c0,Scruff of the Neck,scruff-of-the-neck,True,1.0,1630367999000,2019-12-12 15:41:26.187,5df25fa696e9cd00013ae7c1,gtmhub-start-v3,16A1AZRkSCuoW10SA,...,0.0,2020-10-08 15:07:01.302000,joe.cockburn@scruffoftheneck.com,Cockburn,Joe,regular,True,,5df25fa696e9cd00013ae7c0,11


In [14]:
df.shape

(250, 53)

In [15]:
# Read in CSDA data
csda_df = pd.read_csv(os.path.join(os.pardir, "shared_data", "csda_salesforce_export.csv"))

# Rename columns
csda_df.columns = [
    "_".join(column.lower().split())
    for column in csda_df.columns
]

# Add prefix for clarity after join
csda_df = csda_df.add_prefix("salesforce|")

In [16]:
# Left join CSDA table with main dataframe (df)
df = df.merge(
    csda_df,
    how="left",
    left_on="account|_id.$oid",
    right_on="salesforce|chargebee_id"
)

In [17]:
# Get Hubspot data
hubspot_df = get_hubspot_data()

Fetching Hubspot data...
Hubspot contacts...
Hubspot companies...


In [18]:
# Merge users with HubSpot data
df = df.merge(
    hubspot_df,
    how="left",
    left_on="user|email",
    right_on="hubspot|email"
)

In [19]:
# Add CSDA column
df["salesforce|team_member_name"] = df["salesforce|team_member_name"].fillna("-")
df["salesforce|is_csda"] = df["salesforce|team_member_name"].map(
    lambda x: True if x != "-" else False
)

In [20]:
# Rename columns intended to be kept
column_name_remapping = {
    "account|_id.$oid": "account.id",
    "account|name": "account.name",
    "account|dateCreated.$date": "account.date_created",
    "chargebee|status": "chargebee.status",
    "user|_id.$oid": "user.id",
    "user|language": "user.language",
    "user|dateCreated.$date": "user.date_created",
    "user|email": "user.email",
    "user|auth0Cache.usermetadata.lastName": "user.last_name",
    "user|auth0Cache.usermetadata.firstName": "user.first_name",
    "account|users_count": "account.users_count",
    "hubspot|jobtitle": "hubspot.job_title",
    "hubspot|industry": "hubspot.industry",
    "hubspot|numberofemployees": "hubspot.number_of_employees",
    "salesforce|is_csda": "salesforce.is_csda",
    "hubspot|country": "hubspot.country"
}

# Keep only the keys of mapping columns
df = df[column_name_remapping.keys()]

# Change names of columns based on mapping
df = df.rename(columns=column_name_remapping)

In [21]:
# Change language code to categorical
language_code_mapping = {
    0: "English",
    1: "German",
    2: "Chinese",
    3: "Bulgarian",
    4: "Spanish",
    5: "French",
    6: "Portuguese"
}

df = df.replace({"user.language": language_code_mapping})

In [22]:
df.head()

Unnamed: 0,account.id,account.name,account.date_created,chargebee.status,user.id,user.language,user.date_created,user.email,user.last_name,user.first_name,account.users_count,hubspot.job_title,hubspot.industry,hubspot.number_of_employees,salesforce.is_csda,hubspot.country
0,5dbe9552fc535d0001015133,Advery,2019-11-03 08:52:34.305,active,615e05f268dff200012d0cf7,German,2021-10-06 20:24:18.590000,falk.kreuger@brueggli.ch,,falk.kreuger,49,,,,False,
1,5836fdcbed915d0007090826,Prime Holding,2016-11-24 00:00:00.000,active,5be031087b36380001172075,English,2018-11-05 12:01:12.621000,bilyana.baycheva@sbnd.net,Baycheva,Biliana,1,,,,True,
2,5da59040f862dc0001f2ae06,Mews,2019-10-15 09:24:16.957,active,5df7bc05b874bc00018dfdac,English,2019-12-16 17:16:53.040000,craig.bonafont@mews.com,Bonafont,Craig,44,,,,False,
3,5b4c5eacf85c070007d02b68,Solitea,2018-07-16 09:00:28.255,active,61fa7bfb5498de00013e1689,English,2022-02-02 12:41:31.436000,marian.videka@mainstream.cz,Videka,marian.videka,96,,,,False,
4,5df25fa696e9cd00013ae7c0,Scruff of the Neck,2019-12-12 15:41:26.187,active,5f7f2b15f7d6770001587b27,English,2020-10-08 15:07:01.302000,joe.cockburn@scruffoftheneck.com,Cockburn,Joe,11,,,,False,


In [23]:
# Fetch user_roles data
user_roles_df = get_user_roles_table(blob_service_client, today_str)
user_roles_df.head()

Unnamed: 0,user_roles._id.$oid,user_roles.userId.$oid,user_roles.roleId.$oid,user_roles.accountId.$oid
0,574e9259ed915d0006b985e5,574e9259ed915d0006b985e4,573d93d9ed915d00052efb6b,574e9259ed915d0006b985e3
1,573e222fed915d0005cc2d07,573e222fed915d0005cc2d06,573d93d9ed915d00052efb6b,573e222fed915d0005cc2d05
2,573d93d9ed915d00052efb6c,573d93d9ed915d00052efb6a,573d93d9ed915d00052efb6b,573d93d9ed915d00052efb69
3,5746e3c8ed915d0005cc319c,5746e3c8ed915d0005cc319b,573d93d9ed915d00052efb6b,5746e3c8ed915d0005cc319a
4,573f32bded915d0005cc2e5d,573f32bded915d0005cc2e5c,573d93d9ed915d00052efb6b,573f32bded915d0005cc2e5b


In [24]:
# Fetch roles data
roles_df = get_roles_table(blob_service_client, today_str)
roles_df.head()

Unnamed: 0,roles._id.$oid,roles.name,roles.accountId.$oid
0,573dbb61ed915d0005cc2c4d,user,
1,5a9f9cbee5274a0007acfcf9,Company B,57fb5f7bed915d0006582898
2,5b86ad85df457100079c04e4,Admin II,57fb5f7bed915d0006582898
3,5b8d2accf9159100080416a2,Air (bo's experiment),573dbb12ed915d0005cc2c46
4,58f632d3ed915d0005e9ef6c,test role,58822288ed915d0005afa6ee


In [25]:
# Join user_roles with roles
user_roles_df = user_roles_df.merge(
    roles_df,
    how="left",
    left_on="user_roles.roleId.$oid",
    right_on="roles._id.$oid"
)

user_roles_df.head()

Unnamed: 0,user_roles._id.$oid,user_roles.userId.$oid,user_roles.roleId.$oid,user_roles.accountId.$oid,roles._id.$oid,roles.name,roles.accountId.$oid
0,574e9259ed915d0006b985e5,574e9259ed915d0006b985e4,573d93d9ed915d00052efb6b,574e9259ed915d0006b985e3,573d93d9ed915d00052efb6b,admin,
1,574e9259ed915d0006b985e5,574e9259ed915d0006b985e4,573d93d9ed915d00052efb6b,574e9259ed915d0006b985e3,573d93d9ed915d00052efb6b,admin,
2,573e222fed915d0005cc2d07,573e222fed915d0005cc2d06,573d93d9ed915d00052efb6b,573e222fed915d0005cc2d05,573d93d9ed915d00052efb6b,admin,
3,573e222fed915d0005cc2d07,573e222fed915d0005cc2d06,573d93d9ed915d00052efb6b,573e222fed915d0005cc2d05,573d93d9ed915d00052efb6b,admin,
4,573d93d9ed915d00052efb6c,573d93d9ed915d00052efb6a,573d93d9ed915d00052efb6b,573d93d9ed915d00052efb69,573d93d9ed915d00052efb6b,admin,


In [26]:
unique_user_roles_df = get_roles_by_user(user_roles_df)
unique_user_roles_df.head()

Unnamed: 0,user_roles.userId.$oid,roles
0,573d9359ed915d00052efb10,SysAdmin
1,573d93d9ed915d00052efb6a,admin
2,573db6aeed915d0005cc2bc5,admin
3,573dbb12ed915d0005cc2c47,"admin, user, user+, Ivan Osmak's team_7ba678c2..."
4,573dbb61ed915d0005cc2c4c,"admin, user, user+, Engineering_a8a9b5181dbd8f..."


In [27]:
# Join df with user_roles_df
df = df.merge(
    unique_user_roles_df,
    how="left",
    left_on="user.id",
    right_on="user_roles.userId.$oid"
)

df.head()

Unnamed: 0,account.id,account.name,account.date_created,chargebee.status,user.id,user.language,user.date_created,user.email,user.last_name,user.first_name,account.users_count,hubspot.job_title,hubspot.industry,hubspot.number_of_employees,salesforce.is_csda,hubspot.country,user_roles.userId.$oid,roles
0,5dbe9552fc535d0001015133,Advery,2019-11-03 08:52:34.305,active,615e05f268dff200012d0cf7,German,2021-10-06 20:24:18.590000,falk.kreuger@brueggli.ch,,falk.kreuger,49,,,,False,,615e05f268dff200012d0cf7,"user, Brüggli Medien Beratung_7fd187d76ecc9d87..."
1,5836fdcbed915d0007090826,Prime Holding,2016-11-24 00:00:00.000,active,5be031087b36380001172075,English,2018-11-05 12:01:12.621000,bilyana.baycheva@sbnd.net,Baycheva,Biliana,1,,,,True,,5be031087b36380001172075,"user, Design Team_e3167a16d7584dfa_MembersRole..."
2,5da59040f862dc0001f2ae06,Mews,2019-10-15 09:24:16.957,active,5df7bc05b874bc00018dfdac,English,2019-12-16 17:16:53.040000,craig.bonafont@mews.com,Bonafont,Craig,44,,,,False,,5df7bc05b874bc00018dfdac,"user, com_c65235d138f7e4d1_MembersRole, com_11..."
3,5b4c5eacf85c070007d02b68,Solitea,2018-07-16 09:00:28.255,active,61fa7bfb5498de00013e1689,English,2022-02-02 12:41:31.436000,marian.videka@mainstream.cz,Videka,marian.videka,96,,,,False,,61fa7bfb5498de00013e1689,"user, EP_3f1ff9ee9d87831b_MembersRole, Mainstr..."
4,5df25fa696e9cd00013ae7c0,Scruff of the Neck,2019-12-12 15:41:26.187,active,5f7f2b15f7d6770001587b27,English,2020-10-08 15:07:01.302000,joe.cockburn@scruffoftheneck.com,Cockburn,Joe,11,,,,False,,5f7f2b15f7d6770001587b27,user


In [28]:
df["roles"] = df["roles"].fillna("")
# df["user.is_okr_champion"] = df["roles"].map(lambda role: True if "champion" in (_role := role.lower()) or ("okr" in _role and "champion" in _role) else False)

In [29]:
# Drop unnecessary columns
"""
Should I drop all id's?
account.id
user.id

user_roles._id.$oid
user_roles.userId.$oid
user_roles.roleId.$oid
user_roles.accountId.$oid
roles._id.$oid
roles.accountId.$oid

"""

df = df[
    df.columns[
        ~df.columns.isin(
            [
                "account.id",
                "user.id",
                "user_roles._id.$oid",
                "user_roles.userId.$oid",
                "user_roles.roleId.$oid",
                "user_roles.accountId.$oid",
                "roles._id.$oid",
                "roles.accountId.$oid"
            ]
        )
    ]
]

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250 entries, 0 to 249
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   account.name                 250 non-null    object        
 1   account.date_created         250 non-null    datetime64[ns]
 2   chargebee.status             250 non-null    object        
 3   user.language                250 non-null    object        
 4   user.date_created            250 non-null    object        
 5   user.email                   250 non-null    object        
 6   user.last_name               250 non-null    object        
 7   user.first_name              250 non-null    object        
 8   account.users_count          250 non-null    int64         
 9   hubspot.job_title            45 non-null     object        
 10  hubspot.industry             40 non-null     object        
 11  hubspot.number_of_employees  68 non-null     

In [31]:
df["user.date_created"] = df["user.date_created"].astype("datetime64[ns]")
df["account.date_created"] = df["account.date_created"].astype("datetime64[ns]")

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250 entries, 0 to 249
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   account.name                 250 non-null    object        
 1   account.date_created         250 non-null    datetime64[ns]
 2   chargebee.status             250 non-null    object        
 3   user.language                250 non-null    object        
 4   user.date_created            250 non-null    datetime64[ns]
 5   user.email                   250 non-null    object        
 6   user.last_name               250 non-null    object        
 7   user.first_name              250 non-null    object        
 8   account.users_count          250 non-null    int64         
 9   hubspot.job_title            45 non-null     object        
 10  hubspot.industry             40 non-null     object        
 11  hubspot.number_of_employees  68 non-null     

In [33]:
df.head()

Unnamed: 0,account.name,account.date_created,chargebee.status,user.language,user.date_created,user.email,user.last_name,user.first_name,account.users_count,hubspot.job_title,hubspot.industry,hubspot.number_of_employees,salesforce.is_csda,hubspot.country,roles
0,Advery,2019-11-03 08:52:34.305,active,German,2021-10-06 20:24:18.590,falk.kreuger@brueggli.ch,,falk.kreuger,49,,,,False,,"user, Brüggli Medien Beratung_7fd187d76ecc9d87..."
1,Prime Holding,2016-11-24 00:00:00.000,active,English,2018-11-05 12:01:12.621,bilyana.baycheva@sbnd.net,Baycheva,Biliana,1,,,,True,,"user, Design Team_e3167a16d7584dfa_MembersRole..."
2,Mews,2019-10-15 09:24:16.957,active,English,2019-12-16 17:16:53.040,craig.bonafont@mews.com,Bonafont,Craig,44,,,,False,,"user, com_c65235d138f7e4d1_MembersRole, com_11..."
3,Solitea,2018-07-16 09:00:28.255,active,English,2022-02-02 12:41:31.436,marian.videka@mainstream.cz,Videka,marian.videka,96,,,,False,,"user, EP_3f1ff9ee9d87831b_MembersRole, Mainstr..."
4,Scruff of the Neck,2019-12-12 15:41:26.187,active,English,2020-10-08 15:07:01.302,joe.cockburn@scruffoftheneck.com,Cockburn,Joe,11,,,,False,,user


In [34]:
# Format datetime fields as string for output file
df["account.date_created"] = df["account.date_created"].dt.strftime("%Y/%m/%d")
df["user.date_created"] = df["user.date_created"].dt.strftime("%Y/%m/%d")

In [35]:
df.shape

(250, 15)

In [36]:
# Save data to csv file
df.to_csv(f"sample_users_180+days_data_{datetime_string(today, '-')}.csv", index=False)