In [1]:
import os
import sys
import pandas as pd
import json
from datetime import datetime, timedelta
from dateutil import parser as datetime_parser
from functions import *

In [2]:
def merge_hubspot_salesforce(hubspot, salesforce):
    """Join HubSpot campaign touches to Salesforce prospects and attach per-day totals.

    Steps:
      1. Left-join ``hubspot`` to ``salesforce`` on
         ``hubspot.salesforce_id == salesforce.id``. The HubSpot ``id`` is kept
         as ``id``; the Salesforce ``id`` is dropped.
      2. Derive the 0/1 flags ``is_mql``, ``has_opportunity``, ``has_meeting``
         and ``has_won_opportunity`` from the Salesforce columns.
      3. Truncate ``campaign.created_at`` to a calendar date.
      4. Aggregate per (``salesforce_id``, ``campaign_paid.value``,
         ``campaign.created_at``). Rows with a null value in any of these keys
         are left out by ``groupby``, so touches without a ``salesforce_id``
         do not appear in the result.
      5. Left-join the aggregates back onto the row-level frame on
         ``salesforce_id``.

    Parameters
    ----------
    hubspot : pandas.DataFrame
        Needs ``id``, ``salesforce_id``, ``campaign_paid.value`` and a
        datetime-typed ``campaign.created_at`` column (``.dt.date`` is applied).
        The frame itself is not modified.
    salesforce : pandas.DataFrame
        Needs ``id``, ``became_mql_date``, ``new_business_count``,
        ``MeetingCount``, ``is_new_business_won_count``,
        ``new_business_opportunity_amount_sum`` and
        ``new_business_won_amount_sum``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``salesforce_id``, ``campaign_paid.value_x``,
        ``campaign.created_at_x``, ``user_count``, ``opportunity_count``,
        ``meeting_count``, ``won_opportunity_count``, ``mql_count``,
        ``new_business_opportunity_amount_sum_x``,
        ``new_business_won_amount_sum_x`` (the aggregates), followed by every
        row-level column. Row-level columns that share a name with an
        aggregate column carry a ``_y`` suffix.
    """
    # Both inputs carry an "id" column, so pandas suffixes them "_x" (HubSpot)
    # and "_y" (Salesforce); keep only the HubSpot one under its original name.
    df = (
        hubspot.merge(salesforce, how="left", left_on="salesforce_id", right_on="id")
        .drop(columns=["id_y"])
        .rename(columns={"id_x": "id"})
    )

    # 1 when the prospect has an MQL date, else 0.
    df["is_mql"] = df["became_mql_date"].notna().astype("int64")

    # 1 when the source counter is present and strictly positive, else 0.
    # Missing counters (unmatched prospects) compare as False.
    for flag, source in (
        ("has_opportunity", "new_business_count"),
        ("has_meeting", "MeetingCount"),
        ("has_won_opportunity", "is_new_business_won_count"),
    ):
        df[flag] = df[source].gt(0).fillna(False).astype("int64")

    # Drop time portion of datetime field
    df["campaign.created_at"] = df["campaign.created_at"].dt.date

    group_by = ["salesforce_id", "campaign_paid.value", "campaign.created_at"]

    # Named aggregation yields the final metric names directly, in this order.
    day_campaign = (
        df.groupby(group_by)
        .agg(
            user_count=("id", "count"),
            opportunity_count=("has_opportunity", "sum"),
            meeting_count=("has_meeting", "sum"),
            won_opportunity_count=("has_won_opportunity", "sum"),
            mql_count=("is_mql", "sum"),
            new_business_opportunity_amount_sum=(
                "new_business_opportunity_amount_sum",
                "sum",
            ),
            new_business_won_amount_sum=("new_business_won_amount_sum", "sum"),
        )
        .reset_index()
    )

    # NOTE(review): the join key is salesforce_id only, not the full group key.
    # A contact with several campaigns/days therefore gets every aggregate row
    # paired with every one of its touches (many-to-many). Later cells depend on
    # the resulting "_x"/"_y" column names, so the key is left unchanged here;
    # confirm whether joining on all of `group_by` was intended.
    return day_campaign.merge(df, how="left", on="salesforce_id")

In [3]:
def keep_x_only(df):
    """Resolve pandas merge suffixes in favour of the left-hand (``_x``) columns.

    Columns whose name ends in ``_x`` lose that suffix, columns whose name ends
    in ``_y`` are removed, and all other columns pass through untouched.
    A new DataFrame is returned; ``df`` is not modified.
    """
    stripped_names = {}
    right_side_columns = []
    for name in df.columns:
        if name.endswith("_x"):
            stripped_names[name] = name[:-2]
        if name.endswith("_y"):
            right_side_columns.append(name)

    # Rename first, then drop, using the names collected from the input frame.
    return df.rename(columns=stripped_names).drop(columns=right_side_columns)

In [4]:
# Run timestamp; it is embedded in the name of the CSV written at the end of the notebook.
# NOTE(review): `datetime_string` is not defined in this notebook - presumably it comes
# from `from functions import *`; its exact output format cannot be confirmed from here.
run_date = datetime_string(datetime.now(), "-")

In [5]:
# Read in hubspot data
# NOTE(review): `get_hubspot_contact_campaign_data` is not defined in this notebook -
# presumably it comes from `from functions import *`. The cells below use its result as
# a pandas DataFrame (`.head()`, `.shape`, boolean filtering).
hubspot = get_hubspot_contact_campaign_data()

In [6]:
# Preview the first rows of the raw HubSpot frame.
hubspot.head()

Unnamed: 0,id,campaign_paid.value,campaign_paid.timestamp,campaign_source.value,salesforce_id,campaign.created_at
0,5551,spotahome_hyper_growth,2021-04-29T09:39:38.109Z,linkedin / impressions,00Q4W00001ROq8FUAT,2021-04-29 09:39:38.109000+00:00
1,5601,spotahome_hyper_growth,2021-04-29T09:39:38.994Z,linkedin / impressions,00Q4W00001RP4oEUAT,2021-04-29 09:39:38.994000+00:00
2,10851,spotahome_hyper_growth,2021-04-29T09:39:39.139Z,linkedin / impressions,,2021-04-29 09:39:39.139000+00:00
3,22551,spotahome_hyper_growth,2021-04-29T09:39:38.446Z,linkedin / impressions,0034W00002NjFl5QAF,2021-04-29 09:39:38.446000+00:00
4,100801,goodlord case study,2021-04-29T09:39:39.018Z,Google,00Q4W00001RP3sPUAT,2021-04-29 09:39:39.018000+00:00


In [7]:
# (rows, columns) before any filtering.
hubspot.shape

(207465, 6)

In [8]:
# Keep only the rows whose paid-campaign value is neither null nor an empty string.
hubspot_campaigns = hubspot.loc[
    hubspot["campaign_paid.value"].notna() & hubspot["campaign_paid.value"].ne("")
]

In [9]:
# (rows, columns) after removing rows without a paid-campaign value.
hubspot_campaigns.shape

(207056, 6)

In [10]:
# _hs = hubspot.groupby(["salesforce_id", "campaign_paid.value"]).size().reset_index()

In [11]:
# _hs[_hs[0] > 1]

In [12]:
# Sample
# ultim. okrs - e-book - download - lp - lal
# 0034W00002hPw9RQAS
# sample = hubspot[
#     (hubspot["salesforce_id"] == "0034W00002hPw9RQAS")
#     & (hubspot["campaign_paid.value"] == "ultim. okrs - e-book - download - lp - lal")
# ]
# sample.head()

In [13]:
# hs = hs[
#     hs["campaign_paid.value"]
#     == "lead-multi-axis-ebook-cso-companies-noam-linkedin"
# ]

In [14]:
# Load the Salesforce prospect export; the path is relative to the working directory.
# NOTE(review): the saved output of this cell holds the tail of a warning
# (`exec(code_obj, ...)`). If that was pandas' mixed-type DtypeWarning, consider passing
# an explicit `dtype=` mapping so column types are stable between runs.
salesforce = pd.read_csv(os.path.join("data", "salesforce", "salesforce_prospects_data_for_ads_v5.csv"))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [15]:
# unique_combos = []
# first_touches = []
# for index, row in hubspot_campaigns.iterrows():
#     salesforce_id = row["salesforce_id"]
#     campaign = row["campaign_paid.value"]
#     combo = (salesforce_id, campaign)

#     if salesforce_id == "":
#         first_touches.append(False)
#     elif combo not in unique_combos:
#         unique_combos.append(combo)
#         first_touches.append(True)
#     else:
#         first_touches.append(False)

# hubspot_campaigns["is_first_touch"] = first_touches

In [16]:
# First touch per contact/campaign pair: order the touches by contact, then by creation
# time, and keep the first row seen for every (salesforce_id, campaign_paid.value).
hubspot_first_campaigns = (
    hubspot_campaigns
    .sort_values(by=["salesforce_id", "campaign.created_at", "campaign_paid.value"])
    .drop_duplicates(
        subset=["salesforce_id", "campaign_paid.value"],
        keep="first",
        ignore_index=True,
    )
)

In [17]:
# Columns available on the first-touch frame.
hubspot_first_campaigns.columns

Index(['id', 'campaign_paid.value', 'campaign_paid.timestamp',
       'campaign_source.value', 'salesforce_id', 'campaign.created_at'],
      dtype='object')

In [18]:
# Metric column names that exist once for the all-touch frame and once for the
# first-touch frame (they are given "all_" / "first_" prefixes further down).
# NOTE(review): in the visible notebook this list is only referenced from
# commented-out cells; the active rename cells spell the names out by hand.
unique_headers = [
    "user_count",
    "opportunity_count",
    "meeting_count",
    "won_opportunity_count",
    "mql_count",
    "new_business_opportunity_amount_sum",
    "new_business_won_amount_sum",
]

In [19]:
# All-touch view: aggregate over every HubSpot row that has a paid-campaign value.
hs_sf_contact_campaign_prospect_df_all = merge_hubspot_salesforce(hubspot_campaigns, salesforce)

In [20]:
# Inspect the merged columns; the "_x"/"_y" names listed here are the ones the
# following cell renames and drops.
hs_sf_contact_campaign_prospect_df_all.columns

Index(['salesforce_id', 'campaign_paid.value_x', 'campaign.created_at_x',
       'user_count', 'opportunity_count', 'meeting_count',
       'won_opportunity_count', 'mql_count',
       'new_business_opportunity_amount_sum_x',
       'new_business_won_amount_sum_x', 'id', 'campaign_paid.value_y',
       'campaign_paid.timestamp', 'campaign_source.value',
       'campaign.created_at_y', 'became_mql_date', 'title', 'job_function',
       'account_id', 'sub_region', 'industry', 'state', 'country', 'region',
       'new_business_opportunity_amount_sum_y', 'new_business_count',
       'is_new_business_won_count', 'new_business_won_amount_sum_y',
       'call_count', 'email_count', 'other_count', 'MeetingCount',
       'first_meeting_date', 'first_opportunity_won_date', 'is_mql',
       'has_opportunity', 'has_meeting', 'has_won_opportunity'],
      dtype='object')

In [21]:
# Tidy the all-touch frame: give the group keys their plain names back, prefix the
# aggregate metrics with "all_", and drop the row-level columns that are not exported.
# NOTE: this cell overwrites its own input, so it can only be run once per kernel
# session - a second run fails in `.drop` because the columns are already gone.
hs_sf_contact_campaign_prospect_df_all = (
    hs_sf_contact_campaign_prospect_df_all
    .rename(
        columns={
            # group keys (aggregate side of the merge)
            "campaign_paid.value_x": "campaign_paid.value",
            "campaign.created_at_x": "campaign.created_at",
            # aggregate metrics
            "user_count": "all_user_count",
            "opportunity_count": "all_opportunity_count",
            "meeting_count": "all_meeting_count",
            "won_opportunity_count": "all_won_opportunity_count",
            "mql_count": "all_mql_count",
            "new_business_opportunity_amount_sum_x": "all_new_business_opportunity_amount_sum",
            "new_business_won_amount_sum_x": "all_new_business_won_amount_sum",
        }
    )
    .drop(
        columns=[
            # row-level copies of the group keys
            "campaign_paid.value_y",
            "campaign.created_at_y",
            # row-level Salesforce amounts and counts
            "new_business_opportunity_amount_sum_y",
            "new_business_won_amount_sum_y",
            "new_business_count",
            "is_new_business_won_count",
            "call_count",
            "email_count",
            "other_count",
            "MeetingCount",
            # 0/1 helper flags created inside merge_hubspot_salesforce
            "is_mql",
            "has_opportunity",
            "has_meeting",
            "has_won_opportunity",
        ]
    )
)

In [22]:
# hs_sf_contact_campaign_prospect_df_all = keep_x_only(hs_sf_contact_campaign_prospect_df_all)

In [23]:
# hs_sf_contact_campaign_prospect_df_all.rename(
#     columns={old_header: "all_" + old_header for old_header in unique_headers},
#     inplace=True,
# )


In [41]:
# First-touch view: same aggregation, restricted to the first row per contact/campaign pair.
hs_sf_contact_campaign_prospect_df_first = merge_hubspot_salesforce(hubspot_first_campaigns, salesforce)

In [42]:
# Tidy the first-touch frame: give the group keys their plain names back, prefix the
# aggregate metrics with "first_", and drop the row-level columns that are not exported.
# NOTE: this cell overwrites its own input, so it can only be run once per kernel
# session - a second run fails in `.drop` because the columns are already gone.
hs_sf_contact_campaign_prospect_df_first = (
    hs_sf_contact_campaign_prospect_df_first
    .rename(
        columns={
            # group keys (aggregate side of the merge)
            "campaign_paid.value_x": "campaign_paid.value",
            "campaign.created_at_x": "campaign.created_at",
            # aggregate metrics
            "user_count": "first_user_count",
            "opportunity_count": "first_opportunity_count",
            "meeting_count": "first_meeting_count",
            "won_opportunity_count": "first_won_opportunity_count",
            "mql_count": "first_mql_count",
            "new_business_opportunity_amount_sum_x": "first_new_business_opportunity_amount_sum",
            "new_business_won_amount_sum_x": "first_new_business_won_amount_sum",
        }
    )
    .drop(
        columns=[
            # row-level copies of the group keys
            "campaign_paid.value_y",
            "campaign.created_at_y",
            # row-level Salesforce amounts and counts
            "new_business_opportunity_amount_sum_y",
            "new_business_won_amount_sum_y",
            "new_business_count",
            "is_new_business_won_count",
            "call_count",
            "email_count",
            "other_count",
            "MeetingCount",
            # 0/1 helper flags created inside merge_hubspot_salesforce
            "is_mql",
            "has_opportunity",
            "has_meeting",
            "has_won_opportunity",
        ]
    )
)

In [43]:
# hs_sf_contact_campaign_prospect_df_first = keep_x_only(hs_sf_contact_campaign_prospect_df_first)

In [44]:
# hs_sf_contact_campaign_prospect_df_all.rename(
#     columns={old_header: "first_" + old_header for old_header in unique_headers},
#     inplace=True,
# )


In [45]:
# Preview the tidied first-touch frame.
hs_sf_contact_campaign_prospect_df_first.head()

Unnamed: 0,salesforce_id,campaign_paid.value,campaign.created_at,first_user_count,first_opportunity_count,first_meeting_count,first_won_opportunity_count,first_mql_count,first_new_business_opportunity_amount_sum,first_new_business_won_amount_sum,...,title,job_function,account_id,sub_region,industry,state,country,region,first_meeting_date,first_opportunity_won_date
0,0034W00002M8ygTQAR,ultim. okrs - e-book - download - lp - lal,2021-06-21,1,0,0,0,0,0.0,0.0,...,,,,,,,,,,
1,0034W00002M8zbcQAB,ultim. okrs - e-book - download - lp - lal,2021-06-21,1,0,0,0,0,0.0,0.0,...,,,,,,,,,,
2,0034W00002M90P4QAJ,ultim. okrs - e-book - download - leadgen form...,2021-04-29,1,0,0,0,0,0.0,0.0,...,,,,,,,,,,
3,0034W00002M90dAQAR,lead - ultim. okrs - e-book - download - search,2021-04-29,1,0,0,0,0,0.0,0.0,...,,,,,,,,,,
4,0034W00002M90lKQAR,lead - ultim. okrs - e-book - download - search,2021-04-29,1,0,0,0,0,0.0,0.0,...,,,,,,,,,,


In [46]:
# Attach the first-touch metrics to the all-touch rows that share the same contact,
# campaign and day. Non-key columns present in both frames come out with "_x"
# (all-touch) and "_y" (first-touch) suffixes.
hs_sf = pd.merge(
    hs_sf_contact_campaign_prospect_df_all,
    hs_sf_contact_campaign_prospect_df_first,
    how="left",
    on=["salesforce_id", "campaign_paid.value", "campaign.created_at"],
)

In [47]:
# List the combined columns, including the "_x"/"_y" pairs produced by the merge above.
hs_sf.columns.tolist()

['salesforce_id',
 'campaign_paid.value',
 'campaign.created_at',
 'all_user_count',
 'all_opportunity_count',
 'all_meeting_count',
 'all_won_opportunity_count',
 'all_mql_count',
 'all_new_business_opportunity_amount_sum',
 'all_new_business_won_amount_sum',
 'id_x',
 'campaign_paid.timestamp_x',
 'campaign_source.value_x',
 'became_mql_date_x',
 'title_x',
 'job_function_x',
 'account_id_x',
 'sub_region_x',
 'industry_x',
 'state_x',
 'country_x',
 'region_x',
 'first_meeting_date_x',
 'first_opportunity_won_date_x',
 'first_user_count',
 'first_opportunity_count',
 'first_meeting_count',
 'first_won_opportunity_count',
 'first_mql_count',
 'first_new_business_opportunity_amount_sum',
 'first_new_business_won_amount_sum',
 'id_y',
 'campaign_paid.timestamp_y',
 'campaign_source.value_y',
 'became_mql_date_y',
 'title_y',
 'job_function_y',
 'account_id_y',
 'sub_region_y',
 'industry_y',
 'state_y',
 'country_y',
 'region_y',
 'first_meeting_date_y',
 'first_opportunity_won_date_y'

In [48]:
# def is_added_metric(col):
#     added_metrics = [
#         "user_count",
#         "opportunity_count",
#         "meeting_count",
#         "won_opportunity_count",
#         "mql_count",
#         "new_business_opportunity_amount_sum",
#         "new_business_won_amount_sum",
#     ]

#     for added_metric in added_metrics:
#         if added_metric in col:
#             return True    
    
#     return False

In [49]:
# new_column_names = []
# remove_columns = []
# for col in hs_sf.columns:
#     if col.endswith("_x"):
#         if is_added_metric(col):
#             new_column_names.append("all_" + col[:-2])
#         else:
#             new_column_names.append(col[:-2])
#     elif col.endswith("_y"):
#         if is_added_metric(col):
#             new_column_names.append("first_" + col[:-2])
#         else:
#             new_column_names.append(col)
#             remove_columns.append(col)
#     else:
#         new_column_names.append(col)

In [50]:
# new_column_names

In [51]:
# hs_sf.columns = new_column_names

In [52]:
# hs_sf = hs_sf.drop(columns=remove_columns, inplace=False)

In [53]:
# Same listing as the earlier `hs_sf.columns.tolist()` cell: the cells in between are
# commented out, so the columns are unchanged.
hs_sf.columns.tolist()

['salesforce_id',
 'campaign_paid.value',
 'campaign.created_at',
 'all_user_count',
 'all_opportunity_count',
 'all_meeting_count',
 'all_won_opportunity_count',
 'all_mql_count',
 'all_new_business_opportunity_amount_sum',
 'all_new_business_won_amount_sum',
 'id_x',
 'campaign_paid.timestamp_x',
 'campaign_source.value_x',
 'became_mql_date_x',
 'title_x',
 'job_function_x',
 'account_id_x',
 'sub_region_x',
 'industry_x',
 'state_x',
 'country_x',
 'region_x',
 'first_meeting_date_x',
 'first_opportunity_won_date_x',
 'first_user_count',
 'first_opportunity_count',
 'first_meeting_count',
 'first_won_opportunity_count',
 'first_mql_count',
 'first_new_business_opportunity_amount_sum',
 'first_new_business_won_amount_sum',
 'id_y',
 'campaign_paid.timestamp_y',
 'campaign_source.value_y',
 'became_mql_date_y',
 'title_y',
 'job_function_y',
 'account_id_y',
 'sub_region_y',
 'industry_y',
 'state_y',
 'country_y',
 'region_y',
 'first_meeting_date_y',
 'first_opportunity_won_date_y'

In [54]:
# Collapse the merge suffixes: keep the all-touch ("_x") copy of every shared column
# under its plain name and discard the first-touch ("_y") duplicates.
_hs_sf = keep_x_only(hs_sf)

In [56]:
# Write the combined table to
# output/hubspot_salesforce/hubspot_salesforce_prospect_campaign_day_<run_date>.csv.
# `DataFrame.to_csv` does not create missing parent directories, so make sure the
# target folder exists first; otherwise the write raises OSError.
os.makedirs(os.path.join("output", "hubspot_salesforce"), exist_ok=True)
_hs_sf.to_csv(
    os.path.join(
        "output",
        "hubspot_salesforce",
        f"hubspot_salesforce_prospect_campaign_day_{run_date}.csv",
    ),
    index=False,
)