# Get Salesforce Data to Join with Marketing Data

## Set up and load data

In [1]:
from toolbox.datalake import DataLake
from simple_salesforce import Salesforce
from dotenv import load_dotenv
import pandas as pd
import datetime
import json
import os

In [2]:
load_dotenv()
CONNECT_STR = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
SALESFORCE_USERNAME = os.getenv('SALESFORCE_USERNAME')
SALESFORCE_PASSWORD = os.getenv('SALESFORCE_PASSWORD')
SALESFORCE_TOKEN = os.getenv('SALESFORCE_TOKEN')

In [3]:
dl = DataLake(CONNECT_STR)

In [4]:
BASE_SF = 'Unprocessed/Salesforce/2022/04/14'

In [5]:
leads = dl.get_json_lines_as_df(f'{BASE_SF}/Lead.json')
contacts = dl.get_json_lines_as_df(f'{BASE_SF}/Contact.json')
oppconroles = dl.get_json_lines_as_df(f'{BASE_SF}/OpportunityContactRole.json')
opportunities = dl.get_json_lines_as_df(f'{BASE_SF}/Opportunity.json')
tasks = dl.get_json_lines_as_df(f'{BASE_SF}/Task.json')
events = dl.get_json_lines_as_df(f'{BASE_SF}/Event.json')
accounts = dl.get_json_lines_as_df(f'{BASE_SF}/Account.json')

In [6]:
opportunities.columns.tolist()
# 'Became_a_marketing_qualified_lead_date__c'
# 'ActivityDate'
# 'CreatedDate'

['Id',
 'IsDeleted',
 'AccountId',
 'RecordTypeId',
 'Name',
 'Description',
 'StageName',
 'Amount',
 'Probability',
 'CloseDate',
 'Type',
 'NextStep',
 'LeadSource',
 'IsClosed',
 'IsWon',
 'ForecastCategory',
 'ForecastCategoryName',
 'CurrencyIsoCode',
 'CampaignId',
 'HasOpportunityLineItem',
 'Pricebook2Id',
 'OwnerId',
 'Territory2Id',
 'IsExcludedFromTerritory2Filter',
 'CreatedDate',
 'CreatedById',
 'LastModifiedDate',
 'LastModifiedById',
 'SystemModstamp',
 'LastActivityDate',
 'PushCount',
 'LastStageChangeDate',
 'FiscalQuarter',
 'FiscalYear',
 'Fiscal',
 'ContactId',
 'LastViewedDate',
 'LastReferencedDate',
 'SyncedQuoteId',
 'ContractId',
 'HasOpenActivity',
 'HasOverdueTask',
 'LastAmountChangedHistoryId',
 'LastCloseDateChangedHistoryId',
 'Budget_Confirmed__c',
 'Discovery_Completed__c',
 'ROI_Analysis_Completed__c',
 'Contract_Length__c',
 'Loss_Reason__c',
 'Annual_Contract_Value__c',
 'MRR__c',
 'MEDDPICC_Metrics__c',
 'MEDDPICC_Economic_Buyer__c',
 'MEDDPICC_D

In [7]:
sf = Salesforce(username=SALESFORCE_USERNAME, password=SALESFORCE_PASSWORD, security_token=SALESFORCE_TOKEN, instance_url='https://gtmhub.my.salesforce.com')

In [8]:
contact_query = f'SELECT Id, Became_a_marketing_qualified_lead_date__c FROM Contact'
contact_df = pd.DataFrame()

contact_results = sf.bulk.Contact.query(contact_query, lazy_operation=True)

for list_results in contact_results:
    new_df = pd.DataFrame(list_results)
    contact_df = pd.concat([contact_df, new_df])

In [9]:
contact_df.head()

Unnamed: 0,attributes,Id,Became_a_marketing_qualified_lead_date__c
0,"{'type': 'Contact', 'url': '/services/data/v52...",0034W00002NjEeyQAF,
1,"{'type': 'Contact', 'url': '/services/data/v52...",0034W00002NjEezQAF,
2,"{'type': 'Contact', 'url': '/services/data/v52...",0034W00002NjEf0QAF,1579259000000.0
3,"{'type': 'Contact', 'url': '/services/data/v52...",0034W00002NjEf1QAF,
4,"{'type': 'Contact', 'url': '/services/data/v52...",0034W00002NjEf2QAF,1579672000000.0


In [10]:
accounts.head()

Unnamed: 0,Id,IsDeleted,MasterRecordId,Name,Type,RecordTypeId,ParentId,BillingStreet,BillingCity,BillingState,...,Channeltivity_Partner_Activation_Date__c,Competitor_renewal_date__c,Competitors__c,Other_Competitor__c,Number_of_Lost_Opportunities__c,Number_of_Open_Opportunities__c,Number_of_Opportunities__c,Number_of_Won_Opportunities__c,Ctvt_PrimaryPartnerManager__c,Partner_Certified__c
0,0014W00002EXLbPQAX,False,,Intetics,,0128a000000ccv8AAA,,"1900 E Golf Rd, Ste 950",Schaumburg,Illinois,...,,,,,0.0,0.0,0.0,0.0,,False
1,0014W00002Fh63pQAB,False,,Department of Health Abu Dhabi,,0128a000000ccv8AAA,,,Abu Dhabi,,...,,,,,0.0,0.0,0.0,0.0,,False
2,0014W00002Fh63qQAB,False,,i engineering Group,,0128a000000ccv8AAA,,,,,...,,,,,0.0,0.0,0.0,0.0,,False
3,0014W00002Fh63rQAB,False,,Creative Business Solutions,,0128a000000ccv8AAA,,"KODRA E DIELLIT, SELITE 4/31",Tirana,,...,,,,,0.0,0.0,0.0,0.0,,False
4,0014W00002Fh63sQAB,False,,Kontakt Al,,0128a000000ccv8AAA,,,Tirana,,...,,,,,0.0,0.0,0.0,0.0,,False


## Leads and Contacts

1. Remove unneeded columns.
2. Merge Leads and Contacts to "prospects" table.
3. Coalesce duplicate columns.

In [11]:
leads.head()

Unnamed: 0,Id,IsDeleted,MasterRecordId,LastName,FirstName,Salutation,MiddleName,Suffix,Name,RecordTypeId,...,Why_Unqualified__c,Use_Case_Request__c,Persona__c,ContactId__c,Customer_cf_accounts_segmentation_c__c,Lead_Age__c,OpportunityId__c,Ctvt_ChannelAccountManager__c,Insightboards__c,Audience__c
0,00Q4W00001ZYjKtUAL,False,,‚ú™ üåê Digital Marketer,JC,,,,JC ‚ú™ üåê Digital Marketer,,...,,,,,,100.0,,,,
1,00Q4W00001ZYjL8UAL,False,,–¶–≤–µ—Ç–∞–Ω–æ–≤–∞,–í–∏–ª–∏,,,,–í–∏–ª–∏ –¶–≤–µ—Ç–∞–Ω–æ–≤–∞,,...,,,,,,100.0,,,,
2,00Q4W00001ZYjLDUA1,False,,Charrier,Frederic,,,,Frederic Charrier,,...,,,,,,100.0,,,,
3,00Q4W00001ZYjLIUA1,False,,[not provided],G√ºm√º≈ü √ßizgiler,,,,G√ºm√º≈ü √ßizgiler [not provided],,...,,,,,,100.0,,,,
4,00Q4W00001ZYjLSUA1,False,,[not provided],Nasus Given Me,,,,Nasus Given Me [not provided],,...,,,,,,100.0,,,,


In [12]:
req_lead_fields = [
    'Id', 
    'ConvertedContactId', 
    'Became_a_marketing_qualified_lead_date__c', 
    'Area__c', 
    'Territory__c', 
    'State',
    'Country',
    'Industry', 
    'Title', 
    'Job_Function__c', 
    'ConvertedAccountId', 
    'Account_Name__c'
]
leads = leads.loc[:, req_lead_fields]

In [13]:
req_contact_fields = ['Id', 'Title', 'Job_Function__c', 'AccountId']
contacts = contacts.loc[:, req_contact_fields]

In [14]:
req_account_fields = [
    'Id', 
    'Industry', 
    'BillingState', 
    'BillingCountry', 
    'Account_Area__c', 
    'Account_Territory__c'
]
accounts = accounts.loc[:, req_account_fields]

In [15]:
contact_df = contact_df.drop(['attributes'], axis=1)

In [16]:
contacts = contacts.merge(
    contact_df,
    how="left",
    left_on="Id",
    right_on="Id"
)

In [17]:
contacts.columns.tolist()

['Id',
 'Title',
 'Job_Function__c',
 'AccountId',
 'Became_a_marketing_qualified_lead_date__c']

In [18]:
contacts.loc[:, 'BecameMQLDate'] = contacts.Became_a_marketing_qualified_lead_date__c.apply(lambda x: datetime.datetime.fromtimestamp(x/1000) if pd.notnull(x) else None)

In [19]:
leads.loc[:, 'BecameMQLDate'] = leads.Became_a_marketing_qualified_lead_date__c.apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S') if pd.notnull(x) else None)

In [20]:
leads = leads.drop(['Became_a_marketing_qualified_lead_date__c'], axis=1)
contacts = contacts.drop(['Became_a_marketing_qualified_lead_date__c'], axis=1)

In [21]:
prospects = contacts.merge(leads, how='outer', left_on='Id', right_on='ConvertedContactId')

In [22]:
prospects.head()

Unnamed: 0,Id_x,Title_x,Job_Function__c_x,AccountId,BecameMQLDate_x,Id_y,ConvertedContactId,Area__c,Territory__c,State,Country,Industry,Title_y,Job_Function__c_y,ConvertedAccountId,Account_Name__c,BecameMQLDate_y
0,0034W00002NjEeyQAF,,Manager,0014W00002FhDWeQAN,NaT,,,,,,,,,,,,NaT
1,0034W00002NjEezQAF,,Manager,0014W00002FhDOLQA3,NaT,,,,,,,,,,,,NaT
2,0034W00002NjEf0QAF,OD Project Director,Director,0014W00002Fh6yWQAR,2020-01-17 11:07:07,,,,,,,,,,,,NaT
3,0034W00002NjEf1QAF,Regional Head of HR,Director,0014W00002Fh6yWQAR,NaT,,,,,,,,,,,,NaT
4,0034W00002NjEf2QAF,System Integrator (System Support),Team member,0014W00002Fh6yWQAR,2020-01-22 05:44:06,,,,,,,,,,,,NaT


In [23]:
prospects.loc[:, 'id'] = prospects.Id_x.combine_first(prospects.Id_y)
prospects.loc[:, 'became_mql_date'] = prospects.BecameMQLDate_x.combine_first(prospects.BecameMQLDate_y)
prospects.loc[:, 'title'] = prospects.Title_x.combine_first(prospects.Title_y)
prospects.loc[:, 'job_function'] = prospects.Job_Function__c_x.combine_first(prospects.Job_Function__c_y)
prospects.loc[:, 'account_id'] = prospects.AccountId.combine_first(prospects.Account_Name__c)

prospects = prospects.drop(
    [
        'Id_x', 
        'Id_y', 
        'BecameMQLDate_x', 
        'BecameMQLDate_y', 
        'ConvertedContactId', 
        'Title_x', 
        'Title_y', 
        'Job_Function__c_x', 
        'Job_Function__c_y', 
        'Account_Name__c', 
        'ConvertedAccountId',
        'AccountId'
    ], 
    axis=1
)

In [24]:
prospects.head()

Unnamed: 0,Area__c,Territory__c,State,Country,Industry,id,became_mql_date,title,job_function,account_id
0,,,,,,0034W00002NjEeyQAF,NaT,,Manager,0014W00002FhDWeQAN
1,,,,,,0034W00002NjEezQAF,NaT,,Manager,0014W00002FhDOLQA3
2,,,,,,0034W00002NjEf0QAF,2020-01-17 11:07:07,OD Project Director,Director,0014W00002Fh6yWQAR
3,,,,,,0034W00002NjEf1QAF,NaT,Regional Head of HR,Director,0014W00002Fh6yWQAR
4,,,,,,0034W00002NjEf2QAF,2020-01-22 05:44:06,System Integrator (System Support),Team member,0014W00002Fh6yWQAR


In [25]:
prospects_0 = prospects.merge(
    accounts,
    how='left',
    left_on='account_id',
    right_on='Id'
)

In [26]:
prospects_0.head()

Unnamed: 0,Area__c,Territory__c,State,Country,Industry_x,id,became_mql_date,title,job_function,account_id,Id,Industry_y,BillingState,BillingCountry,Account_Area__c,Account_Territory__c
0,,,,,,0034W00002NjEeyQAF,NaT,,Manager,0014W00002FhDWeQAN,0014W00002FhDWeQAN,,,,Rest of World,Rest of World
1,,,,,,0034W00002NjEezQAF,NaT,,Manager,0014W00002FhDOLQA3,0014W00002FhDOLQA3,,,,Rest of World,Rest of World
2,,,,,,0034W00002NjEf0QAF,2020-01-17 11:07:07,OD Project Director,Director,0014W00002Fh6yWQAR,0014W00002Fh6yWQAR,Technology,Hong Kong,China,Rest of World,APAC
3,,,,,,0034W00002NjEf1QAF,NaT,Regional Head of HR,Director,0014W00002Fh6yWQAR,0014W00002Fh6yWQAR,Technology,Hong Kong,China,Rest of World,APAC
4,,,,,,0034W00002NjEf2QAF,2020-01-22 05:44:06,System Integrator (System Support),Team member,0014W00002Fh6yWQAR,0014W00002Fh6yWQAR,Technology,Hong Kong,China,Rest of World,APAC


In [27]:
prospects_0.columns

Index(['Area__c', 'Territory__c', 'State', 'Country', 'Industry_x', 'id',
       'became_mql_date', 'title', 'job_function', 'account_id', 'Id',
       'Industry_y', 'BillingState', 'BillingCountry', 'Account_Area__c',
       'Account_Territory__c'],
      dtype='object')

In [28]:
prospects_0.loc[:, "territory"] = prospects_0.Account_Territory__c.combine_first(prospects_0.Territory__c)
prospects_0.loc[:, "industry"] = prospects_0.Industry_y.combine_first(prospects_0.Industry_x)
prospects_0.loc[:, "state"] = prospects_0.BillingState.combine_first(prospects_0.State)
prospects_0.loc[:, "country"] = prospects_0.BillingCountry.combine_first(prospects_0.Country)
prospects_0.loc[:, "account_area"] = prospects_0.Account_Area__c.combine_first(prospects_0.Area__c)

prospects_0 = prospects_0.drop(
    [
        'Account_Territory__c', 
        'Territory__c', 
        'Industry_x', 
        'Industry_y', 
        'BillingState',
        'State',
        'BillingCountry',
        'Country',
        'Account_Area__c', 
        'Area__c',
        'Id'
    ], 
    axis=1
)

In [29]:
prospects_0.head()

Unnamed: 0,id,became_mql_date,title,job_function,account_id,territory,industry,state,country,account_area
0,0034W00002NjEeyQAF,NaT,,Manager,0014W00002FhDWeQAN,Rest of World,,,,Rest of World
1,0034W00002NjEezQAF,NaT,,Manager,0014W00002FhDOLQA3,Rest of World,,,,Rest of World
2,0034W00002NjEf0QAF,2020-01-17 11:07:07,OD Project Director,Director,0014W00002Fh6yWQAR,APAC,Technology,Hong Kong,China,Rest of World
3,0034W00002NjEf1QAF,NaT,Regional Head of HR,Director,0014W00002Fh6yWQAR,APAC,Technology,Hong Kong,China,Rest of World
4,0034W00002NjEf2QAF,2020-01-22 05:44:06,System Integrator (System Support),Team member,0014W00002Fh6yWQAR,APAC,Technology,Hong Kong,China,Rest of World


## Opportunities & OpportunityContactRoles

1. Remove unneeded columns.
2. Merge opportunity data on contact relationship data.

In [30]:
opportunities = opportunities.loc[:, ['Id', 'RecordTypeId', 'IsClosed', 'IsWon', 'Amount', 'CloseDate', 'Type', 'CreatedDate']]

In [31]:
oppconroles = oppconroles.loc[:, ['Id', 'OpportunityId', 'ContactId']]

In [32]:
opps = oppconroles.merge(opportunities, how='left', left_on='OpportunityId', right_on='Id')

In [33]:
opps = opps.rename(
    {
        'Id_x': 'OppConRoleId'
    }, 
    axis=1
).drop(['Id_y'], axis=1)

In [34]:
opps.loc[:, 'CloseDate'] = opps.CloseDate.apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S'))

In [35]:
opps.head()

Unnamed: 0,OppConRoleId,OpportunityId,ContactId,RecordTypeId,IsClosed,IsWon,Amount,CloseDate,Type,CreatedDate
0,00K8a00000XJm4bEAD,0068a00001GdcT8AAJ,0038a00002rQon6AAC,0124W0000007bPpQAI,False,False,100000.0,2022-12-29,New Business,2022-04-12T11:07:45
1,00K8a00000XJniXEAT,0068a00001GdeqOAAR,0038a00002rQxZzAAK,0124W0000007bPpQAI,False,False,50000.0,2022-06-30,,2022-04-13T08:29:54
2,00K8a00000XJpSvEAL,0068a00001Gdi6IAAR,0038a00002rR8K4AAK,0124W0000007bPpQAI,False,False,50000.0,2022-06-30,New Business,2022-04-14T14:58:35
3,00K8a00000XJjFgEAL,0068a00001GdX03AAF,0038a00002qcCjNAAU,0124W0000007bPpQAI,False,False,15000.0,2022-08-31,New Business,2022-04-08T14:21:06
4,00K8a00000XJjX1EAL,0068a00001GdXOyAAN,0038a00002qcD6RAAU,0124W0000007bPpQAI,False,False,50000.0,2022-08-26,New Business,2022-04-08T16:28:10


In [36]:
opps[opps["Amount"] == 10000]

Unnamed: 0,OppConRoleId,OpportunityId,ContactId,RecordTypeId,IsClosed,IsWon,Amount,CloseDate,Type,CreatedDate
13,00K8a00000XJj5qEAD,0068a00001GdWNVAA3,0038a00002qcABSAA2,0124W0000007bPpQAI,False,False,10000.0,2022-06-30,,2022-04-08T01:44:32
22,00K8a00000XJkSCEA1,0068a00001GdZopAAF,0038a00002qcJGLAA2,0124W0000007bPpQAI,False,False,10000.0,2022-06-30,,2022-04-10T23:51:17
143,00K4W00000LgYHvUAN,0064W0000181XBrQAM,0034W00002NjM5xQAF,0124W0000007bPpQAI,True,True,10000.0,2021-10-28,New Business,2021-10-21T15:20:03
147,00K4W00000LglxJUAR,0064W0000181lUYQAY,0034W00002kOUEkQAO,0124W0000007bPpQAI,True,False,10000.0,2021-10-26,New Business,2021-10-25T14:56:23
240,00K4W00000LhbFEUAZ,0064W00001831G9QAI,0034W00002nVq5GQAS,0124W0000007bPpQAI,True,False,10000.0,2021-11-12,New Business,2021-11-10T20:54:53
303,00K4W00000JAdwtUAD,0064W00000ypGFfQAM,0034W00002cFNJ1QAO,0124W0000007bPpQAI,True,False,10000.0,2021-09-23,New Business,2021-03-15T18:38:05
385,00K4W00000HLYf1UAH,0064W00000yqSSPQA2,0034W00002WZggLQAT,0124W0000007bPpQAI,True,False,10000.0,2021-12-16,New Business,2021-03-30T14:39:31
547,00K4W00000HMmUZUA1,0064W000011xjucQAA,0034W00002a5aulQAA,0124W0000007bPpQAI,True,False,10000.0,2021-10-15,New Business,2021-05-19T20:00:43
624,00K4W00000HNDctUAH,0064W000011yR2YQAU,0034W00002a77wIQAQ,0124W0000007bPpQAI,True,False,10000.0,2021-06-10,New Business,2021-05-27T18:07:59
632,00K4W00000LfiJPUAZ,0064W000011ydsfQAA,0034W00002iuZnXQAU,0124W0000007bPpQAI,True,False,10000.0,2021-10-05,New Business,2021-06-01T13:04:19


## Get supplemental prospect data

1. Remove excess tasks.
2. Update opps with OpportunityType.
3. Create summary dataframes.
4. Merge sumary dataframes with prospects.
5. Clean up fields.
6. Output to CSV.

In [37]:
tasks = tasks[
    (tasks.RecordTypeId == '0124W000001lKRjQAM') & # Only sales tasks record type
    (tasks.Status == 'Completed') &
    (tasks.Type.isin(['Call', 'Email', 'Other', 'Meeting']))
]

In [38]:
def opp_record_type(row):
    if row['RecordTypeId'] == '0124W0000007bPpQAI':
        return 'New Business'
    elif row['RecordTypeId'] == '0124W0000007bPuQAI':
        return 'Renewal'
    else:
        return 'Partner'

def opp_type(row):
    if row['RecordType'] == 'Renewal' or row['RecordType'] == 'Partner':
        return row['RecordType']
    if row['Type'] == 'Existing Business':
        return 'Upsell'
    else:
        return 'New Business'

In [39]:
opps.loc[:, 'RecordType'] = opps.apply(opp_record_type, axis=1)
opps.loc[:, 'OpportunityType'] = opps.apply(opp_type, axis=1)

In [40]:
new_business_ops = opps[opps["OpportunityType"] == "New Business"]

In [41]:
new_business_ops.head()

Unnamed: 0,OppConRoleId,OpportunityId,ContactId,RecordTypeId,IsClosed,IsWon,Amount,CloseDate,Type,CreatedDate,RecordType,OpportunityType
0,00K8a00000XJm4bEAD,0068a00001GdcT8AAJ,0038a00002rQon6AAC,0124W0000007bPpQAI,False,False,100000.0,2022-12-29,New Business,2022-04-12T11:07:45,New Business,New Business
1,00K8a00000XJniXEAT,0068a00001GdeqOAAR,0038a00002rQxZzAAK,0124W0000007bPpQAI,False,False,50000.0,2022-06-30,,2022-04-13T08:29:54,New Business,New Business
2,00K8a00000XJpSvEAL,0068a00001Gdi6IAAR,0038a00002rR8K4AAK,0124W0000007bPpQAI,False,False,50000.0,2022-06-30,New Business,2022-04-14T14:58:35,New Business,New Business
3,00K8a00000XJjFgEAL,0068a00001GdX03AAF,0038a00002qcCjNAAU,0124W0000007bPpQAI,False,False,15000.0,2022-08-31,New Business,2022-04-08T14:21:06,New Business,New Business
4,00K8a00000XJjX1EAL,0068a00001GdXOyAAN,0038a00002qcD6RAAU,0124W0000007bPpQAI,False,False,50000.0,2022-08-26,New Business,2022-04-08T16:28:10,New Business,New Business


In [42]:
new_business_ops.loc[:, "IsWon"] = new_business_ops["IsWon"].map(lambda x: 1 if x else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_business_ops.loc[:, "IsWon"] = new_business_ops["IsWon"].map(lambda x: 1 if x else 0)


In [43]:
opps.head()

Unnamed: 0,OppConRoleId,OpportunityId,ContactId,RecordTypeId,IsClosed,IsWon,Amount,CloseDate,Type,CreatedDate,RecordType,OpportunityType
0,00K8a00000XJm4bEAD,0068a00001GdcT8AAJ,0038a00002rQon6AAC,0124W0000007bPpQAI,False,False,100000.0,2022-12-29,New Business,2022-04-12T11:07:45,New Business,New Business
1,00K8a00000XJniXEAT,0068a00001GdeqOAAR,0038a00002rQxZzAAK,0124W0000007bPpQAI,False,False,50000.0,2022-06-30,,2022-04-13T08:29:54,New Business,New Business
2,00K8a00000XJpSvEAL,0068a00001Gdi6IAAR,0038a00002rR8K4AAK,0124W0000007bPpQAI,False,False,50000.0,2022-06-30,New Business,2022-04-14T14:58:35,New Business,New Business
3,00K8a00000XJjFgEAL,0068a00001GdX03AAF,0038a00002qcCjNAAU,0124W0000007bPpQAI,False,False,15000.0,2022-08-31,New Business,2022-04-08T14:21:06,New Business,New Business
4,00K8a00000XJjX1EAL,0068a00001GdXOyAAN,0038a00002qcD6RAAU,0124W0000007bPpQAI,False,False,50000.0,2022-08-26,New Business,2022-04-08T16:28:10,New Business,New Business


In [44]:
opps.shape

(1588, 12)

In [45]:
len(opps.ContactId.unique())

1196

In [46]:
opps_to_merge = opps.groupby(
    [
        'ContactId', 'OpportunityType'
    ]
).size().unstack(fill_value=0).reset_index().rename(
    {
        'New Business': 'new_business_count', 
        'Upsell': 'upsell_count', 
        'Renewal': 'renewal_count', 
        'Partner': 'partner_count'
    }, 
    axis=1
)

In [47]:
opps_to_merge.head()

OpportunityType,ContactId,new_business_count,partner_count,renewal_count,upsell_count
0,0034W00002NjEeyQAF,1,0,0,0
1,0034W00002NjEf1QAF,0,0,1,0
2,0034W00002NjEfiQAF,0,0,1,0
3,0034W00002NjEgGQAV,0,0,2,0
4,0034W00002NjEgIQAV,0,0,1,0


In [48]:
opps_to_merge.shape

(1196, 5)

In [49]:
len(opps_to_merge["ContactId"].unique())

1196

In [50]:
contact_amount = opps.groupby(["ContactId"]).agg({"Amount": "sum"}).reset_index()
contact_amount.head()

Unnamed: 0,ContactId,Amount
0,0034W00002NjEeyQAF,0.0
1,0034W00002NjEf1QAF,11520.0
2,0034W00002NjEfiQAF,42000.0
3,0034W00002NjEgGQAV,13000.0
4,0034W00002NjEgIQAV,7200.0


In [51]:
new_business_ops["amount_won"] = new_business_ops.apply(lambda x: x["Amount"] if x['IsWon'] == 1 else 0, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_business_ops["amount_won"] = new_business_ops.apply(lambda x: x["Amount"] if x['IsWon'] == 1 else 0, axis=1)


In [52]:
new_business_ops.head()

Unnamed: 0,OppConRoleId,OpportunityId,ContactId,RecordTypeId,IsClosed,IsWon,Amount,CloseDate,Type,CreatedDate,RecordType,OpportunityType,amount_won
0,00K8a00000XJm4bEAD,0068a00001GdcT8AAJ,0038a00002rQon6AAC,0124W0000007bPpQAI,False,0,100000.0,2022-12-29,New Business,2022-04-12T11:07:45,New Business,New Business,0.0
1,00K8a00000XJniXEAT,0068a00001GdeqOAAR,0038a00002rQxZzAAK,0124W0000007bPpQAI,False,0,50000.0,2022-06-30,,2022-04-13T08:29:54,New Business,New Business,0.0
2,00K8a00000XJpSvEAL,0068a00001Gdi6IAAR,0038a00002rR8K4AAK,0124W0000007bPpQAI,False,0,50000.0,2022-06-30,New Business,2022-04-14T14:58:35,New Business,New Business,0.0
3,00K8a00000XJjFgEAL,0068a00001GdX03AAF,0038a00002qcCjNAAU,0124W0000007bPpQAI,False,0,15000.0,2022-08-31,New Business,2022-04-08T14:21:06,New Business,New Business,0.0
4,00K8a00000XJjX1EAL,0068a00001GdXOyAAN,0038a00002qcD6RAAU,0124W0000007bPpQAI,False,0,50000.0,2022-08-26,New Business,2022-04-08T16:28:10,New Business,New Business,0.0


In [53]:
new_business_amount = new_business_ops.groupby(["ContactId"]).agg({"Amount": "sum", "OpportunityType": "count", "IsWon": "sum", "amount_won": "sum"}).reset_index()
new_business_amount.head()

Unnamed: 0,ContactId,Amount,OpportunityType,IsWon,amount_won
0,0034W00002NjEeyQAF,0.0,1,0,0.0
1,0034W00002NjEgeQAF,576000.0,1,0,0.0
2,0034W00002NjEh2QAF,30000.0,1,1,30000.0
3,0034W00002NjEhCQAV,33000.0,1,1,33000.0
4,0034W00002NjEhIQAV,131000.0,1,1,131000.0


In [54]:
new_business_amount[new_business_amount["OpportunityType"] > 1]

Unnamed: 0,ContactId,Amount,OpportunityType,IsWon,amount_won
8,0034W00002NjErPQAV,48.0,2,2,48.0
19,0034W00002NjF1yQAF,2099063.82,2,0,0.0
21,0034W00002NjF2SQAV,79000.0,2,0,0.0
62,0034W00002NjFgHQAV,123600.0,2,0,0.0
63,0034W00002NjFgKQAV,375000.0,2,1,25000.0
99,0034W00002NjIhcQAF,330000.0,2,1,270000.0
115,0034W00002NjKyOQAV,150720.0,2,1,150000.0
130,0034W00002NjNwdQAF,149991.0,2,1,119991.0
174,0034W00002ToOLtQAN,25000.0,2,1,25000.0
176,0034W00002Tp7NRQAZ,197000.0,3,1,47000.0


In [55]:
new_business_ops

Unnamed: 0,OppConRoleId,OpportunityId,ContactId,RecordTypeId,IsClosed,IsWon,Amount,CloseDate,Type,CreatedDate,RecordType,OpportunityType,amount_won
0,00K8a00000XJm4bEAD,0068a00001GdcT8AAJ,0038a00002rQon6AAC,0124W0000007bPpQAI,False,0,100000.0,2022-12-29,New Business,2022-04-12T11:07:45,New Business,New Business,0.0
1,00K8a00000XJniXEAT,0068a00001GdeqOAAR,0038a00002rQxZzAAK,0124W0000007bPpQAI,False,0,50000.0,2022-06-30,,2022-04-13T08:29:54,New Business,New Business,0.0
2,00K8a00000XJpSvEAL,0068a00001Gdi6IAAR,0038a00002rR8K4AAK,0124W0000007bPpQAI,False,0,50000.0,2022-06-30,New Business,2022-04-14T14:58:35,New Business,New Business,0.0
3,00K8a00000XJjFgEAL,0068a00001GdX03AAF,0038a00002qcCjNAAU,0124W0000007bPpQAI,False,0,15000.0,2022-08-31,New Business,2022-04-08T14:21:06,New Business,New Business,0.0
4,00K8a00000XJjX1EAL,0068a00001GdXOyAAN,0038a00002qcD6RAAU,0124W0000007bPpQAI,False,0,50000.0,2022-08-26,New Business,2022-04-08T16:28:10,New Business,New Business,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1571,00K4W00000MxyLpUAJ,0064W000016HBz0QAG,0034W00002qqkeZQAQ,0124W0000007bPpQAI,False,0,75000.0,2022-05-31,New Business,2021-08-27T18:27:14,New Business,New Business,0.0
1572,00K4W00000MxyLoUAJ,0064W000016HBz0QAG,0034W00002qqkWsQAI,0124W0000007bPpQAI,False,0,75000.0,2022-05-31,New Business,2021-08-27T18:27:14,New Business,New Business,0.0
1573,00K4W00000MxyLnUAJ,0064W000016HBz0QAG,0034W00002qqkTJQAY,0124W0000007bPpQAI,False,0,75000.0,2022-05-31,New Business,2021-08-27T18:27:14,New Business,New Business,0.0
1574,00K4W00000JrrL7UAJ,0064W000016HBz0QAG,0034W00002fZv7sQAC,0124W0000007bPpQAI,False,0,75000.0,2022-05-31,New Business,2021-08-27T18:27:14,New Business,New Business,0.0


In [56]:
new_business_ops["AmountWon"] = new_business_ops.apply(lambda x: x["Amount"] if x['IsWon'] == 1 else 0, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_business_ops["AmountWon"] = new_business_ops.apply(lambda x: x["Amount"] if x['IsWon'] == 1 else 0, axis=1)


In [57]:
contact_amount

Unnamed: 0,ContactId,Amount
0,0034W00002NjEeyQAF,0.0
1,0034W00002NjEf1QAF,11520.0
2,0034W00002NjEfiQAF,42000.0
3,0034W00002NjEgGQAV,13000.0
4,0034W00002NjEgIQAV,7200.0
...,...,...
1191,0038a00002rQyGtAAK,50000.0
1192,0038a00002rR3rBAAS,50000.0
1193,0038a00002rR7tXAAS,72000.0
1194,0038a00002rR7uQAAS,35000.0


In [58]:
opps_to_merge

OpportunityType,ContactId,new_business_count,partner_count,renewal_count,upsell_count
0,0034W00002NjEeyQAF,1,0,0,0
1,0034W00002NjEf1QAF,0,0,1,0
2,0034W00002NjEfiQAF,0,0,1,0
3,0034W00002NjEgGQAV,0,0,2,0
4,0034W00002NjEgIQAV,0,0,1,0
...,...,...,...,...,...
1191,0038a00002rQyGtAAK,1,0,0,0
1192,0038a00002rR3rBAAS,1,0,0,0
1193,0038a00002rR7tXAAS,1,0,0,0
1194,0038a00002rR7uQAAS,1,0,0,0


In [59]:
tasks_to_merge = tasks.groupby(['WhoId', 'Type']).size().unstack(fill_value=0).reset_index().rename({'Call': 'call_count', 'Email': 'email_count', 'Other': 'other_count'}, axis=1)

In [60]:
tasks_to_merge

Type,WhoId,call_count,email_count,Meeting,other_count
0,0034W00002NjEfPQAV,0,1,0,0
1,0034W00002NjEfiQAF,0,2,0,0
2,0034W00002NjEg6QAF,0,13,0,0
3,0034W00002NjEgGQAV,0,2,0,0
4,0034W00002NjEgRQAV,0,25,0,0
...,...,...,...,...,...
81297,00Q8a00001q1ESDEA2,0,1,0,0
81298,00Q8a00001q1ETWEA2,0,1,0,0
81299,00Q8a00001q1ETXEA2,0,1,0,0
81300,00Q8a00001q1EUbEAM,0,2,0,0


In [61]:
events_to_merge = events.groupby(['WhoId']).size().reset_index(name='event_count')

In [63]:
prospects_0.columns

Index(['id', 'became_mql_date', 'title', 'job_function', 'account_id',
       'territory', 'industry', 'state', 'country', 'account_area'],
      dtype='object')

In [64]:
prospects2 = prospects_0.merge(new_business_amount, how="left", left_on="id", right_on="ContactId").drop(['ContactId'], axis=1)

In [65]:
# prospects2 = prospects.merge(opps_to_merge, how='left', left_on='Id', right_on='ContactId').drop(['ContactId'], axis=1)

In [67]:
prospects3 = prospects2.merge(tasks_to_merge, how='left', left_on='id', right_on='WhoId').drop(['WhoId'], axis=1)

In [69]:
prospects4 = prospects3.merge(events_to_merge, how='left', left_on='id', right_on='WhoId').drop(['WhoId'], axis=1)

In [70]:
prospects4.loc[:, "MeetingCount"] = prospects4.Meeting + prospects4.event_count

In [71]:
prospects4 = prospects4.drop(["Meeting", "event_count"], axis=1)

In [72]:
prospects4.columns

Index(['id', 'became_mql_date', 'title', 'job_function', 'account_id',
       'territory', 'industry', 'state', 'country', 'account_area', 'Amount',
       'OpportunityType', 'IsWon', 'amount_won', 'call_count', 'email_count',
       'other_count', 'MeetingCount'],
      dtype='object')

In [73]:
prospects4 = prospects4.rename(columns={
    "OpportunityType": "new_business_count",
    "account_area": "region",
    "territory": "sub_region",
    "Amount": "new_business_opportunity_amount_sum",
    "amount_won": "new_business_won_amount_sum",
    "IsWon": "is_new_business_won_count"
})

In [74]:
prospects4.columns

Index(['id', 'became_mql_date', 'title', 'job_function', 'account_id',
       'sub_region', 'industry', 'state', 'country', 'region',
       'new_business_opportunity_amount_sum', 'new_business_count',
       'is_new_business_won_count', 'new_business_won_amount_sum',
       'call_count', 'email_count', 'other_count', 'MeetingCount'],
      dtype='object')

In [75]:
prospects4[~prospects4.MeetingCount.isna()].head()

Unnamed: 0,id,became_mql_date,title,job_function,account_id,sub_region,industry,state,country,region,new_business_opportunity_amount_sum,new_business_count,is_new_business_won_count,new_business_won_amount_sum,call_count,email_count,other_count,MeetingCount
70,0034W00002NjEg6QAF,NaT,Service Lifecycle Manager,Manager,0014W00002FhBkbQAF,NA South,Electronics,Georgia,United States,,,,,,0.0,13.0,0.0,2.0
91,0034W00002NjEgRQAV,2020-02-14 18:31:14,,HR manager,0014W00002Fh81yQAB,NA Northeast,Consulting,New Jersey,United States,,,,,,0.0,25.0,0.0,13.0
99,0034W00002NjEgZQAV,2020-01-24 23:27:01,CFO,C-level,0014W00002Fh8SDQAZ,LatAm,Business Services,,Mexico,Rest of World,,,,,0.0,10.0,0.0,1.0
196,0034W00002NjEkuQAF,2020-01-03 08:14:00,Chief Information Officer,C-level,0014W00002FhBJfQAN,NA South,Not For Profit,Maryland,United States,,,,,,0.0,10.0,0.0,1.0
204,0034W00002NjEl2QAF,NaT,Controlling,,0014W00002Fh7a2QAB,Central Europe,Electronics,,Germany,EMEA,,,,,0.0,59.0,0.0,16.0


In [80]:
# Clean up
prospects4.new_business_count = prospects4.new_business_count.fillna(0)
prospects4.new_business_opportunity_amount_sum = prospects4.new_business_opportunity_amount_sum.fillna(0)
prospects4.new_business_won_amount_sum = prospects4.new_business_won_amount_sum.fillna(0)
prospects4.is_new_business_won_count = prospects4.is_new_business_won_count.fillna(0)
# prospects4.renewal_count = prospects4.renewal_count.fillna(0)
# prospects4.upsell_count = prospects4.upsell_count.fillna(0)
# prospects4.partner_count = prospects4.partner_count.fillna(0)
prospects4.call_count = prospects4.call_count.fillna(0)
prospects4.email_count = prospects4.email_count.fillna(0)
prospects4.other_count = prospects4.other_count.fillna(0)
prospects4.MeetingCount = prospects4.MeetingCount.fillna(0)
# prospects4.event_count = prospects4.event_count.fillna(0)
# prospects4.Amount = prospects4.Amount.fillna(0)
prospects4.region = prospects4.region.fillna("")
prospects4.sub_region = prospects4.sub_region.fillna("")

In [81]:
# Change NA -> North America in region and sub_region
prospects4.loc[:, "region"] = prospects4.region.map(lambda x: x.replace("NA", "North America") if x.startswith("NA") else x)
prospects4.loc[:, "sub_region"] = prospects4.region.map(lambda x: x.replace("NA", "North America") if x.startswith("NA") else x)

In [82]:
prospects4.head()

Unnamed: 0,id,became_mql_date,title,job_function,account_id,sub_region,industry,state,country,region,new_business_opportunity_amount_sum,new_business_count,is_new_business_won_count,new_business_won_amount_sum,call_count,email_count,other_count,MeetingCount
0,0034W00002NjEeyQAF,NaT,,Manager,0014W00002FhDWeQAN,Rest of World,,,,Rest of World,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0034W00002NjEezQAF,NaT,,Manager,0014W00002FhDOLQA3,Rest of World,,,,Rest of World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0034W00002NjEf0QAF,2020-01-17 11:07:07,OD Project Director,Director,0014W00002Fh6yWQAR,Rest of World,Technology,Hong Kong,China,Rest of World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0034W00002NjEf1QAF,NaT,Regional Head of HR,Director,0014W00002Fh6yWQAR,Rest of World,Technology,Hong Kong,China,Rest of World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0034W00002NjEf2QAF,2020-01-22 05:44:06,System Integrator (System Support),Team member,0014W00002Fh6yWQAR,Rest of World,Technology,Hong Kong,China,Rest of World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
prospects4.to_csv(os.path.join(os.pardir, 'top_of_funnel', 'data', 'salesforce_prospects_data_for_ads_v4.csv'), index=False)