# Get Salesforce Data to Join with Marketing Data

## Set up and load data

In [1]:
from toolbox.datalake import DataLake
from simple_salesforce import Salesforce
from dotenv import load_dotenv
import pandas as pd
import datetime
import json
import os

In [2]:
load_dotenv()
CONNECT_STR = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
SALESFORCE_USERNAME = os.getenv('SALESFORCE_USERNAME')
SALESFORCE_PASSWORD = os.getenv('SALESFORCE_PASSWORD')
SALESFORCE_TOKEN = os.getenv('SALESFORCE_TOKEN')

In [3]:
dl = DataLake(CONNECT_STR)

In [4]:
BASE_SF = 'Unprocessed/Salesforce/2022/04/14'

In [None]:
# leads = dl.get_json_lines_as_df(f'{BASE_SF}/Lead.json')
# contacts = dl.get_json_lines_as_df(f'{BASE_SF}/Contact.json')
# oppconroles = dl.get_json_lines_as_df(f'{BASE_SF}/OpportunityContactRole.json')
# opportunities = dl.get_json_lines_as_df(f'{BASE_SF}/Opportunity.json')
# tasks = dl.get_json_lines_as_df(f'{BASE_SF}/Task.json')
# events = dl.get_json_lines_as_df(f'{BASE_SF}/Event.json')
# accounts = dl.get_json_lines_as_df(f'{BASE_SF}/Account.json')

In [5]:
leads = dl.get_json_lines_as_df(f'{BASE_SF}/Lead.json')
leads.columns.tolist()

['Id',
 'IsDeleted',
 'MasterRecordId',
 'LastName',
 'FirstName',
 'Salutation',
 'MiddleName',
 'Suffix',
 'Name',
 'RecordTypeId',
 'Title',
 'Company',
 'Street',
 'City',
 'State',
 'PostalCode',
 'Country',
 'StateCode',
 'CountryCode',
 'Latitude',
 'Longitude',
 'GeocodeAccuracy',
 'Phone',
 'MobilePhone',
 'Email',
 'Website',
 'PhotoUrl',
 'Description',
 'LeadSource',
 'Status',
 'Industry',
 'Rating',
 'CurrencyIsoCode',
 'NumberOfEmployees',
 'OwnerId',
 'HasOptedOutOfEmail',
 'IsConverted',
 'ConvertedDate',
 'ConvertedAccountId',
 'ConvertedContactId',
 'ConvertedOpportunityId',
 'IsUnreadByOwner',
 'CreatedDate',
 'CreatedById',
 'LastModifiedDate',
 'LastModifiedById',
 'SystemModstamp',
 'LastActivityDate',
 'LastViewedDate',
 'LastReferencedDate',
 'Jigsaw',
 'JigsawContactId',
 'EmailBouncedReason',
 'EmailBouncedDate',
 'IndividualId',
 'Agreed_to_provide_Case_Study__c',
 'Buying_role__c',
 'X18DigitLeadId__c',
 'Ctvt_PartnerCompanyName__c',
 'Ctvt_PartnerOwnerUser

In [6]:
contacts = dl.get_json_lines_as_df(f'{BASE_SF}/Contact.json')
contacts.columns.tolist()

['Id',
 'IsDeleted',
 'MasterRecordId',
 'AccountId',
 'LastName',
 'FirstName',
 'Salutation',
 'MiddleName',
 'Suffix',
 'Name',
 'RecordTypeId',
 'MailingStreet',
 'MailingCity',
 'MailingState',
 'MailingPostalCode',
 'MailingCountry',
 'MailingStateCode',
 'MailingCountryCode',
 'MailingLatitude',
 'MailingLongitude',
 'MailingGeocodeAccuracy',
 'Phone',
 'Fax',
 'MobilePhone',
 'ReportsToId',
 'Email',
 'Title',
 'Department',
 'CurrencyIsoCode',
 'OwnerId',
 'CreatedDate',
 'CreatedById',
 'LastModifiedDate',
 'LastModifiedById',
 'SystemModstamp',
 'LastActivityDate',
 'LastCURequestDate',
 'LastCUUpdateDate',
 'LastViewedDate',
 'LastReferencedDate',
 'EmailBouncedReason',
 'EmailBouncedDate',
 'IsEmailBounced',
 'PhotoUrl',
 'Jigsaw',
 'JigsawContactId',
 'IndividualId',
 'Job_Function__c',
 'Super_fan__c',
 'Lifecycle_stage__c',
 'Lead_substage__c',
 'Follow_up_date__c',
 'Competitors__c',
 'Right_budget__c',
 'Decision_maker__c',
 'Timeline_to_starting_using_Gtmhub__c',
 'R

In [7]:
oppconroles = dl.get_json_lines_as_df(f'{BASE_SF}/OpportunityContactRole.json')
oppconroles.columns.tolist()

['Id',
 'OpportunityId',
 'ContactId',
 'Role',
 'IsPrimary',
 'CreatedDate',
 'CreatedById',
 'LastModifiedDate',
 'LastModifiedById',
 'SystemModstamp',
 'IsDeleted',
 'CurrencyIsoCode']

In [8]:
opportunities = dl.get_json_lines_as_df(f'{BASE_SF}/Opportunity.json')
opportunities.columns.tolist()

['Id',
 'IsDeleted',
 'AccountId',
 'RecordTypeId',
 'Name',
 'Description',
 'StageName',
 'Amount',
 'Probability',
 'CloseDate',
 'Type',
 'NextStep',
 'LeadSource',
 'IsClosed',
 'IsWon',
 'ForecastCategory',
 'ForecastCategoryName',
 'CurrencyIsoCode',
 'CampaignId',
 'HasOpportunityLineItem',
 'Pricebook2Id',
 'OwnerId',
 'Territory2Id',
 'IsExcludedFromTerritory2Filter',
 'CreatedDate',
 'CreatedById',
 'LastModifiedDate',
 'LastModifiedById',
 'SystemModstamp',
 'LastActivityDate',
 'PushCount',
 'LastStageChangeDate',
 'FiscalQuarter',
 'FiscalYear',
 'Fiscal',
 'ContactId',
 'LastViewedDate',
 'LastReferencedDate',
 'SyncedQuoteId',
 'ContractId',
 'HasOpenActivity',
 'HasOverdueTask',
 'LastAmountChangedHistoryId',
 'LastCloseDateChangedHistoryId',
 'Budget_Confirmed__c',
 'Discovery_Completed__c',
 'ROI_Analysis_Completed__c',
 'Contract_Length__c',
 'Loss_Reason__c',
 'Annual_Contract_Value__c',
 'MRR__c',
 'MEDDPICC_Metrics__c',
 'MEDDPICC_Economic_Buyer__c',
 'MEDDPICC_D

In [9]:
tasks = dl.get_json_lines_as_df(f'{BASE_SF}/Task.json')
tasks.columns.tolist()

['Id',
 'RecordTypeId',
 'WhoId',
 'WhoCount',
 'WhatCount',
 'Subject',
 'ActivityDate',
 'Status',
 'Priority',
 'IsHighPriority',
 'OwnerId',
 'Description',
 'CurrencyIsoCode',
 'Type',
 'IsDeleted',
 'AccountId',
 'IsClosed',
 'CreatedDate',
 'CreatedById',
 'LastModifiedDate',
 'LastModifiedById',
 'IsArchived',
 'CallDurationInSeconds',
 'CallType',
 'CallDisposition',
 'CallObject',
 'ReminderDateTime',
 'IsReminderSet',
 'RecurrenceActivityId',
 'IsRecurrence',
 'RecurrenceStartDateOnly',
 'RecurrenceEndDateOnly',
 'RecurrenceTimeZoneSidKey',
 'RecurrenceType',
 'RecurrenceInterval',
 'RecurrenceDayOfWeekMask',
 'RecurrenceDayOfMonth',
 'RecurrenceInstance',
 'RecurrenceMonthOfYear',
 'RecurrenceRegeneratedType',
 'TaskSubtype',
 'CompletedDateTime',
 'affectlayer__AffectLayer_Call_Id__c',
 'affectlayer__Chorus_Call_Id__c',
 'Manager__c',
 'Meeting_Disposition__c',
 'Meeting_set_by__c',
 'MeetingSetByIsCurrentUser__c',
 'Associated_Account__c',
 'Associated_Case__c',
 'Associa

In [10]:
events = dl.get_json_lines_as_df(f'{BASE_SF}/Event.json')
events.columns.tolist()

['Id',
 'WhoId',
 'WhatId',
 'WhoCount',
 'WhatCount',
 'Subject',
 'Location',
 'IsAllDayEvent',
 'ActivityDateTime',
 'ActivityDate',
 'DurationInMinutes',
 'StartDateTime',
 'EndDateTime',
 'EndDate',
 'Description',
 'AccountId',
 'OwnerId',
 'CurrencyIsoCode',
 'Type',
 'IsPrivate',
 'ShowAs',
 'IsDeleted',
 'IsChild',
 'IsGroupEvent',
 'GroupEventType',
 'CreatedDate',
 'CreatedById',
 'LastModifiedDate',
 'LastModifiedById',
 'SystemModstamp',
 'IsArchived',
 'RecurrenceActivityId',
 'IsRecurrence',
 'RecurrenceStartDateTime',
 'RecurrenceEndDateOnly',
 'RecurrenceTimeZoneSidKey',
 'RecurrenceType',
 'RecurrenceInterval',
 'RecurrenceDayOfWeekMask',
 'RecurrenceDayOfMonth',
 'RecurrenceInstance',
 'RecurrenceMonthOfYear',
 'ReminderDateTime',
 'IsReminderSet',
 'EventSubtype',
 'IsRecurrence2Exclusion',
 'Recurrence2PatternText',
 'Recurrence2PatternVersion',
 'IsRecurrence2',
 'IsRecurrence2Exception',
 'Recurrence2PatternStartDate',
 'Recurrence2PatternTimeZone',
 'affectlayer

In [11]:
accounts = dl.get_json_lines_as_df(f'{BASE_SF}/Account.json')
accounts.columns.tolist()

['Id',
 'IsDeleted',
 'MasterRecordId',
 'Name',
 'Type',
 'RecordTypeId',
 'ParentId',
 'BillingStreet',
 'BillingCity',
 'BillingState',
 'BillingPostalCode',
 'BillingCountry',
 'BillingStateCode',
 'BillingCountryCode',
 'BillingLatitude',
 'BillingLongitude',
 'BillingGeocodeAccuracy',
 'ShippingStreet',
 'ShippingCity',
 'ShippingState',
 'ShippingPostalCode',
 'ShippingCountry',
 'ShippingStateCode',
 'ShippingCountryCode',
 'ShippingLatitude',
 'ShippingLongitude',
 'ShippingGeocodeAccuracy',
 'Phone',
 'Website',
 'PhotoUrl',
 'Industry',
 'NumberOfEmployees',
 'Description',
 'CurrencyIsoCode',
 'OwnerId',
 'CreatedDate',
 'CreatedById',
 'LastModifiedDate',
 'LastModifiedById',
 'SystemModstamp',
 'LastActivityDate',
 'LastViewedDate',
 'LastReferencedDate',
 'Jigsaw',
 'JigsawCompanyId',
 'AccountSource',
 'SicDesc',
 'Account_Territory__c',
 'Customer_Success_Approved__c',
 'Chargebee__c',
 'SF_Account_ID__c',
 'Gtmhub__c',
 'Segment__c',
 'Number__c',
 'Renewal_Date__c',


In [None]:
# 'Became_a_marketing_qualified_lead_date__c'
# 'ActivityDate'
# 'CreatedDate'

In [12]:
sf = Salesforce(username=SALESFORCE_USERNAME, password=SALESFORCE_PASSWORD, security_token=SALESFORCE_TOKEN, instance_url='https://gtmhub.my.salesforce.com')

In [13]:
contact_query = f'SELECT Id, Became_a_marketing_qualified_lead_date__c FROM Contact'
contact_df = pd.DataFrame()

contact_results = sf.bulk.Contact.query(contact_query, lazy_operation=True)

for list_results in contact_results:
    new_df = pd.DataFrame(list_results)
    contact_df = pd.concat([contact_df, new_df])

In [None]:
contact_df.head()

## Leads and Contacts

1. Remove unneeded columns.
2. Merge Leads and Contacts to "prospects" table.
3. Coalesce duplicate columns.

In [14]:
req_lead_fields = [
    'Id', 
    'ConvertedContactId', 
    'Became_a_marketing_qualified_lead_date__c', 
    'Area__c', 
    'Territory__c', 
    'State',
    'Country',
    'Industry', 
    'Title', 
    'Job_Function__c', 
    'ConvertedAccountId', 
    'Account_Name__c'
]
leads = leads.loc[:, req_lead_fields]

In [15]:
req_contact_fields = ['Id', 'Title', 'Job_Function__c', 'AccountId']
contacts = contacts.loc[:, req_contact_fields]

In [16]:
req_account_fields = [
    'Id', 
    'Industry', 
    'BillingState', 
    'BillingCountry', 
    'Account_Area__c', 
    'Account_Territory__c'
]
accounts = accounts.loc[:, req_account_fields]

In [17]:
contact_df = contact_df.drop(['attributes'], axis=1)

In [18]:
contacts = contacts.merge(
    contact_df,
    how="left",
    left_on="Id",
    right_on="Id"
)

In [19]:
contacts.columns.tolist()

['Id',
 'Title',
 'Job_Function__c',
 'AccountId',
 'Became_a_marketing_qualified_lead_date__c']

In [20]:
contacts.loc[:, 'BecameMQLDate'] = contacts.Became_a_marketing_qualified_lead_date__c.apply(lambda x: datetime.datetime.fromtimestamp(x/1000) if pd.notnull(x) else None)

In [21]:
leads.loc[:, 'BecameMQLDate'] = leads.Became_a_marketing_qualified_lead_date__c.apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S') if pd.notnull(x) else None)

In [22]:
leads = leads.drop(['Became_a_marketing_qualified_lead_date__c'], axis=1)
contacts = contacts.drop(['Became_a_marketing_qualified_lead_date__c'], axis=1)

In [23]:
prospects = contacts.merge(leads, how='outer', left_on='Id', right_on='ConvertedContactId')

In [24]:
prospects.head()

Unnamed: 0,Id_x,Title_x,Job_Function__c_x,AccountId,BecameMQLDate_x,Id_y,ConvertedContactId,Area__c,Territory__c,State,Country,Industry,Title_y,Job_Function__c_y,ConvertedAccountId,Account_Name__c,BecameMQLDate_y
0,0034W00002NjEeyQAF,,Manager,0014W00002FhDWeQAN,NaT,,,,,,,,,,,,NaT
1,0034W00002NjEezQAF,,Manager,0014W00002FhDOLQA3,NaT,,,,,,,,,,,,NaT
2,0034W00002NjEf0QAF,OD Project Director,Director,0014W00002Fh6yWQAR,2020-01-17 11:07:07,,,,,,,,,,,,NaT
3,0034W00002NjEf1QAF,Regional Head of HR,Director,0014W00002Fh6yWQAR,NaT,,,,,,,,,,,,NaT
4,0034W00002NjEf2QAF,System Integrator (System Support),Team member,0014W00002Fh6yWQAR,2020-01-22 05:44:06,,,,,,,,,,,,NaT


In [25]:
prospects.loc[:, 'id'] = prospects.Id_x.combine_first(prospects.Id_y)
prospects.loc[:, 'became_mql_date'] = prospects.BecameMQLDate_x.combine_first(prospects.BecameMQLDate_y)
prospects.loc[:, 'title'] = prospects.Title_x.combine_first(prospects.Title_y)
prospects.loc[:, 'job_function'] = prospects.Job_Function__c_x.combine_first(prospects.Job_Function__c_y)
prospects.loc[:, 'account_id'] = prospects.AccountId.combine_first(prospects.Account_Name__c)

prospects = prospects.drop(
    [
        'Id_x', 
        'Id_y', 
        'BecameMQLDate_x', 
        'BecameMQLDate_y', 
        'ConvertedContactId', 
        'Title_x', 
        'Title_y', 
        'Job_Function__c_x', 
        'Job_Function__c_y', 
        'Account_Name__c', 
        'ConvertedAccountId',
        'AccountId'
    ], 
    axis=1
)

In [26]:
prospects.head()

Unnamed: 0,Area__c,Territory__c,State,Country,Industry,id,became_mql_date,title,job_function,account_id
0,,,,,,0034W00002NjEeyQAF,NaT,,Manager,0014W00002FhDWeQAN
1,,,,,,0034W00002NjEezQAF,NaT,,Manager,0014W00002FhDOLQA3
2,,,,,,0034W00002NjEf0QAF,2020-01-17 11:07:07,OD Project Director,Director,0014W00002Fh6yWQAR
3,,,,,,0034W00002NjEf1QAF,NaT,Regional Head of HR,Director,0014W00002Fh6yWQAR
4,,,,,,0034W00002NjEf2QAF,2020-01-22 05:44:06,System Integrator (System Support),Team member,0014W00002Fh6yWQAR


In [27]:
prospects_0 = prospects.merge(
    accounts,
    how='left',
    left_on='account_id',
    right_on='Id'
)

In [28]:
prospects_0.head()

Unnamed: 0,Area__c,Territory__c,State,Country,Industry_x,id,became_mql_date,title,job_function,account_id,Id,Industry_y,BillingState,BillingCountry,Account_Area__c,Account_Territory__c
0,,,,,,0034W00002NjEeyQAF,NaT,,Manager,0014W00002FhDWeQAN,0014W00002FhDWeQAN,,,,Rest of World,Rest of World
1,,,,,,0034W00002NjEezQAF,NaT,,Manager,0014W00002FhDOLQA3,0014W00002FhDOLQA3,,,,Rest of World,Rest of World
2,,,,,,0034W00002NjEf0QAF,2020-01-17 11:07:07,OD Project Director,Director,0014W00002Fh6yWQAR,0014W00002Fh6yWQAR,Technology,Hong Kong,China,Rest of World,APAC
3,,,,,,0034W00002NjEf1QAF,NaT,Regional Head of HR,Director,0014W00002Fh6yWQAR,0014W00002Fh6yWQAR,Technology,Hong Kong,China,Rest of World,APAC
4,,,,,,0034W00002NjEf2QAF,2020-01-22 05:44:06,System Integrator (System Support),Team member,0014W00002Fh6yWQAR,0014W00002Fh6yWQAR,Technology,Hong Kong,China,Rest of World,APAC


In [29]:
prospects_0.columns

Index(['Area__c', 'Territory__c', 'State', 'Country', 'Industry_x', 'id',
       'became_mql_date', 'title', 'job_function', 'account_id', 'Id',
       'Industry_y', 'BillingState', 'BillingCountry', 'Account_Area__c',
       'Account_Territory__c'],
      dtype='object')

In [30]:
prospects_0.loc[:, "territory"] = prospects_0.Account_Territory__c.combine_first(prospects_0.Territory__c)
prospects_0.loc[:, "industry"] = prospects_0.Industry_y.combine_first(prospects_0.Industry_x)
prospects_0.loc[:, "state"] = prospects_0.BillingState.combine_first(prospects_0.State)
prospects_0.loc[:, "country"] = prospects_0.BillingCountry.combine_first(prospects_0.Country)
prospects_0.loc[:, "account_area"] = prospects_0.Account_Area__c.combine_first(prospects_0.Area__c)

prospects_0 = prospects_0.drop(
    [
        'Account_Territory__c', 
        'Territory__c', 
        'Industry_x', 
        'Industry_y', 
        'BillingState',
        'State',
        'BillingCountry',
        'Country',
        'Account_Area__c', 
        'Area__c',
        'Id'
    ], 
    axis=1
)

In [31]:
prospects_0.head()

Unnamed: 0,id,became_mql_date,title,job_function,account_id,territory,industry,state,country,account_area
0,0034W00002NjEeyQAF,NaT,,Manager,0014W00002FhDWeQAN,Rest of World,,,,Rest of World
1,0034W00002NjEezQAF,NaT,,Manager,0014W00002FhDOLQA3,Rest of World,,,,Rest of World
2,0034W00002NjEf0QAF,2020-01-17 11:07:07,OD Project Director,Director,0014W00002Fh6yWQAR,APAC,Technology,Hong Kong,China,Rest of World
3,0034W00002NjEf1QAF,NaT,Regional Head of HR,Director,0014W00002Fh6yWQAR,APAC,Technology,Hong Kong,China,Rest of World
4,0034W00002NjEf2QAF,2020-01-22 05:44:06,System Integrator (System Support),Team member,0014W00002Fh6yWQAR,APAC,Technology,Hong Kong,China,Rest of World


## Opportunities & OpportunityContactRoles

1. Remove unneeded columns.
2. Merge opportunity data on contact relationship data.

In [32]:
opportunities = opportunities.loc[:, ['Id', 'RecordTypeId', 'IsClosed', 'IsWon', 'Amount', 'CloseDate', 'Type', 'CreatedDate']]

In [33]:
oppconroles = oppconroles.loc[:, ['Id', 'OpportunityId', 'ContactId']]

In [34]:
opps = oppconroles.merge(opportunities, how='left', left_on='OpportunityId', right_on='Id')

In [35]:
opps = opps.rename(
    {
        'Id_x': 'OppConRoleId'
    }, 
    axis=1
).drop(['Id_y'], axis=1)

In [36]:
opps.loc[:, 'CloseDate'] = opps.CloseDate.apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S'))

In [37]:
opps[opps["IsWon"] == True].head()

Unnamed: 0,OppConRoleId,OpportunityId,ContactId,RecordTypeId,IsClosed,IsWon,Amount,CloseDate,Type,CreatedDate
25,00K8a00000VtFs7EAF,0068a00001Fpid5AAB,0034W00002b9pqEQAQ,0124W0000007bPpQAI,True,True,6000.0,2022-02-22,New Business,2022-02-14T16:15:10
29,00K4W00000Leo7zUAB,0064W00000xv1zsQAA,0034W00002NjErPQAV,0124W0000007bPpQAI,True,True,24.0,2020-03-22,New Business,2020-03-03T21:12:30
30,00K4W00000Leo7yUAB,0064W00000xv1zsQAA,0034W00002NjErPQAV,0124W0000007bPpQAI,True,True,24.0,2020-03-22,New Business,2020-03-03T21:12:30
35,00K4W00000LhOplUAF,0064W00000xv2AYQAY,0034W00002NjEh2QAF,0124W0000007bPpQAI,True,True,30000.0,2020-09-17,New Business,2020-09-01T16:46:01
54,00K4W00000LhOrmUAF,0064W00000xv2F3QAI,0034W00002NjEkTQAV,0124W0000007bPpQAI,True,True,127200.0,2020-12-22,Existing Business,2020-06-22T18:27:59


## Get supplemental prospect data

1. Remove excess tasks.
2. Update opps with OpportunityType.
3. Create summary dataframes.
4. Merge sumary dataframes with prospects.
5. Clean up fields.
6. Output to CSV.

In [109]:
tasks = tasks[
    (tasks.RecordTypeId == '0124W000001lKRjQAM') & # Only sales tasks record type
    (tasks.Status == 'Completed') &
    (tasks.Type.isin(['Call', 'Email', 'Other', 'Meeting']))
]

In [110]:
def opp_record_type(row):
    if row['RecordTypeId'] == '0124W0000007bPpQAI':
        return 'New Business'
    elif row['RecordTypeId'] == '0124W0000007bPuQAI':
        return 'Renewal'
    else:
        return 'Partner'

def opp_type(row):
    if row['RecordType'] == 'Renewal' or row['RecordType'] == 'Partner':
        return row['RecordType']
    if row['Type'] == 'Existing Business':
        return 'Upsell'
    else:
        return 'New Business'

In [111]:
opps.loc[:, 'RecordType'] = opps.apply(opp_record_type, axis=1)
opps.loc[:, 'OpportunityType'] = opps.apply(opp_type, axis=1)

In [112]:
new_business_ops = opps[opps["OpportunityType"] == "New Business"]

In [113]:
new_business_ops.head()

Unnamed: 0,OppConRoleId,OpportunityId,ContactId,RecordTypeId,IsClosed,IsWon,Amount,CloseDate,Type,CreatedDate,RecordType,OpportunityType
0,00K8a00000XJm4bEAD,0068a00001GdcT8AAJ,0038a00002rQon6AAC,0124W0000007bPpQAI,False,False,100000.0,2022-12-29,New Business,2022-04-12T11:07:45,New Business,New Business
1,00K8a00000XJniXEAT,0068a00001GdeqOAAR,0038a00002rQxZzAAK,0124W0000007bPpQAI,False,False,50000.0,2022-06-30,,2022-04-13T08:29:54,New Business,New Business
2,00K8a00000XJpSvEAL,0068a00001Gdi6IAAR,0038a00002rR8K4AAK,0124W0000007bPpQAI,False,False,50000.0,2022-06-30,New Business,2022-04-14T14:58:35,New Business,New Business
3,00K8a00000XJjFgEAL,0068a00001GdX03AAF,0038a00002qcCjNAAU,0124W0000007bPpQAI,False,False,15000.0,2022-08-31,New Business,2022-04-08T14:21:06,New Business,New Business
4,00K8a00000XJjX1EAL,0068a00001GdXOyAAN,0038a00002qcD6RAAU,0124W0000007bPpQAI,False,False,50000.0,2022-08-26,New Business,2022-04-08T16:28:10,New Business,New Business


In [114]:
new_business_ops.loc[:, "IsWon"] = new_business_ops["IsWon"].map(lambda x: 1 if x else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_business_ops.loc[:, "IsWon"] = new_business_ops["IsWon"].map(lambda x: 1 if x else 0)


In [115]:
opps.head()

Unnamed: 0,OppConRoleId,OpportunityId,ContactId,RecordTypeId,IsClosed,IsWon,Amount,CloseDate,Type,CreatedDate,RecordType,OpportunityType
0,00K8a00000XJm4bEAD,0068a00001GdcT8AAJ,0038a00002rQon6AAC,0124W0000007bPpQAI,False,False,100000.0,2022-12-29,New Business,2022-04-12T11:07:45,New Business,New Business
1,00K8a00000XJniXEAT,0068a00001GdeqOAAR,0038a00002rQxZzAAK,0124W0000007bPpQAI,False,False,50000.0,2022-06-30,,2022-04-13T08:29:54,New Business,New Business
2,00K8a00000XJpSvEAL,0068a00001Gdi6IAAR,0038a00002rR8K4AAK,0124W0000007bPpQAI,False,False,50000.0,2022-06-30,New Business,2022-04-14T14:58:35,New Business,New Business
3,00K8a00000XJjFgEAL,0068a00001GdX03AAF,0038a00002qcCjNAAU,0124W0000007bPpQAI,False,False,15000.0,2022-08-31,New Business,2022-04-08T14:21:06,New Business,New Business
4,00K8a00000XJjX1EAL,0068a00001GdXOyAAN,0038a00002qcD6RAAU,0124W0000007bPpQAI,False,False,50000.0,2022-08-26,New Business,2022-04-08T16:28:10,New Business,New Business


In [116]:
opps_to_merge = opps.groupby(
    [
        'ContactId', 'OpportunityType'
    ]
).size().unstack(fill_value=0).reset_index().rename(
    {
        'New Business': 'new_business_count', 
        'Upsell': 'upsell_count', 
        'Renewal': 'renewal_count', 
        'Partner': 'partner_count'
    }, 
    axis=1
)

In [117]:
opps_to_merge.head()

OpportunityType,ContactId,new_business_count,partner_count,renewal_count,upsell_count
0,0034W00002NjEeyQAF,1,0,0,0
1,0034W00002NjEf1QAF,0,0,1,0
2,0034W00002NjEfiQAF,0,0,1,0
3,0034W00002NjEgGQAV,0,0,2,0
4,0034W00002NjEgIQAV,0,0,1,0


In [118]:
opps_to_merge.shape

(1196, 5)

In [119]:
contact_amount = opps.groupby(["ContactId"]).agg({"Amount": "sum"}).reset_index()
contact_amount.head()

Unnamed: 0,ContactId,Amount
0,0034W00002NjEeyQAF,0.0
1,0034W00002NjEf1QAF,11520.0
2,0034W00002NjEfiQAF,42000.0
3,0034W00002NjEgGQAV,13000.0
4,0034W00002NjEgIQAV,7200.0


In [120]:
new_business_ops["amount_won"] = new_business_ops.apply(lambda x: x["Amount"] if x['IsWon'] == 1 else 0, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_business_ops["amount_won"] = new_business_ops.apply(lambda x: x["Amount"] if x['IsWon'] == 1 else 0, axis=1)


In [121]:
new_business_ops.head()

Unnamed: 0,OppConRoleId,OpportunityId,ContactId,RecordTypeId,IsClosed,IsWon,Amount,CloseDate,Type,CreatedDate,RecordType,OpportunityType,amount_won
0,00K8a00000XJm4bEAD,0068a00001GdcT8AAJ,0038a00002rQon6AAC,0124W0000007bPpQAI,False,0,100000.0,2022-12-29,New Business,2022-04-12T11:07:45,New Business,New Business,0.0
1,00K8a00000XJniXEAT,0068a00001GdeqOAAR,0038a00002rQxZzAAK,0124W0000007bPpQAI,False,0,50000.0,2022-06-30,,2022-04-13T08:29:54,New Business,New Business,0.0
2,00K8a00000XJpSvEAL,0068a00001Gdi6IAAR,0038a00002rR8K4AAK,0124W0000007bPpQAI,False,0,50000.0,2022-06-30,New Business,2022-04-14T14:58:35,New Business,New Business,0.0
3,00K8a00000XJjFgEAL,0068a00001GdX03AAF,0038a00002qcCjNAAU,0124W0000007bPpQAI,False,0,15000.0,2022-08-31,New Business,2022-04-08T14:21:06,New Business,New Business,0.0
4,00K8a00000XJjX1EAL,0068a00001GdXOyAAN,0038a00002qcD6RAAU,0124W0000007bPpQAI,False,0,50000.0,2022-08-26,New Business,2022-04-08T16:28:10,New Business,New Business,0.0


In [122]:
new_business_amount = new_business_ops.groupby(["ContactId"]).agg({"Amount": "sum", "OpportunityType": "count", "IsWon": "sum", "amount_won": "sum"}).reset_index()
new_business_amount.head()

Unnamed: 0,ContactId,Amount,OpportunityType,IsWon,amount_won
0,0034W00002NjEeyQAF,0.0,1,0,0.0
1,0034W00002NjEgeQAF,576000.0,1,0,0.0
2,0034W00002NjEh2QAF,30000.0,1,1,30000.0
3,0034W00002NjEhCQAV,33000.0,1,1,33000.0
4,0034W00002NjEhIQAV,131000.0,1,1,131000.0


In [123]:
new_business_ops

Unnamed: 0,OppConRoleId,OpportunityId,ContactId,RecordTypeId,IsClosed,IsWon,Amount,CloseDate,Type,CreatedDate,RecordType,OpportunityType,amount_won
0,00K8a00000XJm4bEAD,0068a00001GdcT8AAJ,0038a00002rQon6AAC,0124W0000007bPpQAI,False,0,100000.0,2022-12-29,New Business,2022-04-12T11:07:45,New Business,New Business,0.0
1,00K8a00000XJniXEAT,0068a00001GdeqOAAR,0038a00002rQxZzAAK,0124W0000007bPpQAI,False,0,50000.0,2022-06-30,,2022-04-13T08:29:54,New Business,New Business,0.0
2,00K8a00000XJpSvEAL,0068a00001Gdi6IAAR,0038a00002rR8K4AAK,0124W0000007bPpQAI,False,0,50000.0,2022-06-30,New Business,2022-04-14T14:58:35,New Business,New Business,0.0
3,00K8a00000XJjFgEAL,0068a00001GdX03AAF,0038a00002qcCjNAAU,0124W0000007bPpQAI,False,0,15000.0,2022-08-31,New Business,2022-04-08T14:21:06,New Business,New Business,0.0
4,00K8a00000XJjX1EAL,0068a00001GdXOyAAN,0038a00002qcD6RAAU,0124W0000007bPpQAI,False,0,50000.0,2022-08-26,New Business,2022-04-08T16:28:10,New Business,New Business,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1571,00K4W00000MxyLpUAJ,0064W000016HBz0QAG,0034W00002qqkeZQAQ,0124W0000007bPpQAI,False,0,75000.0,2022-05-31,New Business,2021-08-27T18:27:14,New Business,New Business,0.0
1572,00K4W00000MxyLoUAJ,0064W000016HBz0QAG,0034W00002qqkWsQAI,0124W0000007bPpQAI,False,0,75000.0,2022-05-31,New Business,2021-08-27T18:27:14,New Business,New Business,0.0
1573,00K4W00000MxyLnUAJ,0064W000016HBz0QAG,0034W00002qqkTJQAY,0124W0000007bPpQAI,False,0,75000.0,2022-05-31,New Business,2021-08-27T18:27:14,New Business,New Business,0.0
1574,00K4W00000JrrL7UAJ,0064W000016HBz0QAG,0034W00002fZv7sQAC,0124W0000007bPpQAI,False,0,75000.0,2022-05-31,New Business,2021-08-27T18:27:14,New Business,New Business,0.0


In [124]:
new_business_ops["AmountWon"] = new_business_ops.apply(lambda x: x["Amount"] if x['IsWon'] == 1 else 0, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_business_ops["AmountWon"] = new_business_ops.apply(lambda x: x["Amount"] if x['IsWon'] == 1 else 0, axis=1)


In [125]:
contact_amount

Unnamed: 0,ContactId,Amount
0,0034W00002NjEeyQAF,0.0
1,0034W00002NjEf1QAF,11520.0
2,0034W00002NjEfiQAF,42000.0
3,0034W00002NjEgGQAV,13000.0
4,0034W00002NjEgIQAV,7200.0
...,...,...
1191,0038a00002rQyGtAAK,50000.0
1192,0038a00002rR3rBAAS,50000.0
1193,0038a00002rR7tXAAS,72000.0
1194,0038a00002rR7uQAAS,35000.0


In [126]:
opps_to_merge

OpportunityType,ContactId,new_business_count,partner_count,renewal_count,upsell_count
0,0034W00002NjEeyQAF,1,0,0,0
1,0034W00002NjEf1QAF,0,0,1,0
2,0034W00002NjEfiQAF,0,0,1,0
3,0034W00002NjEgGQAV,0,0,2,0
4,0034W00002NjEgIQAV,0,0,1,0
...,...,...,...,...,...
1191,0038a00002rQyGtAAK,1,0,0,0
1192,0038a00002rR3rBAAS,1,0,0,0
1193,0038a00002rR7tXAAS,1,0,0,0
1194,0038a00002rR7uQAAS,1,0,0,0


In [127]:
new_business_ops[new_business_ops["AmountWon"] > 0]

Unnamed: 0,OppConRoleId,OpportunityId,ContactId,RecordTypeId,IsClosed,IsWon,Amount,CloseDate,Type,CreatedDate,RecordType,OpportunityType,amount_won,AmountWon
25,00K8a00000VtFs7EAF,0068a00001Fpid5AAB,0034W00002b9pqEQAQ,0124W0000007bPpQAI,True,1,6000.0,2022-02-22,New Business,2022-02-14T16:15:10,New Business,New Business,6000.0,6000.0
29,00K4W00000Leo7zUAB,0064W00000xv1zsQAA,0034W00002NjErPQAV,0124W0000007bPpQAI,True,1,24.0,2020-03-22,New Business,2020-03-03T21:12:30,New Business,New Business,24.0,24.0
30,00K4W00000Leo7yUAB,0064W00000xv1zsQAA,0034W00002NjErPQAV,0124W0000007bPpQAI,True,1,24.0,2020-03-22,New Business,2020-03-03T21:12:30,New Business,New Business,24.0,24.0
35,00K4W00000LhOplUAF,0064W00000xv2AYQAY,0034W00002NjEh2QAF,0124W0000007bPpQAI,True,1,30000.0,2020-09-17,New Business,2020-09-01T16:46:01,New Business,New Business,30000.0,30000.0
78,00K4W00000JAMYSUA5,0064W000011zx8eQAA,0034W00002WacxAQAR,0124W0000007bPpQAI,True,1,186000.0,2021-11-25,New Business,2021-06-16T10:01:40,New Business,New Business,186000.0,186000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518,00K4W00000J9gCuUAJ,0064W00000yq7k8QAA,0034W00002b9fgHQAQ,0124W0000007bPpQAI,True,1,25000.0,2021-09-30,New Business,2021-03-25T13:11:34,New Business,New Business,25000.0,25000.0
1519,00K4W00000HKm5DUAT,0064W00000yq7k8QAA,0034W00002ToOLtQAN,0124W0000007bPpQAI,True,1,25000.0,2021-09-30,New Business,2021-03-25T13:11:34,New Business,New Business,25000.0,25000.0
1520,00K4W00000GcFTjUAN,0064W00000yq7k8QAA,0034W00002NjF7KQAV,0124W0000007bPpQAI,True,1,25000.0,2021-09-30,New Business,2021-03-25T13:11:34,New Business,New Business,25000.0,25000.0
1524,00K4W00000LgS7JUAV,0064W00000xv27rQAA,0034W00002NjGBzQAN,0124W0000007bPpQAI,True,1,4500.0,2020-02-18,New Business,2019-12-20T16:36:49,New Business,New Business,4500.0,4500.0


In [128]:
tasks_to_merge = tasks.groupby(['WhoId', 'Type']).size().unstack(fill_value=0).reset_index().rename({'Call': 'call_count', 'Email': 'email_count', 'Other': 'other_count'}, axis=1)

In [129]:
tasks_to_merge

Type,WhoId,call_count,email_count,Meeting,other_count
0,0034W00002NjEfPQAV,0,1,0,0
1,0034W00002NjEfiQAF,0,2,0,0
2,0034W00002NjEg6QAF,0,13,0,0
3,0034W00002NjEgGQAV,0,2,0,0
4,0034W00002NjEgRQAV,0,25,0,0
...,...,...,...,...,...
81297,00Q8a00001q1ESDEA2,0,1,0,0
81298,00Q8a00001q1ETWEA2,0,1,0,0
81299,00Q8a00001q1ETXEA2,0,1,0,0
81300,00Q8a00001q1EUbEAM,0,2,0,0


In [130]:
events_to_merge = events.groupby(['WhoId']).size().reset_index(name='event_count')
events_to_merge.head()

Unnamed: 0,WhoId,event_count
0,0034W00002NjEg6QAF,2
1,0034W00002NjEgIQAV,1
2,0034W00002NjEgRQAV,13
3,0034W00002NjEgWQAV,2
4,0034W00002NjEgZQAV,1


In [131]:
prospects_0.columns

Index(['id', 'became_mql_date', 'title', 'job_function', 'account_id',
       'territory', 'industry', 'state', 'country', 'account_area'],
      dtype='object')

In [132]:
prospects2 = prospects_0.merge(new_business_amount, how="left", left_on="id", right_on="ContactId").drop(['ContactId'], axis=1)

In [133]:
# prospects2 = prospects.merge(opps_to_merge, how='left', left_on='Id', right_on='ContactId').drop(['ContactId'], axis=1)

In [134]:
prospects3 = prospects2.merge(tasks_to_merge, how='left', left_on='id', right_on='WhoId').drop(['WhoId'], axis=1)

In [135]:
prospects4 = prospects3.merge(events_to_merge, how='left', left_on='id', right_on='WhoId').drop(['WhoId'], axis=1)

In [136]:
prospects4.Meeting = prospects4.Meeting.fillna(0)
prospects4.event_count = prospects4.event_count.fillna(0)

In [137]:
prospects4.loc[:, "MeetingCount"] = prospects4.Meeting + prospects4.event_count

In [138]:
prospects4.head()

Unnamed: 0,id,became_mql_date,title,job_function,account_id,territory,industry,state,country,account_area,Amount,OpportunityType,IsWon,amount_won,call_count,email_count,Meeting,other_count,event_count,MeetingCount
0,0034W00002NjEeyQAF,NaT,,Manager,0014W00002FhDWeQAN,Rest of World,,,,Rest of World,0.0,1.0,0.0,0.0,,,0.0,,0.0,0.0
1,0034W00002NjEezQAF,NaT,,Manager,0014W00002FhDOLQA3,Rest of World,,,,Rest of World,,,,,,,0.0,,0.0,0.0
2,0034W00002NjEf0QAF,2020-01-17 11:07:07,OD Project Director,Director,0014W00002Fh6yWQAR,APAC,Technology,Hong Kong,China,Rest of World,,,,,,,0.0,,0.0,0.0
3,0034W00002NjEf1QAF,NaT,Regional Head of HR,Director,0014W00002Fh6yWQAR,APAC,Technology,Hong Kong,China,Rest of World,,,,,,,0.0,,0.0,0.0
4,0034W00002NjEf2QAF,2020-01-22 05:44:06,System Integrator (System Support),Team member,0014W00002Fh6yWQAR,APAC,Technology,Hong Kong,China,Rest of World,,,,,,,0.0,,0.0,0.0


In [139]:
prospects4 = prospects4.drop(["Meeting", "event_count"], axis=1)

In [140]:
prospects4.columns

Index(['id', 'became_mql_date', 'title', 'job_function', 'account_id',
       'territory', 'industry', 'state', 'country', 'account_area', 'Amount',
       'OpportunityType', 'IsWon', 'amount_won', 'call_count', 'email_count',
       'other_count', 'MeetingCount'],
      dtype='object')

In [141]:
prospects4 = prospects4.rename(columns={
    "OpportunityType": "new_business_count",
    "account_area": "region",
    "territory": "sub_region",
    "Amount": "new_business_opportunity_amount_sum",
    "amount_won": "new_business_won_amount_sum",
    "IsWon": "is_new_business_won_count"
})

In [142]:
prospects4.columns

Index(['id', 'became_mql_date', 'title', 'job_function', 'account_id',
       'sub_region', 'industry', 'state', 'country', 'region',
       'new_business_opportunity_amount_sum', 'new_business_count',
       'is_new_business_won_count', 'new_business_won_amount_sum',
       'call_count', 'email_count', 'other_count', 'MeetingCount'],
      dtype='object')

In [143]:
# Clean up
prospects4.new_business_count = prospects4.new_business_count.fillna(0)
prospects4.new_business_opportunity_amount_sum = prospects4.new_business_opportunity_amount_sum.fillna(0)
prospects4.new_business_won_amount_sum = prospects4.new_business_won_amount_sum.fillna(0)
prospects4.is_new_business_won_count = prospects4.is_new_business_won_count.fillna(0)
# prospects4.renewal_count = prospects4.renewal_count.fillna(0)
# prospects4.upsell_count = prospects4.upsell_count.fillna(0)
# prospects4.partner_count = prospects4.partner_count.fillna(0)
prospects4.call_count = prospects4.call_count.fillna(0)
prospects4.email_count = prospects4.email_count.fillna(0)
prospects4.other_count = prospects4.other_count.fillna(0)
prospects4.MeetingCount = prospects4.MeetingCount.fillna(0)
# prospects4.event_count = prospects4.event_count.fillna(0)
# prospects4.Amount = prospects4.Amount.fillna(0)
prospects4.region = prospects4.region.fillna("")
prospects4.sub_region = prospects4.sub_region.fillna("")

In [144]:
# Change NA -> North America in region and sub_region
prospects4.loc[:, "region"] = prospects4.region.map(lambda x: x.replace("NA", "North America") if x.startswith("NA") else x)
prospects4.loc[:, "sub_region"] = prospects4.region.map(lambda x: x.replace("NA", "North America") if x.startswith("NA") else x)

In [145]:
prospects4.head()

Unnamed: 0,id,became_mql_date,title,job_function,account_id,sub_region,industry,state,country,region,new_business_opportunity_amount_sum,new_business_count,is_new_business_won_count,new_business_won_amount_sum,call_count,email_count,other_count,MeetingCount
0,0034W00002NjEeyQAF,NaT,,Manager,0014W00002FhDWeQAN,Rest of World,,,,Rest of World,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0034W00002NjEezQAF,NaT,,Manager,0014W00002FhDOLQA3,Rest of World,,,,Rest of World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0034W00002NjEf0QAF,2020-01-17 11:07:07,OD Project Director,Director,0014W00002Fh6yWQAR,Rest of World,Technology,Hong Kong,China,Rest of World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0034W00002NjEf1QAF,NaT,Regional Head of HR,Director,0014W00002Fh6yWQAR,Rest of World,Technology,Hong Kong,China,Rest of World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0034W00002NjEf2QAF,2020-01-22 05:44:06,System Integrator (System Support),Team member,0014W00002Fh6yWQAR,Rest of World,Technology,Hong Kong,China,Rest of World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
meetings = pd.concat([events, tasks[tasks["Type"] == "Meeting"]])
meetings["Type"].head()

0    Meeting
1    Meeting
2    Meeting
3       None
4    Meeting
Name: Type, dtype: object

In [147]:
# When was the first time a prospect had a meeting?
# First ActivityDate in events for each prospect.
# events.groupby(["WhoId", "ActivityDate"]).head()

   
_df = meetings.sort_values(["WhoId", "ActivityDate"], ascending=[True, True])
# _df[temp_df["WhoId"] == "0034W00002NjEg6QAF"]
first_meetings = _df.drop_duplicates(subset="WhoId", keep="first", ignore_index=True)
first_meetings = first_meetings.loc[:, ["WhoId", "ActivityDate"]]
# first_meetings[first_meetings["WhoId"] == "0034W00002NjEg6QAF"]
first_meetings

Unnamed: 0,WhoId,ActivityDate
0,0034W00002NjEg6QAF,2021-06-30T00:00:00
1,0034W00002NjEgIQAV,2021-06-15T00:00:00
2,0034W00002NjEgRQAV,2021-08-26T00:00:00
3,0034W00002NjEgWQAV,2022-02-09T00:00:00
4,0034W00002NjEgZQAV,2022-01-11T00:00:00
...,...,...
2535,00Q8a00001q1ATFEA2,2022-04-14T00:00:00
2536,00Q8a00001q1AdcEAE,2022-04-13T00:00:00
2537,00Q8a00001q1D7BEAU,2022-04-19T00:00:00
2538,00Q8a00001q1DsiEAE,2022-04-22T00:00:00


In [164]:
new_business_ops.groupby(["OpportunityType"]).size()

OpportunityType
New Business    1025
dtype: int64

In [148]:
# When did a prospect become a closed/won?
# First CloseDate for won opportunities.
first_won = (
    new_business_ops[new_business_ops["IsWon"] == True]
    .sort_values(["ContactId", "CloseDate"], ascending=[True, True])
    .drop_duplicates(subset="ContactId", keep="first", ignore_index=True)
)
first_won = first_won.loc[:, ["ContactId", "CloseDate"]]
first_won.head()

Unnamed: 0,ContactId,CloseDate
0,0034W00002NjEh2QAF,2020-09-17
1,0034W00002NjEhCQAV,2020-09-10
2,0034W00002NjEhIQAV,2020-08-11
3,0034W00002NjElIQAV,2020-12-08
4,0034W00002NjErPQAV,2020-03-22


In [149]:
prospects5 = prospects4.merge(
    first_meetings,
    how="left",
    left_on="id",
    right_on="WhoId"
)

In [150]:
prospects5 = prospects5.merge(
    first_won,
    how="left",
    left_on='id', 
    right_on='ContactId'
)

In [151]:
prospects5.drop(columns=['WhoId', 'ContactId'], inplace=True)

In [152]:
prospects5.rename(
    columns={
        "ActivityDate": "first_meeting_date",
        "CloseDate": "first_opportunity_won_date"
    },
    inplace=True
)

In [153]:
prospects5.columns.tolist()

['id',
 'became_mql_date',
 'title',
 'job_function',
 'account_id',
 'sub_region',
 'industry',
 'state',
 'country',
 'region',
 'new_business_opportunity_amount_sum',
 'new_business_count',
 'is_new_business_won_count',
 'new_business_won_amount_sum',
 'call_count',
 'email_count',
 'other_count',
 'MeetingCount',
 'first_meeting_date',
 'first_opportunity_won_date']

In [154]:
prospects5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 288893 entries, 0 to 288892
Data columns (total 20 columns):
 #   Column                               Non-Null Count   Dtype         
---  ------                               --------------   -----         
 0   id                                   288893 non-null  object        
 1   became_mql_date                      67386 non-null   datetime64[ns]
 2   title                                215101 non-null  object        
 3   job_function                         257503 non-null  object        
 4   account_id                           165118 non-null  object        
 5   sub_region                           288893 non-null  object        
 6   industry                             215637 non-null  object        
 7   state                                111297 non-null  object        
 8   country                              273774 non-null  object        
 9   region                               288893 non-null  object        
 

In [155]:
def fix_date_column(value):
    if type(value) == str:
        print("string")
        print(value)
        if value == "":
            return ""
        sys.exit()
    else:
        print(type(value), value)
    # lambda x: "" if pd.isnull(x) else x.strftime("%Y-%m-%d")

In [156]:
for column in ("first_meeting_date", "first_opportunity_won_date"):
    prospects5[column] = prospects5[column].apply(fix_date_column)
    # prospects5[column].fillna("")

<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan
<class 'float'> nan


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [160]:
import numpy as np
prospects5[prospects5["first_opportunity_won_date"] == pd.NaT]

Unnamed: 0,id,became_mql_date,title,job_function,account_id,sub_region,industry,state,country,region,new_business_opportunity_amount_sum,new_business_count,is_new_business_won_count,new_business_won_amount_sum,call_count,email_count,other_count,MeetingCount,first_meeting_date,first_opportunity_won_date


In [None]:
prospects5[prospects5["new_business_count"] > 0]

In [161]:
prospects5.to_csv(os.path.join(os.pardir, 'top_of_funnel', 'data', 'salesforce_prospects_data_for_ads_v5.csv'), index=False)

In [None]:
prospects5.shape

In [None]:
len(prospects5["id"].unique())