# Packages & Imports

In [None]:
!pip install -q --upgrade gspread
!pip install -q pandas
!pip install -q -U -q PyDrive
!pip install -q datetime
!pip install -q pendulum

[K     |████████████████████████████████| 60 kB 3.0 MB/s 
[K     |████████████████████████████████| 251 kB 10.4 MB/s 
[K     |████████████████████████████████| 155 kB 5.0 MB/s 
[K     |████████████████████████████████| 489 kB 41.7 MB/s 
[?25h

In [None]:
# Standard library
import datetime
import re

# Third-party
import gspread
import pandas as pd
import pendulum
from google.colab import auth
from oauth2client.client import GoogleCredentials
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# Interactive Colab sign-in; kept ahead of every credential lookup below.
auth.authenticate_user()

# Google Drive client (PyDrive) using the application-default credentials.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Google Sheets client (gspread) using the same kind of credentials.
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Getting Prospection Data

Check the prospects tables for N/As, so:
1. Open them one by one and see if they load without problems: [Prospects 1](https://docs.google.com/spreadsheets/d/19H7kZTWoDlKTk9wnaHt9OFlVGPDOV-LWygpxOsAB9T0/edit#gid=330075284), [Prospects 2](https://docs.google.com/spreadsheets/d/1XGrArQUq3O1uo7oYRPeKtPczGBkxd4v-yJtBdy_SZmQ/edit#gid=0), [Prospects 3](https://docs.google.com/spreadsheets/d/12GRG8PNzKhkhVcuUcrLWXvVLxz58gdF_0zVy4vS24NU/edit#gid=1476972697)
2. If all of them are alright, run the following cells


In [None]:
def load_sheet_df(spreadsheet_key, worksheet_name):
  """Read one Google Sheets worksheet into a DataFrame.

  Args:
    spreadsheet_key: key of the spreadsheet, as passed to `gc.open_by_key`.
    worksheet_name: title of the worksheet (tab) to read.

  Returns:
    A DataFrame whose columns are the worksheet's first row and whose records
    are all remaining rows, with values exactly as `get_all_values()` returns them.
  """
  rows = gc.open_by_key(spreadsheet_key).worksheet(worksheet_name).get_all_values()
  return pd.DataFrame.from_records(rows[1:], columns=rows[0])

# The three prospects tables
df_p1 = load_sheet_df('19H7kZTWoDlKTk9wnaHt9OFlVGPDOV-LWygpxOsAB9T0', 'Prospects')
df_p2 = load_sheet_df('1XGrArQUq3O1uo7oYRPeKtPczGBkxd4v-yJtBdy_SZmQ', 'Sheet1')
df_p3 = load_sheet_df('12GRG8PNzKhkhVcuUcrLWXvVLxz58gdF_0zVy4vS24NU', 'All')

# Stack them; ignore_index gives every row a unique label instead of repeating
# each table's own 0..n index.
prospects_df = pd.concat([df_p1, df_p2, df_p3], axis=0, ignore_index=True)
# Domain column: the part of the email after '@'
prospects_df['domain'] = prospects_df['Email'].str.split('@').str[1]
# Week column: year, month abbreviation and %U week number of the prospection date
prospects_df['Date'] = pd.to_datetime(prospects_df['Date'])
prospects_df['WeekNum'] = prospects_df['Date'].dt.strftime('%Y-%b-w%U')

In [None]:
#prospects_df[prospects_df['Company Name']=="MAPFRE Salud ARS"]

# Outreach Weekly Table

## Selecting Outreach Weekly Data

Filtering previous week

In [None]:
# Independent copy of the prospects table; the outreach cells below read and update this frame.
prosp_out_df = prospects_df.copy()

In [None]:
# Label of the current week, in the same '%Y-%b-w%U' format as the WeekNum column.
mydate = datetime.date.today()
current_week = mydate.strftime('%Y-%b-w%U')
current_week

'2022-Jan-w03'

In [None]:
# Label of the previous week, in the same format as the WeekNum column.
today = datetime.date.today()  # today's date
weekday = today.weekday()  # 0 = Monday ... 6 = Sunday
start_delta = datetime.timedelta(days=weekday, weeks=1)  # back to this week's Monday, plus one more week
start_of_week = today - start_delta  # Monday of the previous week
start_of_week
outreach_week = start_of_week.strftime('%Y-%b-w%U')  # formatted like the dataset's WeekNum values
outreach_week

'2022-Jan-w02'

In [None]:
from datetime import date, timedelta
# Monday of the current week is the anchor; the outreach window is the
# Monday-to-Friday span of the week before it.
this_monday = today - timedelta(days=today.weekday())
last_monday = this_monday - timedelta(weeks=1)
last_friday = last_monday + timedelta(days=4)
print("Dates: ", last_monday, "to", last_friday)

Dates:  2022-01-10 to 2022-01-14


In [None]:
# Filtered but not treated: prospects dated within last week's Monday-Friday window.
prosp_out_df['Date'] = pd.to_datetime(prosp_out_df['Date'])
in_window = prosp_out_df['Date'].between(pd.Timestamp(last_monday), pd.Timestamp(last_friday))
# .copy() makes this an independent frame, so later cells can add columns to it
# without pandas raising SettingWithCopyWarning.
weekly_outreach_raw = prosp_out_df[in_window].copy()
weekly_outreach_raw.shape

(1936, 26)

## Clean-up & Processing

### Applying Head Industries & Translating Industries Names

Head and Sub Industries Dictionary from [Industries File](https://docs.google.com/spreadsheets/d/1ASfhwu4pKwET5bz8coI59Bzpn05OuGoZBmWr12nQsyc/edit#gid=0)

In [None]:
# Load the 'Industries' worksheet; its first row holds the column names.
ind_accesss = gc.open_by_key('1ASfhwu4pKwET5bz8coI59Bzpn05OuGoZBmWr12nQsyc')
ind_data = ind_accesss.worksheet('Industries')
indsrows = ind_data.get_all_values()
df_industries = pd.DataFrame.from_records(indsrows[1:],columns=indsrows[0])
# Lookup table: industry name -> head industry
ind_dict = dict(zip(df_industries['Industry'],df_industries['Head Industry']))
# Adding Head Industry column to the weekly prospection table.
# assign() builds a new frame instead of writing into what may be a slice of
# prosp_out_df, which is what triggered SettingWithCopyWarning.
weekly_outreach_raw = weekly_outreach_raw.assign(
    **{'Head Industry': weekly_outreach_raw['Industry'].map(ind_dict)}
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Applying Industries Translations using [another table](https://docs.google.com/spreadsheets/d/1opKh-8bi55_A92SNn3UzGp7hBXiMx9LHWlIOeKUKjI8/edit#gid=0) as Dictionary

In [None]:
# Load the translation worksheet (first row = column names).
_TRANSLATIONS_KEY = '1opKh-8bi55_A92SNn3UzGp7hBXiMx9LHWlIOeKUKjI8'
ind_trans_accesss = gc.open_by_key(_TRANSLATIONS_KEY)
ind_trans_data = ind_trans_accesss.worksheet('Sheet1')
indstransrows = ind_trans_data.get_all_values()
df_industries_trans = pd.DataFrame.from_records(
    indstransrows[1:],
    columns=indstransrows[0],
)
# English industry name -> translated name, one dictionary per language.
_industry_en = df_industries_trans['Industry (EN)']
ind_dict_pt = dict(zip(_industry_en, df_industries_trans['Industry (PT)']))  # Portuguese, for Brasil
ind_dict_es = dict(zip(_industry_en, df_industries_trans['Industry (ES)']))  # Spanish, for LATAM & MX

def ind_trans(weekly_outreach_raw):
  """Translate the industry of one prospect row.

  Intended for `DataFrame.apply(ind_trans, axis=1)`.

  Args:
    weekly_outreach_raw: a single row (Series or mapping) with 'Country' and
      'Industry' entries.

  Returns:
    The entry of `ind_dict_pt` for the row's industry when the country is
    'Brasil' or 'Brazil', otherwise the entry of `ind_dict_es`. NaN when the
    industry has no translation. A scalar is returned (not a one-element
    Series) so that the row-wise apply yields a plain Series.
  """
  if weekly_outreach_raw['Country'] in ('Brasil', 'Brazil'):
    translations = ind_dict_pt
  else:
    translations = ind_dict_es
  return translations.get(weekly_outreach_raw['Industry'], float('nan'))

# Apply ind_trans to every row and store the result in a new 'Industry_T' column
# (the original frame is left untouched; assign returns a new one).
weekly_outreach_fil = weekly_outreach_raw.assign(Industry_T=weekly_outreach_raw.apply(ind_trans,axis=1))
weekly_outreach_fil.head(3)

Unnamed: 0,Date,Company Name,ID,S-ID,LGA,Country,City,Score Corp,Course 1,Course 2,Web page,First Name,Last Name,Contact Name,Score,Title,Email,LinkedIn Profile,Phone,Industry,Company ID,website,Name,NeverBounce,domain,WeekNum,Head Industry,Industry_T
15186,2022-01-11,Naturmega,,,Isabella,Colombia,Barranquilla,,Recruiting,Medical,,,,,2,Analista RH,mdiaz@naturmega.com.co,https://www.linkedin.com/in/maria-camila-diaz-...,(57) 5 371-9773,Alternative Medicine,,https://naturmega.com/home,Maria Camila Diaz,valid,naturmega.com.co,2022-Jan-w02,Healthcare & Pharmaceuticals,de medicina alternativa
15187,2022-01-11,Naturmega,,,Isabella,Colombia,Barranquilla,,Sales,Medical,,,,,3,Directora administrativa,pnavarro@naturmega.com.co,https://www.linkedin.com/in/paola-navarro-rome...,(57) 5 371-9773,Alternative Medicine,,https://naturmega.com/home,Paola Navarro,unknown,naturmega.com.co,2022-Jan-w02,Healthcare & Pharmaceuticals,de medicina alternativa
15188,2022-01-11,Naturmega,,,Isabella,Colombia,Barranquilla,,Sales,Medical,,,,,4,Gerente de I&D y Nuevos Negocios,wmartinez@naturmega.com.co,https://www.linkedin.com/in/wilson-martinez-94...,(57) 5 371-9773,Alternative Medicine,,https://naturmega.com/home,Wilson Martinez,valid,naturmega.com.co,2022-Jan-w02,Healthcare & Pharmaceuticals,de medicina alternativa


### Checking for Prospection Periods

#### Checking Last Prospection Date for the Domain crossed with Country and removing those under 90 days

Using loc with subtraction of duplicated combinations of domain and country to keep only a dataset with "lasts" to pass on the filter later

In [None]:
# History = every prospect dated before the outreach window (`last_monday` is
# computed in the date cell above). Filtering on the date rather than on the
# WeekNum label also excludes outreach-week rows whose label differs because the
# week spans two months, and rows prospected after the outreach week.
ptest = prosp_out_df[prosp_out_df['Date'] < pd.Timestamp(last_monday)].copy()
# Keep the last-listed row of every domain/country pair. Pairs that appear only
# once in the history are kept too: a single earlier prospection is still the
# "last prospection" for that company.
lastdomain_df = ptest.drop_duplicates(['domain','Country'], keep='last')
lastdomain_df

Unnamed: 0,Date,Company Name,ID,S-ID,LGA,Country,City,Score Corp,Course 1,Course 2,Web page,First Name,Last Name,Contact Name,Score,Title,Email,LinkedIn Profile,Phone,Industry,Company ID,website,Name,NeverBounce,domain,WeekNum,Last
2351,2019-10-09,KAESER,830067414,,Juan,Colombia,Bogotá,35%,Management,Finance,www.kaeser.com.co,,,Margarita Juliá,,Marketing coordinator,Margarita.Julia@kaeser.com.co,,57 1 7429393,industrial,,,,,kaeser.com.co,2019-Oct-w40,Last
2515,2019-10-31,Corporacion Autonoma Regional del Valle del Cauca,890399002,,Juan,Colombia,Cali,20%,Management,Finance,www.cvc.gov.co,Luis Guillermo,Parra,Luis Guillermo Parra,,Director de Planeación,luis-guillermo.parra@cvc.gov.co,,,gubernamental,,,,,cvc.gov.co,2019-Oct-w43,Last
2521,2019-10-31,Superintendencia de Sociedades,899999086,,Juan,Colombia,Bogotá,20%,Management,Finance,www.supersociedades.gov.co,Danery,Buitrago,Danery Buitrago,,Secretaria General,daneryb@supersociedades.gov.co,,,gubernamental,,,,,supersociedades.gov.co,2019-Oct-w43,Last
2527,2019-10-31,Imprenta Nacional de Colombia,830001113,,Juan,Colombia,Bogotá,20%,Management,Finance,www.imprenta.gov.co,Octavio,Villamarín,Octavio Villamarín,,Gerente General,octavio.villamarin@imprenta.gov.co,,,gubernamental,,,,,imprenta.gov.co,2019-Oct-w43,Last
2532,2019-10-31,Unidad Nacional de Proteccion,900475780,,Juan,Colombia,Bogotá,43%,Management,Finance,www.unp.gov.co,Diana Patricia,Rios,Diana Patricia Rios,,Secretaria General,diana.rios@unp.gov.co,,,gubernamental,,,,,unp.gov.co,2019-Oct-w43,Last
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54102,2022-01-17,Great Place to Work® Colombia,,,Ericka,Colombia,Bogotá,,,,,,,,4,Profesional de Desarrollo,angelica.baquero@greatplacetowork.com.co,https://www.linkedin.com/in/angelica-maria-baq...,57 3187344059,Management Consulting,,https://www.greatplacetowork.com.co/,ANGELICA MARIA BAQUERO GUTIERREZ,,greatplacetowork.com.co,2022-Jan-w03,Last
54105,2022-01-17,Consejo Colombiano de Seguridad - CCS,,,Ericka,Colombia,Bogotá,,,,,,,,4,Coordinadora de Cursos Abiertos,emilce.mora@ccs.org.co,https://www.linkedin.com/in/emilce-mora-ariza-...,(60-1) 9191920,Management Consulting,,https://ccs.org.co/,Emilce Mora Ariza,,ccs.org.co,2022-Jan-w03,Last
54107,2022-01-17,Omnia Solution S.A.C.,,,Ericka,Peru,Lima,,,,,,,,4,Gerente Comercial,psalinas@omniasolution.com,https://www.linkedin.com/in/psalinasc,(51-1) 437-4717,Management Consulting,,https://omniasolution.com/,Pedro Salinas,,omniasolution.com,2022-Jan-w03,Last
54110,2022-01-18,Agencia Global,,,Ericka,Chile,Santiago,,,,,,,,4,Directora de Cuentas & Nuevos Negocios,mkuzmanic@agenciaglobal.cl,https://www.linkedin.com/in/martinakuzmanicgio...,56 2 22479800,Marketing and Advertising,,https://www.agenciaglobal.cl/,Martina Kuzmanic Giovanelli,,agenciaglobal.cl,2022-Jan-w03,Last


Shape of Outreach File after matching

In [None]:
# Left-join the last historical row per domain/country onto this week's contacts.
# Columns present on both sides get pandas' default suffixes: _x (this week) and _y (history).
test_merge = weekly_outreach_fil.merge(lastdomain_df[['Date','domain','Country','LGA','Company Name']], on =['domain','Country'],how='left')#.drop_duplicates(['Email'])
test_merge.shape

(1936, 31)

Finding < 90 Days companies crossing Country and Domain attributes and showing the shape after dropping rows.

In [None]:
# Days between this week's prospection (Date_x) and the matched historical one (Date_y);
# rows without a match get NaN.
test_merge['Date_x'] = pd.to_datetime(test_merge['Date_x'])
test_merge['Date_y'] = pd.to_datetime(test_merge['Date_y'])
test_merge['LastProspDate'] = (test_merge['Date_x'] - test_merge['Date_y']).dt.days
# NOTE: the <90-day filter below is commented out, so no rows are dropped in this cell.
#test_merge[(test_merge['LastProspDate'] < 90) & (test_merge['LastProspDate'] != 0) & (test_merge['LastProspDate']!= "NaN")]
#test_merge = test_merge.drop(test_merge[(test_merge['LastProspDate'] < 90) & (test_merge['LastProspDate'] != 0) & (test_merge['LastProspDate']!= "NaN")].index)
test_merge.shape

(1936, 32)

The difference in the row count (the first value of the shape) is the number of contacts removed because their company was prospected less than 90 days ago.

#### Checking Last Prospection Date for emails and removing those under 220 days

Same logic, but for e-mails, using loc with subtraction of duplicated combinations of email to keep only a dataset with "lasts" to pass on the filter later

In [None]:
# History = every prospect dated before the outreach window (`last_monday` is
# computed in the date cell above). A date filter is used instead of the WeekNum
# label so that outreach-week rows with a different month label, and rows
# prospected after the outreach week, are not counted as history.
ptest_email = prosp_out_df[prosp_out_df['Date'] < pd.Timestamp(last_monday)].copy()
# Keep the last-listed row of every email. Emails that appear only once in the
# history are kept too: one earlier contact is still the "last" contact.
lastemail_df = ptest_email.drop_duplicates(['Email'], keep='last')
lastemail_df.shape

(25883, 27)

In [None]:
# Left-join the last historical row per email (matched on email and country).
# 'Date' arrives without a suffix because test_merge only has Date_x / Date_y;
# 'Name' exists on both sides and is therefore suffixed _x / _y.
test_merge2 = test_merge.merge(lastemail_df[['Date','Email','Country','Name']], on =['Email','Country'],how='left')#.drop_duplicates(['Email'])
test_merge2.shape

(1936, 34)

In [None]:
# Days between this week's prospection (Date_x) and the last historical row for the same email (Date).
test_merge2['LastProspEmailDate'] = (test_merge2['Date_x'] - test_merge2['Date']).dt.days
# NOTE: the <220-day filter below is commented out, so no rows are dropped in this cell.
#test_merge2[(test_merge2['LastProspEmailDate'] < 220) & (test_merge2['LastProspEmailDate'] != 0) & (test_merge2['LastProspEmailDate']!= "NaN")]
#test_merge2 = test_merge2.drop(test_merge2[(test_merge2['LastProspEmailDate'] < 220) & (test_merge2['LastProspEmailDate'] != 0) & (test_merge2['LastProspEmailDate']!= "NaN")].index)
test_merge2.shape

(1936, 35)

#### Checking last Contact on Hubspot

Accessing Hubspot's Contacts Report

In [None]:
# Download the HubSpot contacts report from Google Drive (by file id) to a local
# Excel file, then load it with pandas.
hb_cont_download = drive.CreateFile({'id':'1k4XJtuLPSU4K8oR35XSB4aOqD1_3kXYo'})
hb_cont_download.GetContentFile('ContactReport.xlsx')
hbcontacts_df = pd.read_excel('ContactReport.xlsx')
hbcontacts_df.columns

Index(['Contact ID', 'First Name', 'Last Name', 'Email', 'Email Domain',
       'Create Date', 'Last Activity Date', 'Last Contacted', 'Contact owner',
       'Prospector', 'Job Title', 'Phone Number', 'Mobile Phone Number',
       'Company Name', 'Company size', 'Country Instapage', 'Area',
       'Marketing contact status', 'Strategy', 'Original Source',
       'Original Source Drill-Down 1', 'Original Source Drill-Down 2',
       'Marketing contact until next update', 'Lifecycle Stage'],
      dtype='object')

In [None]:
# Left-join HubSpot's 'Last Activity Date' by email and compute the days between it
# and this week's prospection date.
# NOTE(review): assumes read_excel parsed 'Last Activity Date' as datetimes and that
# each email appears once in the report (repeated emails would multiply rows) - confirm.
hb_merge1 = test_merge2.merge(hbcontacts_df[['Last Activity Date','Email']], on =['Email'],how='left')#.drop_duplicates(['Email'])
hb_merge1['LastHubSpotDate'] = (hb_merge1['Date_x'] - hb_merge1['Last Activity Date']).dt.days
hb_merge1['LastHubSpotDate']

0         NaN
1       147.0
2         NaN
3         NaN
4         NaN
        ...  
1931      NaN
1932      NaN
1933      NaN
1934    223.0
1935    499.0
Name: LastHubSpotDate, Length: 1936, dtype: float64

#### Checking last contact on Mailshake

In [None]:
# Download the Mailshake report from Google Drive (by file id) to a local CSV file.
msreport_downl = drive.CreateFile({'id':'1PeexA0aSk9zTWbgyTn1mle74IlNdCCXo'})
msreport_downl.GetContentFile('MailshakeReport.csv')
# The file is ';'-separated. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk, which avoids the
# mixed-dtype warning on this wide report.
mailshake_df = pd.read_csv('MailshakeReport.csv', delimiter=";", low_memory=False)
mailshake_df.columns

  interactivity=interactivity, compiler=compiler, result=result)


Index(['Email', 'Name', 'Campaign', 'Message', 'Subject', 'Sent date',
       'First open date', 'Reply date', 'Campaign ID', 'Click count',
       ...
       'pÃ¡gina web', 'id', 'Ã¡rea', 'account', 'comments', 'emaildomain',
       'lost reason', 'next step', 'telehpone', 'clients'],
      dtype='object', length=139)

In [None]:
# Last-listed Mailshake row per email. Emails that were sent a single message are
# kept as well (their only send is also their last one), and mailshake_df itself
# is no longer modified by this cell.
# NOTE(review): "last" follows the row order of the report, not 'Sent date' - confirm
# the export is in chronological order.
lastms_df = mailshake_df.drop_duplicates(['Email'], keep='last')
lastms_df.shape

(127607, 140)

In [None]:
# Left-join the last Mailshake 'Sent date' by email, parse it, and compute the days
# between it and this week's prospection date.
# NOTE(review): 'Sent date' is parsed with pandas' default inference - confirm the
# report's date format is read as intended.
ms_merge1 = hb_merge1.merge(lastms_df[['Sent date','Email']], on =['Email'],how='left')#.drop_duplicates(['Email'])
ms_merge1['Sent date'] = pd.to_datetime(ms_merge1['Sent date'])
ms_merge1['LastMailShakeDate'] = (ms_merge1['Date_x'] - ms_merge1['Sent date']).dt.days
ms_merge1.shape

(1936, 39)

#### Defining final columns after check

Re-defining weekly_outreach_fil as the output to work on next functions and dropping formulated columns

In [None]:
# Columns carried forward from the merged frame, in output order.
_kept_columns = [
    'Date_x', 'Company Name_x', 'ID', 'S-ID', 'LGA_x', 'Country', 'City',
    'Score Corp', 'Course 1', 'Course 2', 'Web page', 'First Name',
    'Last Name', 'Contact Name', 'Score', 'Title', 'Email',
    'LinkedIn Profile', 'Phone', 'Industry', 'Industry_T', 'Company ID', 'website',
    'Name_x', 'NeverBounce', 'domain', 'WeekNum', 'Head Industry',
    'LastProspDate', 'LastProspEmailDate', 'LastHubSpotDate', 'LastMailShakeDate',
]
# Strip the merge suffixes and expose the translated industry as 'Sector'.
_renamed_columns = {
    'Date_x': 'Date',
    'Company Name_x': 'Company Name',
    'LGA_x': 'LGA',
    'Name_x': 'Name',
    'Industry_T': 'Sector',
}
weekly_outreach_fil = ms_merge1.loc[:, _kept_columns].rename(columns=_renamed_columns)
weekly_outreach_fil.columns

Index(['Date', 'Company Name', 'ID', 'S-ID', 'LGA', 'Country', 'City',
       'Score Corp', 'Course 1', 'Course 2', 'Web page', 'First Name',
       'Last Name', 'Contact Name', 'Score', 'Title', 'Email',
       'LinkedIn Profile', 'Phone', 'Industry', 'Sector', 'Company ID',
       'website', 'Name', 'NeverBounce', 'domain', 'WeekNum', 'Head Industry',
       'LastProspDate', 'LastProspEmailDate', 'LastHubSpotDate',
       'LastMailShakeDate'],
      dtype='object')

### Checking for Email Duplicates and Dropping

Function to check on Duplicates, show them and remove them if needed

In [None]:
# Number of rows per (LGA, Email) pair, stored in the 'Company Name' column.
# NOTE(review): duplicates are counted per LGA+Email, while the drop further down
# is by Email alone - an email repeated under two different LGAs is dropped
# without being reported here. Confirm this is intended.
check_email_dup = weekly_outreach_fil.groupby(['LGA','Email'])[['Company Name']].count()
check_email_dup['Company Name'] = pd.to_numeric(check_email_dup['Company Name'])
def checkdup(check_email_dup):
  """Print whether the count table contains any count above 1.

  Args:
    check_email_dup: DataFrame whose 'Company Name' column holds row counts.

  Returns:
    None. Prints "No Duplicates", or a warning followed by the rows whose
    count is greater than 1.
  """
  repeated = check_email_dup['Company Name'] > 1
  if not repeated.any():
    return print("No Duplicates")
  return print(" ## HAS DUPLICATES THAT WILL BE ELIMINATED ## ", check_email_dup[repeated])
# Report duplicates, then keep only the first row of every email.
checkdup(check_email_dup)
weekly_outreach = weekly_outreach_fil.drop_duplicates('Email') #outreach file without duplicates

No Duplicates


### Applying Cluster Structure

Dropping Industry & Head Industry columns as keys

In [None]:
# Industries table without its 'Industry' / 'Head Industry' columns; the remaining
# columns are the per-region LGA / BDR / KAM / CLUSTER assignments by head industry.
df_industries_bdr = df_industries.drop(['Industry','Head Industry'],axis=1)
df_industries_bdr.head(3)

Unnamed: 0,#,Unnamed: 2,HEAD INDUSTRIES,LATAM LGA,LATAM BDR,LATAM KAM,LATAM CLUSTER,BRASIL LGA,BRASIL BDR,BRASIL KAM,BRASIL CLUSTER,MX LGA,MX BDR,MX KAM,MX CLUSTER,INBOUND
0,1,,Automotive,Daniela,David,Luisa,Latam 1A,,Debora,Gabriela,BR 1,Arturo,Monica,Jimena,MX 1,Inbound
1,2,,Construction and Engineering,Daniela,David,Luisa,Latam 1A,,Debora,Gabriela,BR 1,Paola,Roberto,Carlos,MX 1A,Inbound
2,3,,Human Resources,Daniela,David,Luisa,Latam 1A,Renata,Lara,Gabriela,BR 2,Arturo,Monica,Jimena,MX 1,Inbound


Dictionaries based on the [Industries Table](https://docs.google.com/spreadsheets/d/1ASfhwu4pKwET5bz8coI59Bzpn05OuGoZBmWr12nQsyc/edit#gid=0) to map Clusters according to Head Industries

In [None]:
# Head industry -> cluster name, one lookup table per region (MX, Brasil, LATAM).
_head_industries = df_industries_bdr['HEAD INDUSTRIES']
c_mx_dict, c_br_dict, c_latam_dict = (
    dict(zip(_head_industries, df_industries[_cluster_column]))
    for _cluster_column in ('MX CLUSTER', 'BRASIL CLUSTER', 'LATAM CLUSTER')
)

Function that uses the previous dictionaries to apply .maps based on the Head Industry from prospection and creates a column with results

In [None]:
def cluster_names(weekly_outreach):
  """Return the sales cluster for one outreach row.

  Intended for `DataFrame.apply(cluster_names, axis=1)`.

  Args:
    weekly_outreach: a single row (Series or mapping) with 'Country' and
      'Head Industry' entries.

  Returns:
    The cluster mapped to the row's head industry: from `c_br_dict` for
    'Brasil'/'Brazil', from `c_mx_dict` for 'Mexico'/'México', and from
    `c_latam_dict` for every other country. NaN when the head industry is not
    in the table. A scalar is returned (not a one-element Series) so that the
    row-wise apply yields a plain Series.
  """
  country = weekly_outreach['Country']
  if country in ('Brasil', 'Brazil'):
    clusters = c_br_dict
  elif country in ('Mexico', 'México'):
    clusters = c_mx_dict
  else:
    clusters = c_latam_dict
  return clusters.get(weekly_outreach['Head Industry'], float('nan'))

# Apply cluster_names to every row and store the result in a new 'Cluster' column.
df_outreach_cluster = weekly_outreach.assign(Cluster=weekly_outreach.apply(cluster_names,axis=1))
df_outreach_cluster.head(3)

Unnamed: 0,Date,Company Name,ID,S-ID,LGA,Country,City,Score Corp,Course 1,Course 2,Web page,First Name,Last Name,Contact Name,Score,Title,Email,LinkedIn Profile,Phone,Industry,Sector,Company ID,website,Name,NeverBounce,domain,WeekNum,Head Industry,LastProspDate,LastProspEmailDate,LastHubSpotDate,LastMailShakeDate,Cluster
0,2022-01-11,Naturmega,,,Isabella,Colombia,Barranquilla,,Recruiting,Medical,,,,,2,Analista RH,mdiaz@naturmega.com.co,https://www.linkedin.com/in/maria-camila-diaz-...,(57) 5 371-9773,Alternative Medicine,de medicina alternativa,,https://naturmega.com/home,Maria Camila Diaz,valid,naturmega.com.co,2022-Jan-w02,Healthcare & Pharmaceuticals,,,,,Latam 1
1,2022-01-11,Naturmega,,,Isabella,Colombia,Barranquilla,,Sales,Medical,,,,,3,Directora administrativa,pnavarro@naturmega.com.co,https://www.linkedin.com/in/paola-navarro-rome...,(57) 5 371-9773,Alternative Medicine,de medicina alternativa,,https://naturmega.com/home,Paola Navarro,unknown,naturmega.com.co,2022-Jan-w02,Healthcare & Pharmaceuticals,,,147.0,1204.0,Latam 1
2,2022-01-11,Naturmega,,,Isabella,Colombia,Barranquilla,,Sales,Medical,,,,,4,Gerente de I&D y Nuevos Negocios,wmartinez@naturmega.com.co,https://www.linkedin.com/in/wilson-martinez-94...,(57) 5 371-9773,Alternative Medicine,de medicina alternativa,,https://naturmega.com/home,Wilson Martinez,valid,naturmega.com.co,2022-Jan-w02,Healthcare & Pharmaceuticals,,,,,Latam 1


Now Dictionaries based on the same table but returning BDRs' Names

In [None]:
# Head industry -> BDR name, one lookup table per region (MX, Brasil, LATAM).
_bdr_head_industries = df_industries_bdr['HEAD INDUSTRIES']
c_mx_dict_bdr, c_br_dict_bdr, c_latam_dict_bdr = (
    dict(zip(_bdr_head_industries, df_industries[_bdr_column]))
    for _bdr_column in ('MX BDR', 'BRASIL BDR', 'LATAM BDR')
)

In [None]:
def bdrs_names(df_outreach_cluster):
  """Return the BDR assigned to one outreach row.

  Intended for `DataFrame.apply(bdrs_names, axis=1)`.

  Args:
    df_outreach_cluster: a single row (Series or mapping) with 'Country' and
      'Head Industry' entries.

  Returns:
    The BDR mapped to the row's head industry: from `c_br_dict_bdr` for
    'Brasil'/'Brazil', from `c_mx_dict_bdr` for 'Mexico'/'México', and from
    `c_latam_dict_bdr` for every other country. NaN when the head industry is
    not in the table. A scalar is returned (not a one-element Series) so that
    the row-wise apply yields a plain Series.
  """
  country = df_outreach_cluster['Country']
  if country in ('Brasil', 'Brazil'):
    bdrs = c_br_dict_bdr
  elif country in ('Mexico', 'México'):
    bdrs = c_mx_dict_bdr
  else:
    bdrs = c_latam_dict_bdr
  return bdrs.get(df_outreach_cluster['Head Industry'], float('nan'))

# Apply bdrs_names to every row and store the result in a new 'BDR' column.
df_outreach = df_outreach_cluster.assign(BDR=df_outreach_cluster.apply(bdrs_names,axis=1))
df_outreach.tail(3)

Unnamed: 0,Date,Company Name,ID,S-ID,LGA,Country,City,Score Corp,Course 1,Course 2,Web page,First Name,Last Name,Contact Name,Score,Title,Email,LinkedIn Profile,Phone,Industry,Sector,Company ID,website,Name,NeverBounce,domain,WeekNum,Head Industry,LastProspDate,LastProspEmailDate,LastHubSpotDate,LastMailShakeDate,Cluster,BDR
1933,2022-01-14,Novatec Solutions,,,Ericka,Colombia,Bogotá,,Technology,Software Development,,,,,4,Especialista en CRM,john.laverde@novatec.com.co,https://www.linkedin.com/in/johnlaverde,(57 1) 668 0339,Information Technology and Services,de servicios y tecnologias de la informacion,,https://www.novatecsolutions.co/,John jairo laverde gonzalez,valid,novatec.com.co,2022-Jan-w02,Technology & Design,,,,,Latam 2,Melanie
1934,2022-01-14,Fiduagraria sa,,,Ericka,Colombia,Bogotá,,Corporate Finance,Economics,,,,,1,Jefe de Gestión Humana,mlasprilla@fiduagraria.gov.co,https://www.linkedin.com/in/melissa-lasprilla,(601) 560 9886,Financial Services,financiero,,https://www.fiduagraria.gov.co/,Melissa Lasprilla,valid,fiduagraria.gov.co,2022-Jan-w02,Financial Services,1085.0,1085.0,223.0,1016.0,Latam 2,Melanie
1935,2022-01-14,Fiduagraria sa,,,Ericka,Colombia,Bogotá,,Corporate Finance,Economics,,,,,2,Analista de Desarrollo y Formación,mamorales@fiduagraria.gov.co,https://www.linkedin.com/in/mariana-morales-ro...,(601) 560 9886,Financial Services,financiero,,https://www.fiduagraria.gov.co/,Mariana Morales Rojas,unknown,fiduagraria.gov.co,2022-Jan-w02,Financial Services,1085.0,,499.0,,Latam 2,Melanie


### Transforming & Selecting Output

Transforming the dataset columns

Outreach Table must have the following columns in order:


```
BDR	Company Name	Owner	Country	Course 1	Course 2	Industry	Sector  Pagina Web Full Name	First Name	Score	Title	
Email	Linkedin	Telephone	Neverbounce	Head Industry
```


In [None]:
# Columns currently available, for comparison with the required output layout above.
df_outreach.columns

Index(['Date', 'Company Name', 'ID', 'S-ID', 'LGA', 'Country', 'City',
       'Score Corp', 'Course 1', 'Course 2', 'Web page', 'First Name',
       'Last Name', 'Contact Name', 'Score', 'Title', 'Email',
       'LinkedIn Profile', 'Phone', 'Industry', 'Sector', 'Company ID',
       'website', 'Name', 'NeverBounce', 'domain', 'WeekNum', 'Head Industry',
       'LastProspDate', 'LastProspEmailDate', 'LastHubSpotDate',
       'LastMailShakeDate', 'Cluster', 'BDR'],
      dtype='object')

In [None]:
# Prepare the export frame without mutating df_outreach in place:
#  1. fill missing values with 'NA',
#  2. turn 'Date' into text,
#  3. derive 'First Name' as the first whitespace-separated token of 'Name',
#  4. rename the columns that have a different name in the output layout.
# Steps run in the same order as before, so 'First Name' is still derived after the fill.
df_outreach = (
    df_outreach
    .fillna('NA')
    .assign(**{
        'Date': lambda frame: frame['Date'].astype(str),
        'First Name': lambda frame: frame['Name'].str.split().str[0],
    })
    .rename(columns={'LGA': 'Owner', 'LinkedIn Profile': 'LinkedIn'})
)
output_columns = ['Cluster','BDR','Company Name','Owner','Country','Course 1','Course 2','Industry','Sector','Head Industry','website','Name','First Name','Score','Title','Email','LinkedIn','Phone','NeverBounce','LastProspDate','LastProspEmailDate','LastHubSpotDate','LastMailShakeDate']
# .copy() makes the selection an independent frame, so later cells can modify it
# without SettingWithCopyWarning.
outreach_output_df = df_outreach[output_columns].copy()
outreach_output_df.head(3)

### Building Additional Metrics Sheet

In [None]:
# Distinct companies per owner, distinct emails per owner, distinct companies per cluster.
w_leads_lga = outreach_output_df.groupby('Owner')[['Company Name']].nunique()
w_contacts_lga = outreach_output_df.groupby('Owner')[['Email']].nunique()
w_leads_cluster = outreach_output_df.groupby('Cluster')[['Company Name']].nunique()
# Copy each table's group labels from the index into a regular column, so they are
# part of the values written to the sheet.
for _metric_frame, _label_column in (
    (w_leads_lga, 'LGA'),
    (w_contacts_lga, 'LGA'),
    (w_leads_cluster, 'Cluster'),
):
  _metric_frame[_label_column] = _metric_frame.index

## Creating Outreach List

In [None]:
# Distinct cluster names, in order of first appearance.
ClusterList = outreach_output_df['Cluster'].unique().tolist()
ClusterList

['Latam 1',
 'Latam 1A',
 'Latam 2',
 'BR 1',
 'BR 2',
 'MX 1A',
 'Latam 3',
 'MX 1',
 'Latam 1B']

Creating file and adding metrics sheet

In [None]:
# Date of the most recent Tuesday (today itself when run on a Tuesday), formatted
# as MM-DD-YY for the spreadsheet title.
today2 = date.today()
offset2 = (today2.weekday() -1) % 7  # days elapsed since Tuesday (weekday() == 1)
last_tuesday2 = today2 - timedelta(days=offset2)
out_tuesday = last_tuesday2.strftime("%m-%d-%y")
out_tuesday

'01-18-22'

In [None]:
# Create the spreadsheet inside the target Drive folder.
ss_create = gc.create("Outreach List "+ out_tuesday, folder_id="1sQDyWBzuwKeIxAzn10xmWXEi2yKo6WfB")
ss_1 = ss_create.sheet1
# Each metrics table as an array of arrays: header row followed by the data rows.
aoa1 = [w_leads_lga.columns.tolist()] + w_leads_lga.to_numpy().tolist()
aoa2 = [w_contacts_lga.columns.tolist()] + w_contacts_lga.to_numpy().tolist()
aoa3 = [w_leads_cluster.columns.tolist()] + w_leads_cluster.to_numpy().tolist()
# Keyword arguments keep these calls valid regardless of the positional order of
# `range_name` / `values`, which differs between gspread major versions.
ss_1.update(range_name="A1", values=aoa1)
ss_1.update(range_name="D1", values=aoa2)
ss_1.update(range_name="F1", values=aoa3)
ss_1.update_title("General Info")

{'replies': [{}],
 'spreadsheetId': '1ILOr2BTKfIXrINjJ-OKwFTwkoR3lAXvNUDQZiMqqz0Y'}

Function that filters the outreach table by Cluster and writes each cluster's rows to its own worksheet

In [None]:
# Replace remaining missing values with empty strings (preventing API error).
# Rebinding to the filled frame avoids modifying a possible slice in place, which
# is what raised SettingWithCopyWarning.
outreach_output_df = outreach_output_df.fillna('')
def createSpreadsheet(Cluster):
  """Write the outreach rows of one cluster to a new worksheet named after it.

  Reads the module-level `outreach_output_df` and adds the worksheet to the
  module-level spreadsheet `ss_create`.

  Args:
    Cluster: value of the 'Cluster' column to export; also the worksheet title.

  Returns:
    None.
  """
  ndf = outreach_output_df[outreach_output_df['Cluster'] == Cluster]
  # Header row followed by the data rows.
  nlist = [ndf.columns.tolist()] + ndf.to_numpy().tolist()
  # At least 150 x 30 as before, but grown to fit the data when it is larger.
  nws = ss_create.add_worksheet(
      title=Cluster,
      rows=max(150, len(nlist)),
      cols=max(30, ndf.shape[1]),
  )
  # The worksheet is already created with its title, so no separate rename call.
  # Keyword arguments keep the call valid regardless of the positional order of
  # `range_name` / `values`, which differs between gspread major versions.
  nws.update(range_name="A1", values=nlist)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


Update command using the function

In [None]:
# One worksheet per cluster found in the output table.
for Cluster in ClusterList:
  createSpreadsheet(Cluster)

The output is a spreadsheet with a 'General Info' sheet containing calculations such as Leads by Salesperson, Contacts by Salesperson and Leads by Cluster (focused sales operation). 

Along with a separate sheet with cleaned and processed information for  Outreach strategy in form of E-mail Marketing.