In [None]:
# Colab setup: upgrade gspread and install fuzzy_pandas before they are imported.
# NOTE(review): neither package is version-pinned, so behavior may change between runs — consider pinning.
!pip install -q --upgrade gspread
!pip install -q fuzzy_pandas

In [None]:
# Authenticate this Colab session with the user's Google account.
from google.colab import auth
auth.authenticate_user()

# gspread talks to Google Sheets; fuzzy_pandas provides the fuzzy merge used on the company names.
import gspread
from oauth2client.client import GoogleCredentials
import pandas as pd
import fuzzy_pandas as fpd
import re

In [None]:
# Authorize gspread with the application-default credentials of the session.
# NOTE(review): oauth2client is deprecated and recent gspread releases expect
# google-auth credentials — confirm this call still works with the gspread
# version that gets installed.
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Open the workbook by its spreadsheet key (presumably 'KEY' is a placeholder
# for the real key — verify before running).
ss = gc.open_by_key('KEY')

HubSpot data from Jan 2021 onward, filtered by Last Activity Date

In [None]:
# Read every cell of the 'py_hub' tab; row 0 holds the headers, the rest are records.
hubspot_data = ss.worksheet('py_hub')
hubsrows = hubspot_data.get_all_values()
hub_records = hubsrows[1:]
hub_header = hubsrows[0]
df_hubs = pd.DataFrame.from_records(hub_records, columns=hub_header)
display(df_hubs.columns)

Index(['Deal ID', 'Deal Name HB', 'Company Name', 'Create Date', 'Country',
       'Industry', 'LGA HB', 'BDR HB', 'Deal Type', 'Sales Strategy', 'SQL',
       'SQL Date', 'SQL Value', 'Last Activity Date', 'BDR First Name'],
      dtype='object')

KPI SQLs Data from Jan 2021 on, filtered by meeting date (sql date)

In [None]:
# Read every cell of the 'py_kpis' tab; row 0 holds the headers, the rest are records.
kpis_data = ss.worksheet('py_kpis')
kpisrows = kpis_data.get_all_values()
kpi_records = kpisrows[1:]
kpi_header = kpisrows[0]
df_kpis = pd.DataFrame.from_records(kpi_records, columns=kpi_header)
display(df_kpis.columns)

Index(['#', 'Deal Name KPI', 'Names', 'Industry', 'LGA KPI', 'Proactive?',
       'Call Date', 'Call Time', 'BDR KPI', 'Meeting Day', 'Category', 'Owner',
       'Time Meeting', 'CE or CC', 'Market ', 'Campaign', 'List', 'List Owner',
       'Campaign Date', 'Deal Value', 'SQL', 'Opp. Date', 'Opportunity',
       'SQL Date'],
      dtype='object')

Function with string clean-up processes to make it easier to match columns later

In [None]:
def cleanCompanies(series):
  """Normalize company names so equivalent spellings compare equal.

  Steps, in order: lowercase; replace a punctuation or whitespace character
  that is followed by whitespace with a single space; drop filler tokens
  (legal suffixes and two-letter market prefixes); collapse runs of
  whitespace; strip both ends.

  Args:
    series: pandas Series of company-name strings.

  Returns:
    A pandas Series of the same length holding the normalized names.
  """
  # Tokens that carry no identity. The dots are escaped so "a.c" cannot match
  # unrelated words such as "arch", and an optional trailing dot ("inc.",
  # "a.c.") is consumed together with the token.
  filler = r"\b(?:company|corp|inc|a\.c|s\.c|mx|uy|br|co)\b\.?"
  return (
      series.str.lower()
      # \W rather than [^a-z]: accented letters and digits before a space survive.
      # regex=True is explicit because pandas >= 2.0 defaults to literal matching.
      .str.replace(r"\W\s", " ", regex=True)
      .str.replace(filler, "", regex=True)
      # Removing a token in the middle of a name leaves a double space behind.
      .str.replace(r"\s+", " ", regex=True)
      .str.strip()
  )

Add the cleaned company-name columns to both DataFrames

In [None]:
# Cleaned KPI company names, stored next to the raw "Names" column.
kpi_raw_names = df_kpis["Names"]
df_kpis["cnCleanKPIS"] = cleanCompanies(kpi_raw_names)

In [None]:
# Cleaned HubSpot company names, stored next to the raw "Company Name" column.
hub_raw_names = df_hubs["Company Name"]
df_hubs["cnCleanHB"] = cleanCompanies(hub_raw_names)

In [None]:
# Fuzzy-join HubSpot deals to KPI rows on the cleaned company names.
# `keep` is deliberately not passed so the full merged table is shown
# (the original note suggests keep='match' as the narrower alternative).
fuzzy_options = dict(
    left_on=['cnCleanHB'],
    right_on=['cnCleanKPIS'],
    ignore_case=True,
    method="levenshtein",
    threshold=0.9,
)
matches = fpd.fuzzy_merge(df_hubs, df_kpis, **fuzzy_options)
display(matches)

levenshtein:

1. Result before cleaning -> 1776 matches, but a lot of them were wrong (threshold 0.7)
2. Result after 1st cleaning -> 1553 matches, with apparently fewer wrong ones (threshold 0.7)
3. Increasing the threshold to 0.8 -> 1108 matches, and the errors from the previous two runs were fixed
4. The winner was a threshold of 0.9

jaro:
1. 12634 matches, very poor quality (0.7 threshold)
2. Increased the threshold to 0.8 and it returned 1785 matches, which was better, though with some duplicates.


Taking a Look at the results

In [None]:
# Side-by-side view of raw vs. cleaned names from both sources for each matched pair.
name_columns = [
    "Deal Name HB", "Company Name", "cnCleanHB",
    "Deal Name KPI", "Names", "cnCleanKPIS",
]
matches[name_columns]

Unnamed: 0,Deal Name HB,Company Name,cnCleanHB,Deal Name KPI,Names,cnCleanKPIS
0,EM: MX: G-Global,G-Global,g-global,EM: MX: G-Global,MX: G-Global,g-global
1,WS: CO: Quality Tech Institute,Quality Tech Institute,quality tech institute,WS: CO: Quality Tech Institute,CO: Quality Tech Institute,quality tech institute
2,BR: Remessa Online,Remessa Online,remessa online,BR: Remessa Online,Remessa Online,remessa online
3,BR: Aperam,Aperam,aperam,BR: Aperam,Aperam,aperam
4,MX: Consejo Potosino de Ciencia y Tecnología,Consejo Potosino de Ciencia y Tecnología,consejo potosino de ciencia y tecnología,MX: Consejo Potosino de Ciencia y Tecnología,Consejo Potosino de Ciencia y Tecnología,consejo potosino de ciencia y tecnología
...,...,...,...,...,...,...
982,PE: Niubiz,Niubiz,niubiz,PE: Niubiz,Niubiz,niubiz
983,CL: I2B Technologies,I2B Technologies,i2b technologies,CL: I2B Technologies,I2B Technologies,i2b technologies
984,CL: I2B Technologies,I2B Technologies,i2b technologies,CL: I2B Technologies,I2B Technologies,i2b technologies
985,CO: Coltanques,Coltanques,coltanques,CO: Coltanques,Coltanques,coltanques


Cleaning Countries to a standard


In [None]:
# Lookup table from two-letter country code to full country name
# (dictionary shared online by a kind soul <3).
# Used to expand the KPI "Market " codes into country names.
short2long_Countries = {"AF":"Afghanistan",
"AX":"Aland Islands",
"AL":"Albania",
"DZ":"Algeria",
"AS":"American Samoa",
"AD":"Andorra",
"AO":"Angola",
"AI":"Anguilla",
"AQ":"Antarctica",
"AG":"Antigua and Barbuda",
"AR":"Argentina",
"AM":"Armenia",
"AW":"Aruba",
"AU":"Australia",
"AT":"Austria",
"AZ":"Azerbaijan",
"BS":"Bahamas",
"BH":"Bahrain",
"BD":"Bangladesh",
"BB":"Barbados",
"BY":"Belarus",
"BE":"Belgium",
"BZ":"Belize",
"BJ":"Benin",
"BM":"Bermuda",
"BT":"Bhutan",
"BO":"Bolivia, Plurinational State of",
"BQ":"Bonaire, Sint Eustatius and Saba",
"BA":"Bosnia and Herzegovina",
"BW":"Botswana",
"BV":"Bouvet Island",
"BR":"Brazil",
"IO":"British Indian Ocean Territory",
"BN":"Brunei Darussalam",
"BG":"Bulgaria",
"BF":"Burkina Faso",
"BI":"Burundi",
"KH":"Cambodia",
"CM":"Cameroon",
"CA":"Canada",
"CV":"Cape Verde",
"KY":"Cayman Islands",
"CF":"Central African Republic",
"TD":"Chad",
"CL":"Chile",
"CN":"China",
"CX":"Christmas Island",
"CC":"Cocos (Keeling) Islands",
"CO":"Colombia",
"KM":"Comoros",
"CG":"Congo",
"CD":"Congo, The Democratic Republic of the",
"CK":"Cook Islands",
"CR":"Costa Rica",
"CI":"Côte d'Ivoire",
"HR":"Croatia",
"CU":"Cuba",
"CW":"Curaçao",
"CY":"Cyprus",
"CZ":"Czech Republic",
"DK":"Denmark",
"DJ":"Djibouti",
"DM":"Dominica",
"DO":"Dominican Republic",
"EC":"Ecuador",
"EG":"Egypt",
"SV":"El Salvador",
"GQ":"Equatorial Guinea",
"ER":"Eritrea",
"EE":"Estonia",
"ET":"Ethiopia",
"FK":"Falkland Islands (Malvinas)",
"FO":"Faroe Islands",
"FJ":"Fiji",
"FI":"Finland",
"FR":"France",
"GF":"French Guiana",
"PF":"French Polynesia",
"TF":"French Southern Territories",
"GA":"Gabon",
"GM":"Gambia",
"GE":"Georgia",
"DE":"Germany",
"GH":"Ghana",
"GI":"Gibraltar",
"GR":"Greece",
"GL":"Greenland",
"GD":"Grenada",
"GP":"Guadeloupe",
"GU":"Guam",
"GT":"Guatemala",
"GG":"Guernsey",
"GN":"Guinea",
"GW":"Guinea-Bissau",
"GY":"Guyana",
"HT":"Haiti",
"HM":"Heard Island and McDonald Islands",
"VA":"Holy See (Vatican City State)",
"HN":"Honduras",
"HK":"Hong Kong",
"HU":"Hungary",
"IS":"Iceland",
"IN":"India",
"ID":"Indonesia",
"IR":"Iran, Islamic Republic of",
"IQ":"Iraq",
"IE":"Ireland",
"IM":"Isle of Man",
"IL":"Israel",
"IT":"Italy",
"JM":"Jamaica",
"JP":"Japan",
"JE":"Jersey",
"JO":"Jordan",
"KZ":"Kazakhstan",
"KE":"Kenya",
"KI":"Kiribati",
"KP":"Korea, Democratic People's Republic of",
"KR":"Korea, Republic of",
"KW":"Kuwait",
"KG":"Kyrgyzstan",
"LA":"Lao People's Democratic Republic",
"LV":"Latvia",
"LB":"Lebanon",
"LS":"Lesotho",
"LR":"Liberia",
"LY":"Libya",
"LI":"Liechtenstein",
"LT":"Lithuania",
"LU":"Luxembourg",
"MO":"Macao",
"MK":"Macedonia, Republic of",
"MG":"Madagascar",
"MW":"Malawi",
"MY":"Malaysia",
"MV":"Maldives",
"ML":"Mali",
"MT":"Malta",
"MH":"Marshall Islands",
"MQ":"Martinique",
"MR":"Mauritania",
"MU":"Mauritius",
"YT":"Mayotte",
"MX":"Mexico",
"FM":"Micronesia, Federated States of",
"MD":"Moldova, Republic of",
"MC":"Monaco",
"MN":"Mongolia",
"ME":"Montenegro",
"MS":"Montserrat",
"MA":"Morocco",
"MZ":"Mozambique",
"MM":"Myanmar",
"NA":"Namibia",
"NR":"Nauru",
"NP":"Nepal",
"NL":"Netherlands",
"NC":"New Caledonia",
"NZ":"New Zealand",
"NI":"Nicaragua",
"NE":"Niger",
"NG":"Nigeria",
"NU":"Niue",
"NF":"Norfolk Island",
"MP":"Northern Mariana Islands",
"NO":"Norway",
"OM":"Oman",
"PK":"Pakistan",
"PW":"Palau",
"PS":"Palestinian Territory, Occupied",
"PA":"Panama",
"PG":"Papua New Guinea",
"PY":"Paraguay",
"PE":"Peru",
"PH":"Philippines",
"PN":"Pitcairn",
"PL":"Poland",
"PT":"Portugal",
"PR":"Puerto Rico",
"QA":"Qatar",
"RE":"Réunion",
"RO":"Romania",
"RU":"Russian Federation",
"RW":"Rwanda",
"BL":"Saint Barthélemy",
"SH":"Saint Helena, Ascension and Tristan da Cunha",
"KN":"Saint Kitts and Nevis",
"LC":"Saint Lucia",
"MF":"Saint Martin (French part)",
"PM":"Saint Pierre and Miquelon",
"VC":"Saint Vincent and the Grenadines",
"WS":"Samoa",
"SM":"San Marino",
"ST":"Sao Tome and Principe",
"SA":"Saudi Arabia",
"SN":"Senegal",
"RS":"Serbia",
"SC":"Seychelles",
"SL":"Sierra Leone",
"SG":"Singapore",
"SX":"Sint Maarten (Dutch part)",
"SK":"Slovakia",
"SI":"Slovenia",
"SB":"Solomon Islands",
"SO":"Somalia",
"ZA":"South Africa",
"GS":"South Georgia and the South Sandwich Islands",
"ES":"Spain",
"LK":"Sri Lanka",
"SD":"Sudan",
"SR":"Suriname",
"SS":"South Sudan",
"SJ":"Svalbard and Jan Mayen",
"SZ":"Swaziland",
"SE":"Sweden",
"CH":"Switzerland",
"SY":"Syrian Arab Republic",
"TW":"Taiwan, Province of China",
"TJ":"Tajikistan",
"TZ":"Tanzania, United Republic of",
"TH":"Thailand",
"TL":"Timor-Leste",
"TG":"Togo",
"TK":"Tokelau",
"TO":"Tonga",
"TT":"Trinidad and Tobago",
"TN":"Tunisia",
"TR":"Turkey",
"TM":"Turkmenistan",
"TC":"Turks and Caicos Islands",
"TV":"Tuvalu",
"UG":"Uganda",
"UA":"Ukraine",
"AE":"United Arab Emirates",
"GB":"United Kingdom",
"US":"United States",
"UM":"United States Minor Outlying Islands",
"UY":"Uruguay",
"UZ":"Uzbekistan",
"VU":"Vanuatu",
"VE":"Venezuela, Bolivarian Republic of",
"VN":"Viet Nam",
"VG":"Virgin Islands, British",
"VI":"Virgin Islands, U.S.",
"WF":"Wallis and Futuna",
"EH":"Western Sahara",
"YE":"Yemen",
"ZM":"Zambia",
"ZW":"Zimbabwe"}

In [None]:
# Expand the two-letter market code into a full country name.
# Codes are trimmed and upper-cased first so stray spaces or lower-case
# entries still resolve; codes missing from the lookup become NaN.
market_codes = matches["Market "].str.strip().str.upper()
matches["Countries"] = market_codes.map(short2long_Countries)
matches.head()

In [None]:
# List the merged column labels. Labels that exist in both sources
# ('Industry', 'SQL', 'SQL Date') show up twice: HubSpot copy first, KPI copy second.
matches.columns

Index(['Deal ID', 'Deal Name HB', 'Company Name', 'Create Date', 'Country',
       'Industry', 'LGA HB', 'BDR HB', 'Deal Type', 'Sales Strategy', 'SQL',
       'SQL Date', 'SQL Value', 'Last Activity Date', 'BDR First Name',
       'cnCleanHB', '#', 'Deal Name KPI', 'Names', 'Industry', 'LGA KPI',
       'Proactive?', 'Call Date', 'Call Time', 'BDR KPI', 'Meeting Day',
       'Category', 'Owner', 'Time Meeting', 'CE or CC', 'Market ', 'Campaign',
       'List', 'List Owner', 'Campaign Date', 'Deal Value', 'SQL', 'Opp. Date',
       'Opportunity', 'SQL Date', 'cnCleanKPIS', 'Countries'],
      dtype='object')

Cleaning BDRs


In [None]:
# Distinct BDR names as they are spelled in HubSpot
hubspot_bdr_names = df_hubs['BDR HB'].unique()
print(hubspot_bdr_names)

['Pablo Gomez Leyva' 'Juliana Padilla Torres (Deactivated User)'
 'Débora Boschini' 'David Santiago Garcia Rojas' 'Luisa Ramírez'
 'Luis Papagayo (Deactivated User)' 'Jaime Andrés Rincón Montero' ''
 'John Guzmán' 'María Paula Cruz' 'Natalia De Vivero'
 'Laura Daniela Arias Ramirez (Deactivated User)'
 'Juan Felipe Van Strahlen Olave' 'Jimena Martínez'
 'Ignacio Gomez Rubiano' 'Juan Sebastian Rebolledo'
 'Marion Isabela Aviña Orendain' 'Gabriela Iglesias' 'Lara Almeida'
 'Santiago Rodriguez' 'Carlos Yáñez' 'Manuela Alonso (Deactivated User)'
 'Aris Povoa (Deactivated User)' 'Sergio Nieto' 'Federico Suárez Namén'
 'Cristian Camilo Garzon Amortegui' 'Mónica Rivera']


In [None]:
# Distinct BDR names as they are spelled in the KPI sheet
kpi_bdr_names = df_kpis['BDR KPI'].unique()
print(kpi_bdr_names)

['Debora' 'Pablo' 'Jaime' 'Luisa' 'Padilla' 'David' 'Renato' 'Own'
 'Daniela' 'Maria Paula' 'Natalia' 'Juan Felipe' 'Marion' 'Lara'
 'Referral' 'Aris' 'Cristian' 'OWN' 'Federico' 'Monica' 'Paola' 'Jaime ']


In [None]:
# Maps the short BDR label used in the KPI sheet to the full owner name used in HubSpot.
# "Jaime " (trailing space) and "OWN" are listed separately because the KPI data
# contains those spellings verbatim.
# NOTE(review): values ending in "CHECK" appear to be placeholders for labels with
# no HubSpot counterpart — confirm how they should be resolved.
BDR_Dic = {"Debora":"Débora Boschini",
           "Pablo":"Pablo Gomez Leyva",
           "Jaime":"Jaime Andrés Rincón Montero",
           "Jaime ":"Jaime Andrés Rincón Montero",
           "Luisa":"Luisa Ramírez",
           "Padilla":"Juliana Padilla Torres (Deactivated User)",
           "David":"David Santiago Garcia Rojas",
           "Renato":"Renato CHECK",
           "Own":"Own CHECK",
           "Daniela":"Laura Daniela Arias Ramirez (Deactivated User)",
           "Maria Paula":"María Paula Cruz",
           "Natalia":"Natalia De Vivero",
           "Juan Felipe":"Juan Felipe Van Strahlen Olave",
           "Marion":"Marion Isabela Aviña Orendain",
           "Lara":"Lara Almeida",
           "Referral":"Referral CHECK",
           "Aris":"Aris Povoa (Deactivated User)",
           "Cristian":"Cristian Camilo Garzon Amortegui",
           "OWN":"Own CHECK",
           "Federico":"Federico Suárez Namén",
           "Monica":"Mónica Rivera",
           "Paola":"Paola CHECK"
           }

BDR Match Column

In [None]:
# Full BDR name for each KPI label; labels absent from BDR_Dic become NaN.
bdr_full_names = matches['BDR KPI'].map(BDR_Dic)
matches['BDR Cor'] = bdr_full_names
matches.head()

Cleaning LGAs

In [None]:
# Distinct LGA names as they are spelled in HubSpot
hubspot_lga_names = df_hubs['LGA HB'].unique()
print(hubspot_lga_names)

['Inbound' 'Divermedios' 'Camila Acosta' 'Natalia de Vivero'
 'Maria Lucia Pardo' 'Diana Dávila' 'Brenda Merino' 'Alfredo Loredo' 'Own'
 'Paula Jaramillo' 'Diego Trujillo' 'Aline Omote' 'Daniela Ojeda'
 'Valeria Silvera' 'Harbey Morato' 'Paola Adrianofabre'
 'Juan Manuel Jauregui' 'Isabella Rivera' 'Renata Texeira'
 'Laura Restrepo' 'Juliana Padilla' 'Juan Pablo Peñuela' 'Ginna Acuña'
 'Tatiana Shayo' 'Arturo Salazar' 'Melanie Quintero' 'Vinicius Ramos' ''
 'Referral' 'Angela Martinez']


In [None]:
# Distinct LGA names as they are spelled in the KPI sheet
kpi_lga_names = df_kpis['LGA KPI'].unique()
print(kpi_lga_names)

['Diana' 'Inbound' 'Alfredo' 'Camila' 'Natalia' 'Brenda' 'Divermedios'
 'Padilla' 'Diego' 'Own' 'Juan' 'Referral' 'Aline' 'Valeria' 'Daniela'
 'Paula' 'Harbey' 'Angela' 'Paola' 'Maria Lucia' 'Isabella' 'Renata'
 'Laura' 'Juan Pablo' 'Ginna' 'Tatiana' 'Arturo' 'Melanie' 'Vinicius'
 'Juliana' 'OWN' 'Sophie']


In [None]:
# Maps the short LGA label used in the KPI sheet to the full LGA name used in HubSpot.
# "Own" and "OWN" both normalize to "Own"; "Padilla" and "Juliana" both map to "Juliana Padilla".
# NOTE(review): "Sophie" maps to the placeholder "CHECK" — confirm the intended name.
LGA_Dic = {"Diana":"Diana Dávila",
           "Inbound":"Inbound",
           "Alfredo":"Alfredo Loredo",
           "Camila":"Camila Acosta",
           "Natalia":"Natalia de Vivero",
           "Brenda":"Brenda Merino",
           "Divermedios":"Divermedios",
           "Padilla":"Juliana Padilla",
           "Diego":"Diego Trujillo",
           "Own":"Own",
           "OWN":"Own",
           "Juan":"Juan Manuel Jauregui",
           "Referral":"Referral",
           "Aline":"Aline Omote",
           "Valeria":"Valeria Silvera",
           "Daniela":"Daniela Ojeda",
           "Paula":"Paula Jaramillo",
           "Harbey":"Harbey Morato",
           "Angela":"Angela Martinez",
           "Paola":"Paola Adrianofabre",
           "Maria Lucia":"Maria Lucia Pardo",
           "Isabella":"Isabella Rivera",
           "Renata":"Renata Texeira",
           "Laura":"Laura Restrepo",
           "Juan Pablo":"Juan Pablo Peñuela",
           "Ginna":"Ginna Acuña",
           "Tatiana":"Tatiana Shayo",
           "Arturo":"Arturo Salazar",
           "Melanie":"Melanie Quintero",
           "Vinicius":"Vinicius Ramos",
           "Juliana":"Juliana Padilla",
           "Sophie":"CHECK"
           }

LGA Match Column


In [None]:
# Full LGA name for each KPI label; labels absent from LGA_Dic become NaN.
lga_full_names = matches['LGA KPI'].map(LGA_Dic)
matches['LGA Cor'] = lga_full_names
matches.head()

Upload the results to the spreadsheet


In [None]:
# `matches` carries two columns for every label present in both sources
# ('Industry', 'SQL', 'SQL Date'): the HubSpot copy comes first, the KPI copy
# second. Selecting such a label by name returns BOTH copies, so listing
# "Industry" and "SQL Date" twice used to export four columns of each.
# Tag every repeated label with its source before picking the export columns.
# NOTE(review): the exported headers for these columns change to
# "Industry HB"/"Industry KPI" and "SQL Date HB"/"SQL Date KPI" — confirm
# nothing downstream relies on the old duplicated headers.
source_tags = (" HB", " KPI")
repeated_labels = set(matches.columns[matches.columns.duplicated(keep=False)])
occurrences = {}
unique_labels = []
for label in matches.columns:
  if label in repeated_labels:
    position = occurrences.get(label, 0)
    occurrences[label] = position + 1
    # Fall back to a numeric tag if a label ever shows up more than twice.
    tag = source_tags[position] if position < len(source_tags) else f" {position + 1}"
    label = f"{label}{tag}"
  unique_labels.append(label)

# Work on a relabelled copy so `matches` itself is left untouched.
matches_unique = matches.copy()
matches_unique.columns = unique_labels

# .copy() detaches the export frame from `matches_unique`, so later edits to it
# do not raise SettingWithCopyWarning.
dfMatches = matches_unique[["Deal ID","Deal Name HB","cnCleanHB","Create Date","Country","Industry HB","LGA HB","BDR HB",
                            "Sales Strategy","SQL Date HB","SQL Value","Deal Name KPI","cnCleanKPIS","Industry KPI","Proactive?","Market ","BDR KPI","Owner",
                            "SQL Date KPI","Countries","LGA KPI","LGA Cor","BDR Cor"]].copy()

In [None]:
# Handle to the tab that receives the fuzzy-match export.
results_tab_name = "Fuzzy Results"
wsMatches = ss.worksheet(results_tab_name)

In [None]:
# Blank out missing values so they upload as empty cells. Reassigning (instead
# of inplace=True on a frame that was sliced from another one) avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
dfMatches = dfMatches.fillna('')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [None]:
# Array-of-arrays payload for the sheet: header row first, then one list per data row.
header_row = dfMatches.columns.tolist()
data_rows = dfMatches.to_numpy().tolist()
aoa = [header_row] + data_rows

In [None]:
# Write the header and rows starting at cell A1. Keyword arguments are used
# because gspread swapped the positional order of `range_name` and `values`
# in v6, and this notebook installs whatever gspread version is latest.
# NOTE(review): update() presumably leaves rows from a longer previous export
# in place — confirm whether the tab should be cleared first.
wsMatches.update(range_name="A1", values=aoa)