# Load data

In [28]:
import pandas as pd
import numpy as numpy
import os.path

# Folder holding the CSV files this notebook reads and writes. The path is
# relative, so it is resolved against the current working directory.
data_folder = "../data"

In [29]:
# Path of the previously enriched companies CSV inside the data folder.
existing_companies_file = os.path.join(
    data_folder,
    'companies-with-extended-linkedin-data-and-aggregated-findep-industry-and-directors.csv',
)

# index_col=0: the first CSV column becomes the row index rather than a data column.
existing_df = pd.read_csv(
    existing_companies_file,
    index_col=0,
)

# Print the column names, non-null counts and dtypes of the loaded frame.
existing_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 639 entries, 0 to 673
Data columns (total 17 columns):
Associated Company ID      639 non-null object
Associated Company         639 non-null object
Director                   467 non-null object
Hubspot url                639 non-null object
LinkedIn url               603 non-null object
LinkedIn size              580 non-null object
LinkedIn specialties       427 non-null object
LinkedIn Industry          596 non-null object
Industry                   235 non-null object
Aggregated Industry        639 non-null object
Deal Stage                 639 non-null object
LinkedIn website           569 non-null object
LinkedIn favicon           0 non-null float64
LinkedIn temporary logo    533 non-null object
LinkedIn headquarter       551 non-null object
Unnamed: 16                0 non-null float64
LinkedIn site favicon      569 non-null object
dtypes: float64(2), object(15)
memory usage: 89.9+ KB


In [30]:
# NOTE(review): "unrecogniezed" is a misspelling of "unrecognized"; the names
# are kept as-is because other cells may refer to them.
unrecogniezed_companies_file = os.path.join(
    data_folder,
    'unrecognized-companies.csv',
)

# index_col=0: the first CSV column becomes the row index rather than a data column.
unrecogniezed_df = pd.read_csv(
    unrecogniezed_companies_file,
    index_col=0,
)

# Print the column names, non-null counts and dtypes of the loaded frame.
unrecogniezed_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 190 entries, 4 to 765
Data columns (total 5 columns):
Associated Company ID    190 non-null object
Associated Company       190 non-null object
Industry                 62 non-null object
Deal Name                190 non-null object
Deal Stage               190 non-null object
dtypes: object(5)
memory usage: 8.9+ KB


In [31]:
all_companies_file = os.path.join(data_folder, 'companies-from-deals.csv')

# Read the file (first CSV column as index) and rename the ID column in one
# expression, so the frame exposes "Company ID" like the Hubspot export does.
all_df = (
    pd.read_csv(all_companies_file, index_col=0)
    .rename(columns={"Associated Company ID": "Company ID"})
)

# Preview the first rows.
all_df.head()

Unnamed: 0,Company ID,Associated Company,Industry,Deal Name,Deal Stage
0,1005828045,Big Radical Limited,Non Profit,Web Community Project,0
1,1006470667,psygro.co.za,,Trudy Tanner - 8thlight.com contact,0
2,1006849789,Care Coordination Systems,Health Care,CCS Pathways -- Admin Modules and Web Educatio...,0
3,1011074056,KOS Services LLC,,Erin Steinhardt - 8thlight.com contact,0
4,1013701003,Dan Bocik,,Dan Bocik - 8thlight.com contact,0


In [32]:
# Path of the Hubspot company export inside the data folder.
input_companies_file = os.path.join(
    data_folder,
    'Hubspot Company Export 20200527.csv',
)

# No index_col here: pandas assigns a default integer index.
hubspot_df = pd.read_csv(input_companies_file)

# Preview the first rows.
hubspot_df.head()

Unnamed: 0,Company ID,Name,Company owner,Phone Number,City,Country/Region,Industry,LinkedIn Company Page
0,3907984598,"Topexplainers, Inc.",Dennis Moore (Deactivated User),+1 760-563-4014,Newark,United States,,https://www.linkedin.com/company/topexplainers
1,3902130754,"DeveloperTown, LLC",Rich Feller,855-338-8696,Indianapolis,United States,Computer Software,https://www.linkedin.com/company/developertown
2,3899370619,Cove Markets Inc,Paul Pagel,,Chicago,United States,,
3,3895972740,ROVERPASS,Justin Herrick,(512)887-3932,Oljato-Monument Valley,United States,Computer Software,https://www.linkedin.com/company/roverpass
4,3874116280,California Forward,Dennis Moore (Deactivated User),19164910022,Sacramento,United States,Civic & Social Organization,https://www.linkedin.com/company/california-fo...


In [33]:
# Convert every company ID to its string form, element by element.
hubspot_df['Company ID'] = hubspot_df['Company ID'].map(str)

# Show how many times each ID occurs.
hubspot_df['Company ID'].value_counts()

727079872    1
814078932    1
727320922    1
727321222    1
727202646    1
            ..
730472941    1
727320991    1
726952507    1
727202479    1
730426024    1
Name: Company ID, Length: 2512, dtype: int64

In [34]:
# Anti-join: outer-merge both frames on "Company ID", then keep only the rows
# whose indicator column says the key was found in hubspot_df alone.
new_incoming_companies_df = (
    pd.merge(hubspot_df, all_df, on=['Company ID'], how='outer', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
)
new_incoming_companies_df

Unnamed: 0,Company ID,Name,Company owner,Phone Number,City,Country/Region,Industry_x,LinkedIn Company Page,Associated Company,Industry_y,Deal Name,Deal Stage,_merge
0,3907984598,"Topexplainers, Inc.",Dennis Moore (Deactivated User),+1 760-563-4014,Newark,United States,,https://www.linkedin.com/company/topexplainers,,,,,left_only
1,3902130754,"DeveloperTown, LLC",Rich Feller,855-338-8696,Indianapolis,United States,Computer Software,https://www.linkedin.com/company/developertown,,,,,left_only
2,3899370619,Cove Markets Inc,Paul Pagel,,Chicago,United States,,,,,,,left_only
3,3895972740,ROVERPASS,Justin Herrick,(512)887-3932,Oljato-Monument Valley,United States,Computer Software,https://www.linkedin.com/company/roverpass,,,,,left_only
4,3874116280,California Forward,Dennis Moore (Deactivated User),19164910022,Sacramento,United States,Civic & Social Organization,https://www.linkedin.com/company/california-fo...,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2505,598256021,IEEE,Margaret Pagel,+1 732-562-6343,,United States,,https://www.linkedin.com/company/ieee,,,,,left_only
2506,598256019,Norad Limited,Margaret Pagel,18479468286,Gibson City,United States,,,,,,,left_only
2507,597686649,"4C Insights, Inc.",Ryan Verner,312-756-7771,Chicago,United States,Computer Software,https://www.linkedin.com/company/4c-insights,,,,,left_only
2508,730444815,DePaul University,Paul Pagel,3123628000,Chicago,United States,Higher Education,https://www.linkedin.com/company/depaul-univer...,,,,,left_only


In [35]:
# Keep only the companies that have a value in "LinkedIn Company Page".
df = new_incoming_companies_df[
    new_incoming_companies_df["LinkedIn Company Page"].notnull()
]
df

Unnamed: 0,Company ID,Name,Company owner,Phone Number,City,Country/Region,Industry_x,LinkedIn Company Page,Associated Company,Industry_y,Deal Name,Deal Stage,_merge
0,3907984598,"Topexplainers, Inc.",Dennis Moore (Deactivated User),+1 760-563-4014,Newark,United States,,https://www.linkedin.com/company/topexplainers,,,,,left_only
1,3902130754,"DeveloperTown, LLC",Rich Feller,855-338-8696,Indianapolis,United States,Computer Software,https://www.linkedin.com/company/developertown,,,,,left_only
3,3895972740,ROVERPASS,Justin Herrick,(512)887-3932,Oljato-Monument Valley,United States,Computer Software,https://www.linkedin.com/company/roverpass,,,,,left_only
4,3874116280,California Forward,Dennis Moore (Deactivated User),19164910022,Sacramento,United States,Civic & Social Organization,https://www.linkedin.com/company/california-fo...,,,,,left_only
5,3869752703,Belgravia Group,Malcolm Newsome,(312) 751-2777,Chicago,United States,Real Estate,https://www.linkedin.com/company/belgravia-group,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2504,598256022,Autodesk Inc.,Margaret Pagel,+1 415-507-5000,San Rafael,United States,Computer Software,https://www.linkedin.com/company/autodesk,,,,,left_only
2505,598256021,IEEE,Margaret Pagel,+1 732-562-6343,,United States,,https://www.linkedin.com/company/ieee,,,,,left_only
2507,597686649,"4C Insights, Inc.",Ryan Verner,312-756-7771,Chicago,United States,Computer Software,https://www.linkedin.com/company/4c-insights,,,,,left_only
2508,730444815,DePaul University,Paul Pagel,3123628000,Chicago,United States,Higher Education,https://www.linkedin.com/company/depaul-univer...,,,,,left_only


In [36]:
# Give the Hubspot export columns the names used in the saved output file.
df = df.rename(columns={
    "Name": "Company Name",
    "Company owner": "Director",
    "Industry_x": "Industry",
    "LinkedIn Company Page": "LinkedIn link",
})

# Build the Hubspot company URL from the ID. The pattern is a regular
# expression that removes every "<digits>, " run from the ID, so regex=True is
# passed explicitly: the default of Series.str.replace changed to regex=False
# in pandas 2.0, where the pattern would silently be matched as literal text.
df['Hubspot url'] = (
    'https://app.hubspot.com/contacts/4012159/company/'
    + df['Company ID'].str.replace(r'[0-9]+, ', '', regex=True)
)

# Both LinkedIn clean-ups are plain substring substitutions, so they are marked
# regex=False (the "?" in the second one must not act as a regex quantifier).
df['LinkedIn link'] = (
    df['LinkedIn link']
    .str.replace('/company-beta/', '/company/', regex=False)
    .str.replace("/?pathWildcard=15261355", "", regex=False)
)

# Print the column names, non-null counts and dtypes after the clean-up.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 744 entries, 0 to 2511
Data columns (total 14 columns):
Company ID            744 non-null object
Company Name          744 non-null object
Director              714 non-null object
Phone Number          590 non-null object
City                  677 non-null object
Country/Region        712 non-null object
Industry              552 non-null object
LinkedIn link         744 non-null object
Associated Company    0 non-null object
Industry_y            0 non-null object
Deal Name             0 non-null object
Deal Stage            0 non-null object
_merge                744 non-null category
Hubspot url           744 non-null object
dtypes: category(1), object(13)
memory usage: 82.2+ KB


In [37]:
# Destination CSV inside the data folder.
target_file = os.path.join(data_folder, 'hubspot-companies-with-linkedin-link.csv')

# Keep only the columns to export, in the order they appear in the file.
df_to_save = df[[
    'Company ID',
    'Company Name',
    'Director',
    'Hubspot url',
    'Phone Number',
    'City',
    'Country/Region',
    'Industry',
    'LinkedIn link',
]]

# index=True writes the row index as the first CSV column.
df_to_save.to_csv(target_file, index=True)

df_to_save

Unnamed: 0,Company ID,Company Name,Director,Hubspot url,Phone Number,City,Country/Region,Industry,LinkedIn link
0,3907984598,"Topexplainers, Inc.",Dennis Moore (Deactivated User),https://app.hubspot.com/contacts/4012159/compa...,+1 760-563-4014,Newark,United States,,https://www.linkedin.com/company/topexplainers
1,3902130754,"DeveloperTown, LLC",Rich Feller,https://app.hubspot.com/contacts/4012159/compa...,855-338-8696,Indianapolis,United States,Computer Software,https://www.linkedin.com/company/developertown
3,3895972740,ROVERPASS,Justin Herrick,https://app.hubspot.com/contacts/4012159/compa...,(512)887-3932,Oljato-Monument Valley,United States,Computer Software,https://www.linkedin.com/company/roverpass
4,3874116280,California Forward,Dennis Moore (Deactivated User),https://app.hubspot.com/contacts/4012159/compa...,19164910022,Sacramento,United States,Civic & Social Organization,https://www.linkedin.com/company/california-fo...
5,3869752703,Belgravia Group,Malcolm Newsome,https://app.hubspot.com/contacts/4012159/compa...,(312) 751-2777,Chicago,United States,Real Estate,https://www.linkedin.com/company/belgravia-group
...,...,...,...,...,...,...,...,...,...
2504,598256022,Autodesk Inc.,Margaret Pagel,https://app.hubspot.com/contacts/4012159/compa...,+1 415-507-5000,San Rafael,United States,Computer Software,https://www.linkedin.com/company/autodesk
2505,598256021,IEEE,Margaret Pagel,https://app.hubspot.com/contacts/4012159/compa...,+1 732-562-6343,,United States,,https://www.linkedin.com/company/ieee
2507,597686649,"4C Insights, Inc.",Ryan Verner,https://app.hubspot.com/contacts/4012159/compa...,312-756-7771,Chicago,United States,Computer Software,https://www.linkedin.com/company/4c-insights
2508,730444815,DePaul University,Paul Pagel,https://app.hubspot.com/contacts/4012159/compa...,3123628000,Chicago,United States,Higher Education,https://www.linkedin.com/company/depaul-univer...
