# Load data

In [22]:
import pandas as pd
import numpy as np
import os.path

# Relative path of the directory holding the CSV files this notebook reads
# (and would write to, if the export cell is enabled).
data_folder = "../data"

In [23]:
# Input CSV; judging by its file name it holds HubSpot companies together
# with data mined from their LinkedIn pages.
data_file = os.path.join(data_folder, 'hubspot-companies-with-linkedin-link-mined-data.csv')

# The first CSV column is used as the row index rather than as a data column.
scrapped_companies = pd.read_csv(data_file, index_col=0)
# Non-null count per column: a quick view of how sparse each field is.
scrapped_companies.count()

Company ID                 701
Company Name               701
Director                   674
Hubspot url                701
Phone Number               560
City                       639
Country/Region             670
Industry                   525
LinkedIn link              701
LinkedIn description       687
LinkedIn size              683
LinkedIn specialties       544
LinkedIn established       606
LinkedIn id                700
LinkedIn url               701
LinkedIn Industry          689
LinkedIn website           686
LinkedIn temporary logo    657
LinkedIn headquarter       645
LinkedIn Name              694
dtype: int64

In [24]:
# Work on a copy so the raw frame loaded above stays untouched.
df = scrapped_companies.copy()

# Ordered literal substitutions: ';' separators become spaces, and the
# multi-word labels are fused into single tokens.
industry_substitutions = [
    (';', ' '),
    ('Non Profit', 'NonProfit'),
    ('Food Production', 'FoodProduction'),
    ('Health Care', 'HealthCare'),
]
for old_text, new_text in industry_substitutions:
    df['Industry'] = df['Industry'].str.replace(old_text, new_text)

# A value that is empty or whitespace-only counts as missing.
df['Industry'] = df['Industry'].replace(r'^\s*$', np.nan, regex=True)

df['Industry'].value_counts()

Computer Software                      182
Higher Education                        65
Information Technology and Services     42
Marketing and Advertising               20
Financial Services                      18
                                      ... 
Biotechnology                            1
Airlines/Aviation                        1
Writing and Editing                      1
Photography                              1
Research                                 1
Name: Industry, Length: 74, dtype: int64

In [25]:
def aggregate_industry_used_by_financial_department(linkedin_industry, hubspot_industry):
    """Collapse a LinkedIn and a HubSpot industry label into one aggregated bucket.

    The two labels are joined with a single space and the result is scanned
    for case-sensitive keyword substrings. Buckets are tried from top to
    bottom and the first bucket with any keyword present wins, so the order
    of the table below is significant.

    Parameters
    ----------
    linkedin_industry : str
        Industry label taken from LinkedIn (may be an empty string).
    hubspot_industry : str
        Industry label taken from HubSpot (may be an empty string).

    Returns
    -------
    str
        Name of the first matching bucket. When no keyword matches, the
        joined text itself is returned unchanged, which is ``' '`` when
        both inputs are empty.
    """
    combined = linkedin_industry + ' ' + hubspot_industry

    # (bucket name, keywords) pairs in priority order.
    buckets = (
        ('Design & Engineering',
         ('Design', 'Software', 'Engineering', 'Maritime', 'Architecture')),
        ('Insurance',
         ('Insurance',)),
        ('Non Profit',
         ('Philanthropy', 'Non-profit', 'NonProfit', 'Government', 'Civic', 'Social',
          'Military', 'Defense', 'Utilities', 'Religious', 'Political')),
        ('Travel and Hospitality',
         ('Travel', 'Hospitality')),
        ('Ecommerce',
         ('E-commerce', 'Internet', 'Import')),
        ('Energy',
         ('Oil', 'Energy', 'Renewable')),
        ('Financial Services',
         ('Financial', 'Finance', 'Accounting', 'Capital', 'Investment', 'Banking')),
        ('Technology',
         ('Technology', 'Telecommunications', 'Development', 'Information', 'Security',
          'Biotechnology', 'Computer')),
        ('Industrial',
         ('Industrial', 'Construction', 'Building', 'Transportation', 'Chemicals')),
        ('Manufacturing',
         ('Automotive', 'Machinery', 'Manufacturing', 'Aerospace', 'Equipment',
          'Electronics', 'Forest')),
        ('Education',
         ('Education', 'E-learning', 'Research', 'Training', 'Think', 'Coaching')),
        ('Entertainment',
         ('Entertainment', 'Museum', 'Art', 'Arts', 'Events', 'Sports', 'Sport', 'Fitness',
          'Gaming', 'Game', 'Gambling', 'Libraries', 'Animation', 'Photography')),
        ('Healthcare',
         ('Medicine', 'Health', 'HealthCare', 'Medical', 'Pharmaceuticals')),
        ('Legal',
         ('Law', 'Legal')),
        ('Real Estate',
         ('Estate',)),
        ('Media',
         ('Media', 'Publishing', 'Writing', 'Printing')),
        ('Professional Services',
         ('Human', 'Communications', 'Public', 'Management', 'Staffing', 'Logistics',
          'Marketing', 'Services', 'Executive')),
        ('Retail',
         ('Retail', 'Goods', 'Consumer', 'Furniture', 'Cosmetics', 'Fashion', 'Wine',
          'Beverages', 'Restaurants', 'Wholesale')),
        ('Agriculture',
         ('Agriculture', 'FoodProduction')),
    )

    # First bucket with a keyword hit wins; otherwise hand back the joined text.
    return next(
        (
            bucket
            for bucket, keywords in buckets
            if any(keyword in combined for keyword in keywords)
        ),
        combined,
    )

In [26]:
# The aggregation concatenates both labels as strings, so missing values
# must become empty strings first.
for industry_column in ('LinkedIn Industry', 'Industry'):
    df[industry_column] = df[industry_column].fillna('')

aggregate_industries = np.vectorize(aggregate_industry_used_by_financial_department)
df['Aggregated Industry'] = aggregate_industries(df['LinkedIn Industry'], df['Industry'])

# ' ' is what the aggregation returns when both labels were empty: treat it as missing.
df['Aggregated Industry'] = df['Aggregated Industry'].replace(' ', np.nan)

df['Aggregated Industry'].value_counts().head(25)

Design & Engineering      222
Technology                104
Education                  92
Professional Services      59
Financial Services         42
Non Profit                 30
Entertainment              27
Healthcare                 18
Retail                     18
Ecommerce                  14
Manufacturing              12
Media                      12
Insurance                  11
Real Estate                 9
Travel and Hospitality      7
Legal                       7
Industrial                  7
Energy                      4
Name: Aggregated Industry, dtype: int64

# Generalize Headquarters

In [27]:
def generalize_headquarter(linkedin_hq):
    """Normalize a free-text LinkedIn headquarter string.

    Two passes are applied:

    1. Selected region names are replaced by abbreviations (US states by
       their two-letter code, some English regions by ``'UK'``) using plain
       substring replacement, in the order listed below.
    2. The result is lower-cased and searched for known city keywords. The
       first city whose keyword occurs anywhere in the string determines the
       canonical ``'City, Region'`` value that is returned.

    Matching in pass 2 is by substring, so any text containing a keyword
    collapses to that city (for example every string containing
    ``'washington'`` becomes ``'Washington, DC'``).

    Parameters
    ----------
    linkedin_hq : str
        Headquarter text; an empty string is returned unchanged.

    Returns
    -------
    str
        The canonical city label when a keyword matches, otherwise the input
        with the region abbreviations applied.
    """
    # Ordered literal substitutions. Longer names must precede names they
    # contain ('West Virginia' before 'Virginia', 'TX - Texas' before 'Texas').
    region_abbreviations = (
        ('California', 'CA'),
        ('Florida', 'FL'),
        ('Ohio', 'OH'),
        ('Illinois', 'IL'),
        ('Pennsylvania', 'PA'),
        # Misspelling targeted by the earlier version of this rule; kept so
        # that input containing the same typo is still abbreviated.
        ('Pennslyvania', 'PA'),
        # Must run before 'Virginia', otherwise the result is 'West VA'.
        ('West Virginia', 'WV'),
        ('Virginia', 'VA'),
        ('Arkansas', 'AR'),
        ('North Carolina', 'NC'),
        ('Massachusetts', 'MA'),
        ('TX - Texas', 'TX'),
        ('Texas', 'TX'),
        ('England', 'UK'),
        ('Essex', 'UK'),
        ('Cambridgeshire', 'UK'),
    )
    for full_name, abbreviation in region_abbreviations:
        linkedin_hq = linkedin_hq.replace(full_name, abbreviation)

    # City keywords (lower-case) and the canonical label they map to.
    # Checked from top to bottom; the first hit wins.
    mappings = [
        {
            'words': ['london'],
            'hq': 'London, UK'
        },
        {
            'words': ['new york', 'brooklyn', 'ny, ny'],
            'hq': 'New York, NY'
        },
        {
            'words': ['san francisco'],
            'hq': 'San Francisco, CA'
        },
        {
            'words': ['berlin'],
            'hq': 'Berlin, Germany'
        },
        {
            'words': ['paris'],
            'hq': 'Paris, France'
        },
        {
            'words': ['warszawa', 'warsaw'],
            'hq': 'Warsaw, Poland'
        },
        {
            'words': ['toronto'],
            'hq': 'Toronto, Ontario'
        },
        {
            'words': ['bristol'],
            'hq': 'Bristol'
        },
        {
            'words': ['manchester'],
            'hq': 'Manchester, UK'
        },
        {
            'words': ['guildford'],
            'hq': 'Guildford, UK'
        },
        {
            'words': ['boston'],
            'hq': 'Boston, MA'
        },
        {
            'words': ['singapore'],
            'hq': 'Singapore'
        },
        {
            'words': ['new delhi'],
            'hq': 'New Delhi'
        },
        {
            'words': ['chicago'],
            'hq': 'Chicago, IL'
        },
        {
            'words': ['austin'],
            'hq': 'Austin, TX'
        },
        {
            'words': ['seattle'],
            'hq': 'Seattle, WA'
        },
        {
            'words': ['madison'],
            'hq': 'Madison, WI'
        },
        {
            'words': ['washington'],
            'hq': 'Washington, DC'
        },
        {
            'words': ['atlanta'],
            'hq': 'Atlanta, GA'
        },
        {
            'words': ['northbrook'],
            'hq': 'Northbrook, IL'
        },
        {
            'words': ['nashville'],
            'hq': 'Nashville, TN'
        },
        {
            'words': ['fremont'],
            'hq': 'Fremont, CA'
        },
        {
            'words': ['irvine'],
            'hq': 'Irvine, CA'
        },
        {
            'words': ['dallas'],
            'hq': 'Dallas, TX'
        },
        {
            'words': ['miami'],
            'hq': 'Miami, FL'
        },
        {
            'words': ['ann arbor'],
            'hq': 'Ann Arbor, MI'
        },
        {
            'words': ['lancaster'],
            'hq': 'Lancaster, PA'
        },
        {
            'words': ['philadelphia'],
            'hq': 'Philadelphia, PA'
        },
        {
            'words': ['indianapolis'],
            'hq': 'Indianapolis, IN'
        },
        {
            'words': ['bangalore'],
            'hq': 'Bangalore, Karnataka'
        },
        {
            'words': ['san jose'],
            'hq': 'San Jose, CA'
        },
        {
            'words': ['mc lean', 'mclean'],
            'hq': 'Mc Lean, VA'
        },
        {
            'words': ['st. louis', 'st louis'],
            'hq': 'St. Louis, MO'
        },
    ]

    # Lower-case once instead of once per keyword comparison.
    hq_lower = linkedin_hq.lower()
    for mapping in mappings:
        if any(word in hq_lower for word in mapping['words']):
            return mapping['hq']

    return linkedin_hq

In [28]:
# Element-wise wrapper around the string normalizer; missing values are
# turned into '' first because the normalizer calls str methods.
normalize_hq = np.vectorize(generalize_headquarter)
df['LinkedIn headquarter'] = normalize_hq(df['LinkedIn headquarter'].fillna(''))

# Blank results (a single space or an empty string) go back to missing.
df['LinkedIn headquarter'] = df['LinkedIn headquarter'].replace([' ', ''], np.nan)
df['LinkedIn headquarter'].value_counts().head(60)

Chicago, IL               87
New York, NY              47
London, UK                39
San Francisco, CA         20
Atlanta, GA                9
Austin, TX                 9
Ahmedabad, Gujarat         6
Paris, France              6
Seattle, WA                6
Los Angeles, CA            6
Madison, WI                5
Washington, DC             5
Philadelphia, PA           4
St. Louis, MO              4
Boston, MA                 4
Houston, TX                4
Indianapolis, IN           3
Berlin, Germany            3
Pune, Maharashtra          3
Bangalore, Karnataka       3
Naperville, IL             3
Hannover, Lower Saxony     3
Charlotte, NC              3
Fremont, CA                3
Bristol                    3
Toronto, Ontario           3
Cincinnati, OH             3
Irvine, CA                 3
Warsaw, Poland             3
San Jose, CA               3
New Delhi                  2
Munich                     2
Libertyville, IL           2
Cairo                      2
Milwaukee, WI 

In [29]:
# Look for headquarter values that contain the given phrase.
phrase = 'follower'
df['LinkedIn headquarter'] = df['LinkedIn headquarter'].fillna('')
contains_phrase = df['LinkedIn headquarter'].str.contains(phrase)
df.loc[contains_phrase, 'LinkedIn headquarter'].value_counts().head(60)

1 follower    1
Name: LinkedIn headquarter, dtype: int64

# Merge with existing data!

In [30]:
# Second input CSV (re-uses the `data_file` name from the first load);
# judging by its file name it already carries LinkedIn data, an aggregated
# industry and directors.
data_file = os.path.join(data_folder, 'companies-with-extended-linkedin-data-and-aggregated-findep-industry-and-directors.csv')

# The first CSV column is used as the row index rather than as a data column.
existing_companies = pd.read_csv(data_file, index_col=0)
# Non-null count per column.
existing_companies.count()

Associated Company ID      639
Associated Company         639
Director                   467
Hubspot url                639
LinkedIn url               603
LinkedIn size              580
LinkedIn specialties       427
LinkedIn Industry          596
Industry                   235
Aggregated Industry        639
Deal Stage                 639
LinkedIn website           569
LinkedIn favicon             0
LinkedIn temporary logo    533
LinkedIn headquarter       551
Unnamed: 16                  0
LinkedIn site favicon      569
dtype: int64

In [31]:
# Columns carried over from the previously processed companies.
# `.copy()` makes df1 an independent frame, so adding a column below writes
# to df1 itself instead of to a slice of `existing_companies`
# (which would raise SettingWithCopyWarning).
df1 = existing_companies[['Associated Company ID', 'Associated Company', 'Director', 'Hubspot url', 'LinkedIn url','LinkedIn website', 'LinkedIn size', 'LinkedIn specialties', 'LinkedIn Industry', 'Industry', 'Aggregated Industry', 'LinkedIn headquarter', 'LinkedIn temporary logo']].copy()

# This frame has no phone numbers; add the column as empty strings.
df1['Phone Number'] = ''

# Rename the identifier columns to 'Company ID' / 'Company Name'.
df1 = df1.rename(columns={
    "Associated Company ID": "Company ID",
    "Associated Company": "Company Name",
})
df1.count()

Company ID                 639
Company Name               639
Director                   467
Hubspot url                639
LinkedIn url               603
LinkedIn website           569
LinkedIn size              580
LinkedIn specialties       427
LinkedIn Industry          596
Industry                   235
Aggregated Industry        639
LinkedIn headquarter       551
LinkedIn temporary logo    533
Phone Number               639
dtype: int64

In [32]:
# Same headquarter normalization as for `df`, applied to the existing companies.
# Missing values become '' first because the normalizer calls str methods.
normalize_hq = np.vectorize(generalize_headquarter)
df1['LinkedIn headquarter'] = normalize_hq(df1['LinkedIn headquarter'].fillna(''))

# Blank results (a single space or an empty string) go back to missing.
df1['LinkedIn headquarter'] = df1['LinkedIn headquarter'].replace([' ', ''], np.nan)
df1['LinkedIn headquarter'].value_counts().head(60)

Chicago, IL                      136
London, UK                        54
New York, NY                      35
Los Angeles, CA                   14
San Francisco, CA                 14
Washington, DC                     9
Seattle, WA                        7
9 Global Locations, Worldwide      6
Boston, MA                         5
Deerfield, IL                      4
Irvine, CA                         4
Santa Monica, CA                   4
Philadelphia, PA                   3
Madison, WI                        3
Miami, FL                          3
Tempe, AZ                          3
Denver, Colorado                   2
Zurich                             2
Waltham, MA                        2
Englewood, Colorado                2
Dearborn, Michigan                 2
Northbrook, IL                     2
Plano, TX                          2
Pasadena, CA                       2
Auckland, Auckland                 2
Costa Mesa, CA                     2
Indianapolis, IN                   2
N

In [33]:
# Look for headquarter values that contain the given phrase.
phrase = 'Zurich'
df1['LinkedIn headquarter'] = df1['LinkedIn headquarter'].fillna('')
contains_phrase = df1['LinkedIn headquarter'].str.contains(phrase)
df1.loc[contains_phrase, 'LinkedIn headquarter'].value_counts().head(60)

Zurich             2
Lake Zurich, IL    1
Name: LinkedIn headquarter, dtype: int64

In [34]:
# Columns kept from the newly scraped companies.
# `.copy()` makes df2 an independent frame, so the assignment below writes
# to df2 itself instead of to a slice of `df`
# (which would raise SettingWithCopyWarning).
df2 = df[['Company ID', 'Company Name', 'Director', 'Phone Number','Hubspot url', 'LinkedIn url','LinkedIn website', 'LinkedIn size', 'LinkedIn specialties', 'LinkedIn Industry', 'Industry', 'Aggregated Industry', 'LinkedIn headquarter', 'LinkedIn temporary logo']].copy()

# Store the identifier as text rather than as a number.
df2['Company ID'] = df2['Company ID'].apply(str)
df2.count()

Company ID                 701
Company Name               701
Director                   674
Phone Number               560
Hubspot url                701
LinkedIn url               701
LinkedIn website           686
LinkedIn size              683
LinkedIn specialties       544
LinkedIn Industry          701
Industry                   701
Aggregated Industry        695
LinkedIn headquarter       701
LinkedIn temporary logo    657
dtype: int64

In [35]:
# Stack both frames, blank out the bogus '1 follower' headquarter value,
# then turn every remaining missing value into an empty string.
joined_companies = (
    pd.concat([df1, df2])
    .assign(**{
        'LinkedIn headquarter': lambda frame: frame['LinkedIn headquarter'].replace('1 follower', np.nan),
    })
    .fillna('')
)
joined_companies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1340 entries, 0 to 700
Data columns (total 14 columns):
Aggregated Industry        1340 non-null object
Company ID                 1340 non-null object
Company Name               1340 non-null object
Director                   1340 non-null object
Hubspot url                1340 non-null object
Industry                   1340 non-null object
LinkedIn Industry          1340 non-null object
LinkedIn headquarter       1340 non-null object
LinkedIn size              1340 non-null object
LinkedIn specialties       1340 non-null object
LinkedIn temporary logo    1340 non-null object
LinkedIn url               1340 non-null object
LinkedIn website           1340 non-null object
Phone Number               1340 non-null object
dtypes: object(14)
memory usage: 157.0+ KB


In [36]:
# Non-null count per column; after fillna('') every column is fully populated.
joined_companies.count()

Aggregated Industry        1340
Company ID                 1340
Company Name               1340
Director                   1340
Hubspot url                1340
Industry                   1340
LinkedIn Industry          1340
LinkedIn headquarter       1340
LinkedIn size              1340
LinkedIn specialties       1340
LinkedIn temporary logo    1340
LinkedIn url               1340
LinkedIn website           1340
Phone Number               1340
dtype: int64

In [37]:
# The concatenated frame still carries the row labels of both inputs;
# replace them with a fresh 0..n-1 index and discard the old labels.
df_to_save=joined_companies.reset_index(drop=True)

target_file = os.path.join(data_folder, 'merged-existing-and-hubspot-companies-with-linkedin-links-aggregated.csv')
# Export is disabled here; uncomment the next line to write the merged
# frame (including its index) to `target_file`.
# df_to_save.to_csv(target_file, index=True)

# Display the merged frame.
df_to_save

Unnamed: 0,Aggregated Industry,Company ID,Company Name,Director,Hubspot url,Industry,LinkedIn Industry,LinkedIn headquarter,LinkedIn size,LinkedIn specialties,LinkedIn temporary logo,LinkedIn url,LinkedIn website,Phone Number
0,Non Profit,1005828045,Big Radical Limited,Nick Dyer,https://app.hubspot.com/contacts/4012159/compa...,NonProfit,Management Consulting,"London, UK",11-50,"Innovation, Consultancy, design sprints, techn...",https://media-exp1.licdn.com/dms/image/C4E0BAQ...,https://www.linkedin.com/company/big-radical/a...,http://bigradical.com,
1,Healthcare,1006470667,psygro.co.za,,https://app.hubspot.com/contacts/4012159/compa...,,Mental Health Care,,51-200,,https://media-exp1.licdn.com/dms/image/C560BAQ...,https://www.linkedin.com/company/psygro/about/,http://www.psygro.nl,
2,Healthcare,1006849789,Care Coordination Systems,Doug Bradbury,https://app.hubspot.com/contacts/4012159/compa...,HealthCare,Hospital & Health Care,"Akron, OH",2-10,"Community Health, Pathways, HUB, Pathways Conn...",https://media-exp1.licdn.com/dms/image/C4D0BAQ...,https://www.linkedin.com/company/beach-group/a...,https://www.ccspathways.com,
3,Healthcare,1011074056,KOS Services LLC,,https://app.hubspot.com/contacts/4012159/compa...,,Medical Practice,"Chicago, IL",201-500,,,https://www.linkedin.com/company/kos-services-...,http://kosservices.com,
4,Technology,1015911460,Jellyvision,,https://app.hubspot.com/contacts/4012159/compa...,,Information Technology & Services,"Chicago, IL",201-500,"Benefits Communication, Interactive Conversati...",https://media-exp1.licdn.com/dms/image/C4E0BAQ...,https://www.linkedin.com/company/jellyvision/a...,http://www.jellyvision.com,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335,Design & Engineering,597686649,"4C Insights, Inc.",Ryan Verner,https://app.hubspot.com/contacts/4012159/compa...,Computer Software,Marketing & Advertising,"Chicago, IL",51-200,"social media intelligence, analytics and measu...",https://media-exp1.licdn.com/dms/image/C4D0BAQ...,https://www.linkedin.com/company/4c-insights/a...,http://www.4Cinsights.com/,312-756-7771
1336,Education,730444815,DePaul University,Paul Pagel,https://app.hubspot.com/contacts/4012159/compa...,Higher Education,Higher Education,"Chicago, IL","1,001-5,000","Service_learning, entrepreneurship, MBA, accou...",https://media-exp1.licdn.com/dms/image/C4E0BAQ...,https://www.linkedin.com/company/depaul-univer...,http://depaul.edu,3123628000
1337,Design & Engineering,598254423,Collective Idea,Ryan Verner,https://app.hubspot.com/contacts/4012159/compa...,Computer Software,Computer Software,"Holland, MI",11-50,"Web application development, iOS development, ...",https://media-exp1.licdn.com/dms/image/C4E0BAQ...,https://www.linkedin.com/company/collective-id...,http://collectiveidea.com,6164992122
1338,Design & Engineering,3312503697,StockX,Jenn Imamura (Deactivated User),https://app.hubspot.com/contacts/4012159/compa...,Computer Software,Internet,"Detroit, Michigan","501-1,000","Operations, Internet, e-Commerce, Retail",https://media-exp1.licdn.com/dms/image/C4E0BAQ...,https://www.linkedin.com/company/15261355/about,https://stockx.com,313-373-3000


In [38]:
# Frequency of each LinkedIn size value; empty strings (formerly missing
# values) are counted as their own entry.
df_to_save['LinkedIn size'].value_counts().head(60) 

11-50           302
2-10            239
51-200          215
10,001+         143
1,001-5,000     115
201-500         110
501-1,000        80
                 77
5,001-10,000     44
0-1              15
Name: LinkedIn size, dtype: int64