In [82]:
import pandas as pd

# Load the Airtable company export (already enriched with LinkedIn fields);
# the first CSV column is used as the row index.
airtable_companies = pd.read_csv(
    '../data/airtable-companies-with-linkedin-data.csv',
    index_col=0,
)

# Column overview: names, non-null counts and dtypes.
airtable_companies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 172 entries, 0 to 184
Data columns (total 17 columns):
Associated Company         172 non-null object
Contacts                   105 non-null object
Gmail Contacts in full     50 non-null object
HQ                         172 non-null object
Employees                  171 non-null object
Industry                   171 non-null object
Website                    171 non-null object
LinkedIn description       169 non-null object
LinkedIn size              166 non-null object
LinkedIn specialties       125 non-null object
LinkedIn established       154 non-null float64
LinkedIn id                172 non-null object
LinkedIn url               172 non-null object
LinkedIn Industry          171 non-null object
LinkedIn website           170 non-null object
LinkedIn temporary logo    166 non-null object
LinkedIn headquarter       162 non-null object
dtypes: float64(1), object(16)
memory usage: 24.2+ KB


In [83]:
# NOTE(review): Series.replace compares *whole* cell values, not substrings.
# Only a cell that is exactly ' ' or exactly 'Online Platform' is rewritten;
# emoji-prefixed labels such as '📲Online Platform' are left as they are.
# If substring replacement was intended, `.str.replace` would be needed, but
# the aggregation keywords further down match on '🥋Online Platform' with the
# space intact, so confirm the intent before changing this.
airtable_companies['Industry'] = (
    airtable_companies['Industry']
    .replace(' ', '')
    .replace('Online Platform', 'OnlinePlatform')
)
airtable_companies['Industry'].value_counts()

☮️ Social Impact           22
🖥 Agency                   16
📲Online Platform           16
💼 Consultancy              14
🛍️ Consumer Products       13
📑Online Platform           10
⚕️Health Tech               8
🏦 Finance                   8
🏘 Property technology       7
🔬 Technology                6
🎓 Education                 6
💰Venture Capital            5
💭 AdTech                    4
🥋Online Platform            4
📰 News & Media              4
🏢 Infrastructure            4
🎭 Media & Entertainment     3
🖥️ Machine Learning, AI     3
🔐Cyber Security             3
🍕 FoodTech                  3
📊 Data Visualisation        2
⛑️ Insurance                2
📡 Data Services             2
📚Culture                    1
📱 Telecom                   1
⛓️ BlockChain               1
🚀 Work Space                1
🚊 Public Sector             1
🏛Government                 1
Name: Industry, dtype: int64

In [84]:
import numpy as np
# Ordered (aggregated industry, keywords) pairs, built once instead of on every
# call. Order matters: the first group with a keyword that occurs in the
# combined label text wins.
_INDUSTRY_KEYWORD_GROUPS = (
    ('Design & Engineering', ('Design', 'Software', 'Engineering', 'Maritime', 'Architecture')),
    ('Insurance', ('Insurance',)),
    ('Non Profit', ('Philanthropy', 'Non-profit', 'NonProfit', 'Government', 'Civic', 'Social', 'Military', 'Utilities')),
    ('Travel and Hospitality', ('Travel', 'Hospitality')),
    ('Ecommerce', ('E-commerce', 'Internet', 'Import')),
    ('Energy', ('Oil', 'Energy', 'Renewable')),
    ('Financial Services', ('Financial', 'Finance', 'Accounting', 'Capital', 'Investment', 'Banking')),
    ('Technology', ('Technology', 'Telecommunications', 'Development', 'Information', 'Security', 'Biotechnology')),
    ('Industrial', ('Industrial', 'Construction', 'Building', 'Transportation')),
    ('Manufacturing', ('Automotive', 'Machinery', 'Manufacturing', 'Aerospace', 'Equipment', 'Electronics')),
    ('Education', ('Education', 'E-learning', 'Research', 'Training', 'Think', 'Coaching')),
    ('Entertainment', ('Entertainment', 'Museum', 'Art', 'Arts', 'Events', 'Sports', 'Sport', 'Fitness', 'Gaming', 'Game', 'Gambling', 'Libraries')),
    ('Healthcare', ('Medicine', 'Health', 'HealthCare', 'Medical')),
    ('Legal', ('Law', 'Legal')),
    ('Real Estate', ('Estate',)),
    ('Media', ('Media', 'Publishing', 'Writing', 'Printing', 'Photography', '🥋Online Platform')),
    ('Professional Services', ('Human', 'Communications', 'Public', 'Management', 'Staffing', 'Logistics', 'Marketing', 'Services')),
    ('Retail', ('Retail', 'Goods', 'Consumer', 'Furniture', 'Cosmetics', 'Fashion', 'Wine', 'Beverages', 'Restaurants')),
    ('Agriculture', ('Agriculture', 'FoodProduction')),
)


def _industry_label_text(label):
    """Return ``label`` as text, mapping missing values (None / NaN) to ''."""
    if isinstance(label, str):
        return label
    # `label != label` is only true for NaN.
    if label is None or (isinstance(label, float) and label != label):
        return ''
    return str(label)


def aggregate_industry_used_by_financial_department(linkedin_industry, airtable_industry):
    """Map a company's industry labels onto one aggregated industry name.

    The two labels are joined with a single space and scanned for keywords
    using case-sensitive *substring* matching. Keyword groups are tried in a
    fixed order and the first group with any hit decides the result.

    Parameters
    ----------
    linkedin_industry : str
        Industry label taken from LinkedIn. ``None`` / NaN is treated as ''.
    airtable_industry : str
        Industry label taken from Airtable. ``None`` / NaN is treated as ''.

    Returns
    -------
    str
        The aggregated industry name, or the joined label text
        (``linkedin_industry + ' ' + airtable_industry``) when no keyword
        matches. Two empty labels therefore yield ``' '``.
    """
    industries = _industry_label_text(linkedin_industry) + ' ' + _industry_label_text(airtable_industry)

    for industry, words in _INDUSTRY_KEYWORD_GROUPS:
        if any(word in industries for word in words):
            return industry

    # No keyword matched: hand back the combined text unchanged.
    return industries

In [85]:
# `df` is an alias of `airtable_companies` (same object, not a copy), so the
# column assignments below modify that frame too.
df = airtable_companies

# The aggregation function concatenates both labels, so neither may be NaN.
for label_column in ('LinkedIn Industry', 'Industry'):
    df[label_column] = df[label_column].fillna('')

aggregate_row = np.vectorize(aggregate_industry_used_by_financial_department)
df['Aggregated Industry'] = aggregate_row(df['LinkedIn Industry'], df['Industry'])

# When nothing matches, the function returns the two labels joined by a space;
# two blank labels give ' ', which is recorded as missing here.
df['Aggregated Industry'] = df['Aggregated Industry'].replace(' ', np.nan)

df['Aggregated Industry'].value_counts().head(25)

Design & Engineering      30
Technology                25
Non Profit                21
Financial Services        17
Ecommerce                 16
Professional Services     16
Retail                    12
Entertainment              9
Education                  7
Media                      6
Real Estate                4
Healthcare                 3
Insurance                  2
Manufacturing              2
Industrial                 1
Travel and Hospitality     1
Name: Aggregated Industry, dtype: int64

In [86]:
# Spot check: which companies carry this text in their LinkedIn industry,
# and which aggregated industry did they end up in?
industry_substring = "Photography"

# Missing labels become '' so the substring search below never sees NaN.
df['LinkedIn Industry'] = df['LinkedIn Industry'].fillna('')

matching_companies = df.loc[df['LinkedIn Industry'].str.contains(industry_substring)]
matching_companies.loc[
    :, ['Associated Company', 'LinkedIn url', 'LinkedIn Industry', 'Aggregated Industry']
].head(25)

Unnamed: 0,Associated Company,LinkedIn url,LinkedIn Industry,Aggregated Industry
124,PhotoCrowd,https://www.linkedin.com/company/photocrowd/ab...,Photography,Media


In [87]:
# Spot check: how were companies with this Airtable industry label aggregated?
industry_substring = '🥋Online Platform'

# Rows whose Airtable industry label contains the substring.
matching_companies = df[df['Industry'].str.contains(industry_substring)]

# Keep only rows with a non-null LinkedIn industry. The mask is built from the
# filtered frame itself: a mask taken from the full `df` has a different index,
# which makes pandas reindex the key and warn about it.
# NOTE(review): an earlier cell fills missing 'LinkedIn Industry' values with
# '', so blank labels are not null and pass this filter — confirm whether rows
# with an empty label should be dropped as well.
matching_companies = matching_companies[matching_companies['LinkedIn Industry'].notnull()]

matching_companies[['Associated Company','Industry', 'LinkedIn url', 'LinkedIn Industry', 'Aggregated Industry']].head(25)

Unnamed: 0,Associated Company,Industry,LinkedIn url,LinkedIn Industry,Aggregated Industry
7,Amazon (AWS),🥋Online Platform,https://www.linkedin.com/company/amazon-web-se...,Information Technology & Services,Technology
39,DAZN (perform),🥋Online Platform,https://www.linkedin.com/company/dazn-limited/...,,Media
68,GoSweat,🥋Online Platform,https://www.linkedin.com/company/gosweat/about/,"Health, Wellness & Fitness",Entertainment
86,Killer.Football,🥋Online Platform,https://www.linkedin.com/company/killer-dot-fo...,Gambling & Casinos,Entertainment


# Generalize Headquarters

In [88]:
# Ordered (canonical headquarter, lower-case keywords) pairs, built once
# instead of on every call. The first group with a keyword hit wins.
_HQ_KEYWORD_GROUPS = (
    ('London, UK', ('london',)),
    ('New York, NY', ('new york', 'brooklyn')),
    ('San Francisco, CA', ('san francisco',)),
    ('Berlin, Germany', ('berlin',)),
    ('Boston, MA', ('boston',)),
    ('Singapore', ('singapore',)),
    ('Chicago, IL', ('chicago',)),
)


def generalize_headquarter(linkedin_hq):
    """Collapse a free-text headquarter string onto a canonical city label.

    The text is lower-cased and scanned for known city keywords using
    substring matching; the first keyword group that matches decides the
    result.

    Parameters
    ----------
    linkedin_hq : str
        Headquarter text as scraped from LinkedIn. ``None`` / NaN is treated
        as missing.

    Returns
    -------
    str
        The canonical label (e.g. ``'London, UK'``) when a keyword matches,
        the input unchanged when none does, and ``''`` for a missing value.
    """
    # `linkedin_hq != linkedin_hq` is only true for NaN.
    if linkedin_hq is None or (isinstance(linkedin_hq, float) and linkedin_hq != linkedin_hq):
        return ''

    # Lower-case once rather than once per keyword.
    lowered_hq = linkedin_hq.lower()

    for canonical_hq, keywords in _HQ_KEYWORD_GROUPS:
        if any(keyword in lowered_hq for keyword in keywords):
            return canonical_hq

    # Unknown location: keep the original text.
    return linkedin_hq

In [89]:
# The generalizer works on strings, so missing headquarters become '' first.
headquarters = df['LinkedIn headquarter'].fillna('')
df['LinkedIn headquarter'] = headquarters

df['LinkedIn headquarter'] = np.vectorize(generalize_headquarter)(df['LinkedIn headquarter'])

# Blank results (' ' or '') are turned back into NaN so that value_counts
# leaves them out.
df['LinkedIn headquarter'] = (
    df['LinkedIn headquarter']
    .replace(' ', np.nan)
    .replace('', np.nan)
)
df['LinkedIn headquarter'].value_counts().head(35)

London, UK                        123
New York, NY                        3
San Francisco, CA                   3
Boston, MA                          3
Berlin, Germany                     3
Seattle, WA                         3
Chicago, IL                         2
Singapore                           2
Rickmansworth, Hertfordshire        1
Stevenage                           1
Horsham, West Sussex                1
Falmouth, Cornwall                  1
Cowes, England                      1
Cardiff                             1
Costa Mesa, CA                      1
Leamington Spa, Warwickshire        1
Manchester, North West              1
Poole, Dorset                       1
Cambridge, Cambridgeshire           1
Oxford, OXON                        1
Gaza, Gaza                          1
Menlo Park, CA                      1
Sutton, Surrey                      1
Bristol, UK                         1
Schlieren, ZH                       1
Leicester, Leicestershire           1
Milton Keyne

# Save results

In [90]:
# Write the enriched table to CSV with NaNs replaced by empty strings;
# the row index is kept as the first column.
df_to_save = df.fillna('')
df_to_save.to_csv(
    '../data/airtable-companies-with-linkedin-data-and-aggregated-findep-industry-genalised-hqs.csv',
    index=True,
)

df_to_save

Unnamed: 0,Associated Company,Contacts,Gmail Contacts in full,HQ,Employees,Industry,Website,LinkedIn description,LinkedIn size,LinkedIn specialties,LinkedIn established,LinkedIn id,LinkedIn url,LinkedIn Industry,LinkedIn website,LinkedIn temporary logo,LinkedIn headquarter,Aggregated Industry
0,8th Light,"Jim Suchy,Becca Townsend","Ashley Bye, Dennis Moore","""Angel, London""",51-200,💼 Consultancy,https://8thlight.com,"Software is our Craft.™ At 8th Light, we craft...",51-200,"Agile, UX/UI, Software Craftsmanship, Web Desi...",2006,8th-light,https://www.linkedin.com/company/8th-light/about/,Computer Software,https://8thlight.com,https://media-exp1.licdn.com/dms/image/C560BAQ...,"Chicago, IL",Design & Engineering
1,Accurx,"Louise Hughes,Jacob Haddad",,"""Dalston, London""",11-50,⚕️Health Tech,https://www.accurx.com/,"At accu Rx, we're on a mission to change lives...",11-50,,2016,accurx,https://www.linkedin.com/company/accurx/about/,Computer Software,https://www.accurx.com,https://media-exp1.licdn.com/dms/image/C560BAQ...,"London, UK",Design & Engineering
2,Acuris Global,,,City of London,501-1000,🏢 Infrastructure,https://www.acuris.com/,Acuris powers business growth for financial an...,"1,001-5,000",,2000,acuris-global,https://www.linkedin.com/company/acuris-global...,Financial Services,http://www.acuris.com,https://media-exp1.licdn.com/dms/image/C510BAQ...,"New York, NY",Financial Services
3,Ad Hoc Global Ltd,,,"""Covent Garden, London""",1-10,🖥 Agency,http://www.adhocglobal.com/,"Ad Hoc Global provides user-centred, service d...",2-10,"UX, Human factors, Digital Behaviour, Strategy...",2012,ad-hoc-global-ltd,https://www.linkedin.com/company/ad-hoc-global...,Research,http://www.adhocglobal.com,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,"London, UK",Education
4,Airtable,,"Alex Dytrych, David Peterson","""Old Street, London""",11-50,📲Online Platform,https://airtable.com,Airtable's mission is to democratize software ...,51-200,,2013,airtable,https://www.linkedin.com/company/airtable/about/,Computer Software,https://airtable.com,https://media-exp1.licdn.com/dms/image/C560BAQ...,"San Francisco, CA",Design & Engineering
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,YuLife,,,"""Kings Cross, London""",11-50,⛑️ Insurance,https://www.yulife.com/,yulife is the world’s first life insurance com...,11-50,,2016,yulife,https://www.linkedin.com/company/yulife/about/,Insurance,https://www.yulife.com,https://media-exp1.licdn.com/dms/image/C4D0BAQ...,"London, UK",Insurance
181,YYT,Jonathan Chikly,,"""Fitzrovia, London""",1-10,🖥 Agency,https://yyt.dev,"We build content management systems, e-commerc...",0-1,,,yyt,https://www.linkedin.com/company/yyt/about/,Information Technology & Services,https://yyt.dev,https://media-exp1.licdn.com/dms/image/C4D0BAQ...,"London, UK",Technology
182,Zero Deposit,Pete Graham,Andrew Doyle,"""Liverpool Street, London""",11-50,🏘 Property technology,https://www.zerodeposit.com/,Our vision is a better renting process for eve...,11-50,"real estate, landlords, letting agents, tenant...",2016,zerodeposituk,https://www.linkedin.com/company/zerodeposituk...,Real Estate,http://www.zerodeposit.com,https://media-exp1.licdn.com/dms/image/C4D0BAQ...,Stevenage,Real Estate
183,Zinc,Julia Ross,,City of London,11-50,💰Venture Capital,https://www.zinc.vc,Zinc exists to build and scale a brand-new way...,2-10,,2017,zincvc,https://www.linkedin.com/company/zincvc/about/,Venture Capital & Private Equity,http://www.zinc.vc,https://media-exp1.licdn.com/dms/image/C4D0BAQ...,"London, UK",Financial Services
