## Collecting and analysing data from LinkedIn

In [102]:
#pragma nodebook off
#Use nodebook for better reproducibility https://github.com/uoa-eResearch/nodebook
%reload_ext nodebook.ipython
%nodebook disk phase4

<IPython.core.display.Javascript object>

In [None]:
from linkedin_api import Linkedin
import pandas as pd  # tabular data
from tqdm.auto import tqdm # progress bars
tqdm.pandas()
import json
from pprint import pprint
import time

In [None]:
# Read the LinkedIn credentials from a local JSON file that is kept out of the notebook.
# The context manager closes the file handle as soon as the credentials are parsed.
with open("secrets.json") as secrets_file:
    secrets = json.load(secrets_file)

# Authenticated client used by every search / profile request below.
api = Linkedin(secrets["username"], secrets["password"])

Sample search: https://www.linkedin.com/search/results/people/?geoUrn=%5B%22105490917%22%5D&origin=FACETED_SEARCH&pastCompany=%5B%221073%22%2C%221038%22%2C%223255299%22%2C%22397575%22%5D&sid=O0Q

KPMG New Zealand: https://www.linkedin.com/company/kpmg-new-zealand/: 1,083 employees, ~2500 past employees  
PwC New Zealand: https://www.linkedin.com/company/pwc-new-zealand/: 1,093 employees, ~1600 past employees  
Deloitte (Worldwide): https://www.linkedin.com/company/deloitte/: 363,547 employees ~1500 in NZ, ~4100 past employees in NZ  
EY (Worldwide): https://www.linkedin.com/company/ernstandyoung/: 319,968 employees, ~1200 in NZ, ~3400 past employees in NZ  

Approximate total number of past employees in NZ:
~9700

375 people currently work at one of the four firms and have previously worked for one of the four.

In [None]:
# LinkedIn company ids of the four firms, keyed by firm for readability.
company_ids_by_firm = {
    "PwC": "3255299",
    "KPMG": "397575",
    "Deloitte": "1038",
    "EY": "1073",
}
companies = list(company_ids_by_firm.values())

# LinkedIn geo id used to restrict the searches to New Zealand.
regions = ["105490917"]

# Search each firm separately for people who list it as a past employer.
people = []
for company_id in tqdm(companies):
    past_hits = api.search_people(past_companies=[company_id], regions=regions)
    people += past_hits

  0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Total number of search hits collected across the four firms (before de-duplication).
len(people)

3939

In [None]:
# One row per search hit; rows that are identical in every column are collapsed.
people_table = pd.DataFrame(people)
df = people_table.drop_duplicates()
df

Unnamed: 0,urn_id,distance,public_id,tracking_id
0,ACoAABYLkwwBZlmZPuUAK9ZUaloP8tnTfb1tUeo,DISTANCE_2,daniela-dunn-950a45a3,369857292
1,ACoAAAL-g0sBdldYgucu1UpPGQa3tuwyEHyOt3k,DISTANCE_2,pieta-brown-8639a614,50234187
2,ACoAAADRaF4BRkwRZSQns6ix3T8mIcxdiEPyvxo,DISTANCE_2,sarahlocknz,13723742
3,ACoAAAR370cBo_Bgfx0jX4qvqnKeqTDz4BeY88Y,DISTANCE_2,yen-shih-94713421,74968903
4,ACoAAAUIBmwBMvRQptMG6uAstyPnHrJV65EJNpw,DISTANCE_2,henryhirsch,84412012
...,...,...,...,...
3926,ACoAAAJ__00BAF6a2ef0jG0sdSa5uQkUpQwZEbI,DISTANCE_3,ramunaidoo,41942861
3927,ACoAAB7LNS0Bl1usYfYGWfVfYdoCKZjeJdcsMlI,DISTANCE_3,yueyue-song-b51029125,516633901
3931,ACoAADGKPG0Bll5Jaczcxjxg7BRolLmp0v98xeo,DISTANCE_3,seohee-lee,831143021
3937,ACoAAANANKABuGRu00OWbNkB7ill6DrxXljB4Ts,DISTANCE_3,liz-libby-garvie-20032216,54539424


In [None]:
# How many hits fall into each network-distance category.
df["distance"].value_counts()

DISTANCE_3        3176
DISTANCE_2         595
DISTANCE_1           1
OUT_OF_NETWORK       1
Name: distance, dtype: int64

In [None]:
# Fetch a single profile to inspect the structure returned by the API.
sample_profile = api.get_profile("daniela-dunn-950a45a3")
pprint(sample_profile)

{'certifications': [],
 'education': [{'degreeName': 'BCom (Hons)',
                'description': 'Completed courses in computer networks, '
                               'enterprise resource planning systems, '
                               'security, and information systems research.  '
                               'Wrote a dissertation on multicast network cost '
                               'allocation.\n'
                               '\n'
                               'Achieved first class distinction.',
                'entityUrn': 'urn:li:fs_education:(ACoAABYLkwwBZlmZPuUAK9ZUaloP8tnTfb1tUeo,239453468)',
                'fieldOfStudy': 'Information Systems',
                'school': {'active': True,
                           'entityUrn': 'urn:li:fs_miniSchool:15518',
                           'logoUrl': 'https://media-exp1.licdn.com/dms/image/C560BAQEhkI7FEsIdTg/company-logo_',
                           'objectUrn': 'urn:li:school:15518',
                          

In [None]:
# Rough runtime estimate: seconds -> minutes -> hours at one request per second.
profile_count = len(df)
estimated_hours = round(profile_count / 60 / 60, 2)
print(f"To extract all {profile_count} profiles, at a rate of about 1 per second would take around {estimated_hours} hours")

To extract all 3773 profiles, at a rate of about 1 per second would take around 1.05 hours


In [None]:
# Download the full profile of every person in `df`.
# The loop stops at the first error so that everything fetched so far stays in `results`.
results = []
e = None  # the exception that stopped the loop, or None if it ran to completion
for public_id in tqdm(df.public_id):
    try:
        results.append(api.get_profile(public_id))
        time.sleep(10)  # pause between consecutive profile requests
    except Exception as exc:
        # `except ... as e` would unbind `e` when the handler exits, so the error is
        # copied into `e` explicitly to keep it available for inspection afterwards.
        e = exc
        print(exc)
        break

  0%|          | 0/3773 [00:00<?, ?it/s]

In [None]:
# Persist the raw search hits and the downloaded profiles.
# Context managers ensure each file is flushed and closed right after it is written.
with open("people.json", "w") as people_file:
    json.dump(people, people_file)
with open("profiles.json", "w") as profiles_file:
    json.dump(results, profiles_file)

In [None]:
# Same search as for past employees, but matching the firm as the current employer.
current_people = []
for company_id in tqdm(companies):
    current_hits = api.search_people(current_company=[company_id], regions=regions)
    current_people += current_hits

  0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Number of current-employee search hits (before de-duplication).
current_people_count = len(current_people)
print(current_people_count)

3824


In [None]:
# One row per current-employee hit; fully identical rows are collapsed.
current_people_table = pd.DataFrame(current_people)
current_people_df = current_people_table.drop_duplicates()
current_people_df

Unnamed: 0,urn_id,distance,public_id,tracking_id
0,ACoAAC9yMMkBqS4Ly1MpdFe-omzBQJPp1K_H6C4,DISTANCE_2,max-brown-6356bb1a2,796012745
1,ACoAAByQMl0BB2jLhlEAMDAA5U8hZlx9_O6RkPg,DISTANCE_2,lucyxunz,479212125
2,ACoAABYPa1wBTpPmf2rJiDCzSYZv4q1IyrhewCs,DISTANCE_2,shonballu,370109276
3,ACoAAAabgWsBpsglqxXEEBb82CR9KbNu9VZ9GkI,DISTANCE_2,nourashasan,110854507
4,ACoAABTMPCIBPBZCHPMLp2jBPNUFqGeNYSFEbkg,DISTANCE_2,melaniehyy,348929058
...,...,...,...,...
3815,ACoAABWHvKEBWryGHdfqeolRzBGyqcwUnTnoGqY,DISTANCE_3,nichola-bennett-881b79a0,361217185
3817,ACoAAAdpSUcB1aUSa_k_SCQKJjP7dkJYiTD9f7c,DISTANCE_3,lara-truman-40778435,124340551
3819,ACoAAALnZ4oByVXX-K26VkBrsv8ae3rCfrCjl1M,DISTANCE_3,chris-money,48719754
3822,ACoAAAM0k7cB1Gy6vWzmNG2I7VSjyKX7ITrgwlA,DISTANCE_3,louise-theunissen-18701516,53777335


In [None]:
# Keep only current employees who were not already found by the past-employee search.
already_in_past_search = current_people_df.urn_id.isin(df.urn_id)
current_people_df = current_people_df.loc[~already_in_past_search]
current_people_df

Unnamed: 0,urn_id,distance,public_id,tracking_id
0,ACoAAC9yMMkBqS4Ly1MpdFe-omzBQJPp1K_H6C4,DISTANCE_2,max-brown-6356bb1a2,796012745
1,ACoAAByQMl0BB2jLhlEAMDAA5U8hZlx9_O6RkPg,DISTANCE_2,lucyxunz,479212125
2,ACoAABYPa1wBTpPmf2rJiDCzSYZv4q1IyrhewCs,DISTANCE_2,shonballu,370109276
3,ACoAAAabgWsBpsglqxXEEBb82CR9KbNu9VZ9GkI,DISTANCE_2,nourashasan,110854507
4,ACoAABTMPCIBPBZCHPMLp2jBPNUFqGeNYSFEbkg,DISTANCE_2,melaniehyy,348929058
...,...,...,...,...
3812,ACoAAATDlEEBTCZtt4Gz_8Gb7HRs4UTxRXOTLNI,DISTANCE_3,scott-bishop-80192522,79926337
3817,ACoAAAdpSUcB1aUSa_k_SCQKJjP7dkJYiTD9f7c,DISTANCE_3,lara-truman-40778435,124340551
3819,ACoAAALnZ4oByVXX-K26VkBrsv8ae3rCfrCjl1M,DISTANCE_3,chris-money,48719754
3822,ACoAAAM0k7cB1Gy6vWzmNG2I7VSjyKX7ITrgwlA,DISTANCE_3,louise-theunissen-18701516,53777335


In [None]:
# Download the profiles of the remaining current employees and append them to `results`.
# The loop stops at the first error so that everything fetched so far is kept.
e = None  # the exception that stopped the loop, or None if it ran to completion
for public_id in tqdm(current_people_df.public_id):
    try:
        results.append(api.get_profile(public_id))
        time.sleep(10)  # pause between consecutive profile requests
    except Exception as exc:
        # `except ... as e` would unbind `e` when the handler exits, so the error is
        # copied into `e` explicitly to keep it available for inspection afterwards.
        e = exc
        print(exc)
        break

  0%|          | 0/3569 [00:00<?, ?it/s]

In [None]:
# Persist the combined search hits (past + current) and all profiles downloaded so far.
# The combined list is built before the file is opened, and context managers ensure
# each file is flushed and closed right after it is written.
all_search_hits = people + current_people
with open("people.json", "w") as people_file:
    json.dump(all_search_hits, people_file)
with open("profiles.json", "w") as profiles_file:
    json.dump(results, profiles_file)

In [None]:
# The first two entries of `companies` are the PwC and KPMG ids.
companies[:2]

['3255299', '397575']

In [None]:
# Search PwC and KPMG again without the region filter:
# first everyone listing them as a past employer, then everyone listing them as current.
all_people_pwc_kpmg = []
pwc_kpmg_ids = companies[:2]
for search_field in ("past_companies", "current_company"):
    for company_id in tqdm(pwc_kpmg_ids):
        hits = api.search_people(**{search_field: [company_id]})
        all_people_pwc_kpmg += hits

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# One row per hit of the unrestricted PwC/KPMG search; fully identical rows are collapsed.
all_people_pwc_kpmg_table = pd.DataFrame(all_people_pwc_kpmg)
all_people_pwc_kpmg_df = all_people_pwc_kpmg_table.drop_duplicates()
all_people_pwc_kpmg_df

Unnamed: 0,urn_id,distance,public_id,tracking_id
0,ACoAABYLkwwBZlmZPuUAK9ZUaloP8tnTfb1tUeo,DISTANCE_2,daniela-dunn-950a45a3,369857292
1,ACoAACDBlscBlJhLnJMvcdx_SYU3u9aK1B2klJY,DISTANCE_2,aimee-moss,549557959
2,ACoAAAL-g0sBdldYgucu1UpPGQa3tuwyEHyOt3k,DISTANCE_2,pieta-brown-8639a614,50234187
3,ACoAAAp9oQoBW2V1VsWdzS9isI3xV7J3U5FmCFQ,DISTANCE_2,nwmwong,176005386
4,ACoAAABUkUYBb4qsZ2IXBE2TQCcAFFUBSBfGVLg,DISTANCE_2,thea-myers-372a331,5542214
...,...,...,...,...
3865,ACoAADVuedEBbeJppGQssOC1DQmoquedWXgydtk,DISTANCE_3,claudia-fransen-a41268210,896432593
3866,ACoAAAtZL94BnJS8q9ye3PpuUfo2GUxiNu7BVJE,DISTANCE_3,peter-mora-aa691953,190394334
3868,ACoAAAqcH1kBCJVw-vj5fEbugKzpsAeDWWX_cvI,DISTANCE_3,sheila-sampson-5617434b,178003801
3880,ACoAAAFQ2voBEIaO79CFf8qSrLH9tnaZMIHmNZo,DISTANCE_3,margaret-topkins-6764877,22076154


In [None]:
# Drop everyone already covered by the two earlier (region-restricted) searches.
seen_in_past_search = all_people_pwc_kpmg_df.urn_id.isin(df.urn_id)
seen_in_current_search = all_people_pwc_kpmg_df.urn_id.isin(current_people_df.urn_id)
all_people_pwc_kpmg_df = all_people_pwc_kpmg_df.loc[
    ~(seen_in_past_search | seen_in_current_search)
]
len(all_people_pwc_kpmg_df)

98

In [None]:
# Download the profiles of the newly found PwC/KPMG people and append them to `results`.
# The loop stops at the first error so that everything fetched so far is kept.
e = None  # the exception that stopped the loop, or None if it ran to completion
for public_id in tqdm(all_people_pwc_kpmg_df.public_id):
    try:
        results.append(api.get_profile(public_id))
        time.sleep(10)  # pause between consecutive profile requests
    except Exception as exc:
        # `except ... as e` would unbind `e` when the handler exits, so the error is
        # copied into `e` explicitly to keep it available for inspection afterwards.
        e = exc
        print(exc)
        break

  0%|          | 0/98 [00:00<?, ?it/s]

In [None]:
# Persist the hits of all three searches and every profile downloaded so far.
# The combined list is built before the file is opened, and context managers ensure
# each file is flushed and closed right after it is written.
all_search_hits = people + current_people + all_people_pwc_kpmg
with open("people.json", "w") as people_file:
    json.dump(all_search_hits, people_file)
with open("profiles.json", "w") as profiles_file:
    json.dump(results, profiles_file)

In [None]:
def build_company_network(profiles):
    """Build a company graph from the "experience" lists of the given profiles.

    Parameters
    ----------
    profiles : iterable of dict
        Profile dicts. Each may carry an "experience" list of position dicts;
        profiles without one contribute nothing.

    Returns
    -------
    (nodes, edges) : tuple of dict
        ``nodes`` maps a company name to a dict with the keys id, name,
        geoLocationName, locationName, industries, companyLogoUrl and val, where
        val counts the positions seen at that company. The descriptive fields come
        from the first position seen for that name.
        ``edges`` maps a (source name, target name) pair to a dict with the keys
        source, target and val, where val counts how often the two companies
        appear in adjacent positions of a profile.

    Positions without a "companyName" are pretty-printed and skipped.
    """
    nodes = {}
    edges = {}
    for profile in profiles:
        # A profile without an "experience" key is treated as having no positions
        # instead of raising KeyError.
        experience = profile.get("experience") or []
        last_index = len(experience) - 1
        for i, position in enumerate(experience):
            company_name = position.get("companyName")
            if not company_name:
                pprint(position)
                continue
            if company_name not in nodes:
                nodes[company_name] = {
                    "id": position.get("companyUrn"),
                    "name": company_name,
                    "geoLocationName": position.get("geoLocationName"),
                    "locationName": position.get("locationName"),
                    "industries": position.get("industries"),
                    "companyLogoUrl": position.get("companyLogoUrl"),
                    "val": 0
                }
            nodes[company_name]["val"] += 1
            if i < last_index:
                # NOTE(review): assumes positions are listed most recent first, so the
                # next list entry is the job held before this one - confirm.
                previous_position = experience[i + 1]
                previous_name = previous_position.get("companyName")
                if not previous_name:
                    pprint(previous_position)
                    continue
                # A tuple key cannot collide the way "A_B" + "_" + "C" and
                # "A" + "_" + "B_C" do when company names contain underscores.
                edge_key = (previous_name, company_name)
                if edge_key not in edges:
                    edges[edge_key] = {
                        "source": previous_name,
                        "target": company_name,
                        "val": 0
                    }
                edges[edge_key]["val"] += 1
    return nodes, edges


nodes, edges = build_company_network(tqdm(results))

  0%|          | 0/7440 [00:00<?, ?it/s]

{'$anti_abuse_metadata': {'/companyName': {'sourceUrns': {'com.linkedin.common.urn.MemberUrn': 'urn:li:member:347225297'}},
                          '/companyUrn': {'sourceUrns': {'com.linkedin.common.urn.MemberUrn': 'urn:li:member:347225297'}},
                          '/description': {'sourceUrns': {'com.linkedin.common.urn.MemberUrn': 'urn:li:member:347225297'}},
                          '/entityUrn': {'sourceUrns': {'com.linkedin.common.urn.MemberUrn': 'urn:li:member:347225297'}},
                          '/geoLocationName': {'sourceUrns': {'com.linkedin.common.urn.MemberUrn': 'urn:li:member:347225297'}},
                          '/geoUrn': {'sourceUrns': {'com.linkedin.common.urn.MemberUrn': 'urn:li:member:347225297'}},
                          '/locationName': {'sourceUrns': {'com.linkedin.common.urn.MemberUrn': 'urn:li:member:347225297'}},
                          '/promotion': {'sourceUrns': {'com.linkedin.common.urn.MemberUrn': 'urn:li:member:347225297'}},
             

In [None]:
# Allow longer tables to be displayed in full.
pd.set_option("display.max_rows", 100)

# One row per company node; show the 20 companies with the most positions.
node_df = pd.DataFrame(nodes.values())
nodes_by_weight = node_df.sort_values(by="val", ascending=False)
nodes_by_weight.head(20)

Unnamed: 0,id,name,geoLocationName,locationName,industries,companyLogoUrl,val
1,urn:li:fs_miniCompany:3255299,PwC New Zealand,"Auckland, New Zealand","Auckland, New Zealand",,https://media-exp1.licdn.com/dms/image/C4D0BAQ...,2919
104,urn:li:fs_miniCompany:397575,KPMG New Zealand,"Auckland, New Zealand","Auckland, New Zealand",,https://media-exp1.licdn.com/dms/image/C560BAQ...,2578
176,urn:li:fs_miniCompany:1073,EY,"Auckland, New Zealand","Auckland, New Zealand",,https://media-exp1.licdn.com/dms/image/C510BAQ...,2455
151,urn:li:fs_miniCompany:1038,Deloitte,"Wellington, Wellington Region, New Zealand","Wellington, Wellington Region, New Zealand",,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,1236
38,urn:li:fs_miniCompany:1038,Deloitte New Zealand,"Auckland, New Zealand","Auckland, New Zealand",,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,1233
52,,PwC,"London, England, United Kingdom","London, England, United Kingdom",,,464
3,urn:li:fs_miniCompany:3691,The University of Auckland,"Auckland, New Zealand","Auckland, New Zealand",,https://media-exp1.licdn.com/dms/image/C560BAQ...,417
54,urn:li:fs_miniCompany:270126,ASB Bank,"Auckland, New Zealand","Auckland, New Zealand",,https://media-exp1.licdn.com/dms/image/C560BAQ...,250
164,urn:li:fs_miniCompany:2437,ANZ,"Auckland, New Zealand","Auckland, New Zealand",,https://media-exp1.licdn.com/dms/image/C560BAQ...,232
732,urn:li:fs_miniCompany:1073,Ernst & Young,,,,https://media-exp1.licdn.com/dms/image/C510BAQ...,225


In [None]:
# Companies where both location fields are present but disagree.
has_geo_location = node_df.geoLocationName.notna()
has_location = node_df.locationName.notna()
locations_differ = node_df.geoLocationName != node_df.locationName
node_df[has_geo_location & has_location & locations_differ]

Unnamed: 0,id,name,geoLocationName,locationName,industries,companyLogoUrl,val
3900,urn:li:fs_miniCompany:1038,Deloitte España,"Madrid y alrededores, España","Madrid Area, Spain",,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,1
3901,urn:li:fs_miniCompany:137323,Comwave,"Bogotá D.C., Colombia","Bogotá D.C. Area, Colombia",,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,1
3902,urn:li:fs_miniCompany:268836,Politécnico Grancolombiano,"Bogotá D.C., Colombia","Bogotá D.C. Area, Colombia",,https://media-exp1.licdn.com/dms/image/C4D0BAQ...,1
4061,,"Advokátní kancelář JUDr. Sáša Navrátilová, MBA","Okres Brno-město, Česká republika","District Brno-City, Czech Republic",,,1
4388,urn:li:fs_miniCompany:1038,Deloitte France,"Région de Paris, France","Paris Area, France",,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,2
4544,urn:li:fs_miniCompany:801428,EF English First,中国 上海,"Shanghai City, China",,https://media-exp1.licdn.com/dms/image/C4D0BAQ...,1
4611,urn:li:fs_miniCompany:163134,Coca-Cola FEMSA,"Ciudad de México y alrededores, México","Mexico City Area, Mexico",,https://media-exp1.licdn.com/dms/image/C560BAQ...,2
6414,urn:li:fs_miniCompany:396744,The Hunting Dynasty,"Londres, Reino Unido","London, United Kingdom",,https://media-exp1.licdn.com/dms/image/C510BAQ...,1
7248,,Conselho Regional de Psicologia (Psychology Re...,"São Paulo e Região, Brasil","São Paulo Area, Brazil",,,1
7249,urn:li:fs_miniCompany:1508541,P.J. Clarke's,"São Paulo e Região, Brasil","São Paulo Area, Brazil",,,1


In [None]:
# Count the companies whose geoLocationName mentions New Zealand (missing values count as no match).
in_new_zealand = node_df.geoLocationName.str.contains('New Zealand', na=False)
print(f"{in_new_zealand.sum()} companies in NZ out of {len(node_df)}")

4239 companies in NZ out of 9595


In [None]:
# One row per company-to-company edge; show the 20 most frequent ones.
edge_df = pd.DataFrame(edges.values())
edges_by_weight = edge_df.sort_values(by="val", ascending=False)
edges_by_weight.head(20)

Unnamed: 0,source,target,val
18,PwC New Zealand,PwC New Zealand,983
437,EY,EY,848
130,KPMG New Zealand,KPMG New Zealand,797
42,Deloitte New Zealand,Deloitte New Zealand,372
418,Deloitte,Deloitte,231
59,PwC,PwC,108
221,PwC,PwC New Zealand,104
268,Deloitte New Zealand,Deloitte,98
356,ANZ,ANZ,68
76,ASB Bank,ASB Bank,63


In [None]:
# Number of distinct company nodes and distinct company-to-company edges.
len(nodes), len(edges)

(9595, 17643)

In [None]:
# Export the graph as a dict with a "nodes" list and a "links" list.
network = {
    "nodes": list(nodes.values()),
    "links": list(edges.values())
}
# The context manager ensures the file is flushed and closed right after it is written.
with open("network.json", "w") as network_file:
    json.dump(network, network_file)