In [1]:
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Identify the scraper honestly: the operator's name is prepended to a regular
# browser User-Agent string.
agent = 'Carlos Toruno using Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'

# Endpoint that is POSTed to below in order to obtain the people-search filters.
api_url = 'https://www.bakermckenzie.com/en/api/sitecore/people/getfilters?typeQueryKey=professionals'

# Headers shared by every request issued in this notebook.
headers = {"User-Agent": agent, "X-Requested-With": "XMLHttpRequest"}

In [3]:
# POST to the people-filter endpoint. A timeout is set because `requests`
# otherwise waits indefinitely on a stalled connection.
response_database = requests.post(api_url, headers=headers, timeout=30)

# Fail loudly on a 4xx/5xx reply instead of trying to decode an error page as JSON.
response_database.raise_for_status()

# The payload is a list; the 'Filter' entry of its first element is kept.
# The scraping loop further down reads 'DisplayName' and 'Url' from each item of it.
lawyer_database   = response_database.json()[0]['Filter']

In [32]:
def _text_or_default(node, default):
    """Return the stripped text of ``node``, or ``default`` when the lookup found nothing."""
    if node is None:
        return default
    return node.text.strip()


def _joined_texts(nodes):
    """Join the stripped text of every node in ``nodes`` with ', ' ('' for no nodes)."""
    return ', '.join(node.text.strip() for node in nodes)


def _list_under_heading(soup, heading):
    """Return the joined <li> texts of the <ul> that follows the <h3> matching ``heading``."""
    try:
        items = soup.find('h3', string = re.compile(heading)).find_next_sibling('ul').find_all('li')
        return _joined_texts(items)
    except AttributeError:
        # Either the heading or its sibling list is absent on this page.
        return 'Not Found'


def _extract_email(soup):
    """Rebuild the e-mail address from the 'Email' link's data attributes."""
    # Look the anchor up once; the address is split across two data-* attributes.
    anchor = soup.find('a', string = re.compile('Email'))
    if anchor is None:
        return 'Not Found'

    user   = anchor.get('data-email')
    domain = anchor.get('data-emaildom')
    if not user or not domain:
        # An incomplete pair cannot form a valid address.
        return 'Not Found'

    return f'{user.lower()}@{domain.lower()}'


def get_lawyer_data(url, headers, timeout = 30):
    """Download one profile page and extract its fields into a flat dict.

    Parameters
    ----------
    url : str
        Absolute URL of the profile page.
    headers : dict
        HTTP headers sent with the request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        Keys, in order: location, full_name, gender, email, languages,
        position, organization, phone, practice, full_href. A field whose
        markup is missing is set to 'Not Found' ('NA' for position), and
        multi-valued fields are ', '-joined ('' when the page lists none).
        ``gender`` is always 'NA'.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        For connection problems or a timeout.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Never parse an error page as if it were a profile.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    return {
        "location"        : _joined_texts(soup.find_all('li', class_ = 'office')),
        "full_name"       : _text_or_default(soup.find('h1'), 'Not Found'),
        "gender"          : 'NA',
        "email"           : _extract_email(soup),
        "languages"       : _list_under_heading(soup, 'Languages'),
        "position"        : _text_or_default(soup.find('div', class_ = 'title'), 'NA'),
        "organization"    : _text_or_default(soup.find('div', class_ = 'association'), 'Not Found'),
        "phone"           : _joined_texts(soup.find_all('li', class_ = 'number')),
        "practice"        : _list_under_heading(soup, 'Related Expertise'),
        "full_href"       : url
    }

In [None]:
# Accumulator for the per-profile dicts produced by the scraping loop.
# Note: re-running this cell discards everything collected so far.
results = list()

In [41]:
# Resume from whatever is already in `results` instead of a hand-edited start
# index: a profile is skipped when its URL was stored by an earlier run, so the
# cell works both on a fresh kernel and after an interruption.
scraped_hrefs = {entry['full_href'] for entry in results}

# (index, url, error) for every profile that could not be scraped this run;
# those profiles are retried the next time the cell is executed.
failed = []

for counter, item in enumerate(lawyer_database):
    name = item['DisplayName']
    href = item['Url']
    full_href = f'https://www.bakermckenzie.com{href}'

    if full_href in scraped_hrefs:
        continue

    print(f"Getting information for individual {counter}: {name}")
    print(full_href)

    try:
        lawyer_data = get_lawyer_data(full_href, headers)
    except (requests.RequestException, AttributeError) as error:
        # One broken page or network hiccup must not abort the whole crawl;
        # log it and keep going.
        failed.append((counter, full_href, repr(error)))
        print(f"FAILED: {error!r}")
    else:
        results.append(lawyer_data)
        scraped_hrefs.add(full_href)

    print('==============================================================')
    # Pause between requests so the site is not hammered.
    time.sleep(1)

Getting information for individual 2867: Kaylea Sher-Fisher
https://www.bakermckenzie.com/en/people/s/sher-fisher-kaylea
Getting information for individual 2868: Irina A. Shestakova
https://www.bakermckenzie.com/en/people/s/shestakova-irina-a
Getting information for individual 2869: Caroline Shih
https://www.bakermckenzie.com/en/people/s/shih-caroline
Getting information for individual 2870: Sean J.C. Shih
https://www.bakermckenzie.com/en/people/s/shih-sean
Getting information for individual 2871: Toshio Shimada
https://www.bakermckenzie.com/en/people/s/shimada-toshio
Getting information for individual 2872: Eunkyung Kim Shin
https://www.bakermckenzie.com/en/people/s/shin-eunkyung-kim
Getting information for individual 2873: Michelle Shin
https://www.bakermckenzie.com/en/people/s/shin-michelle
Getting information for individual 2874: Masayuki Shinoura
https://www.bakermckenzie.com/en/people/s/shinoura-masayuki
Getting information for individual 2875: Howard Shiu
https://www.bakermckenz

In [47]:
# One row per scraped profile; columns follow the key order of the dicts in `results`.
master_data = pd.DataFrame(results)

# Persist the table as UTF-8 CSV without the positional index column.
master_data.to_csv(
    "../data/bakermckenzie.csv",
    encoding = "utf-8",
    index = False,
)

In [46]:
# Frequency of each distinct `location` string across the scraped profiles.
x = (
    master_data['location']
    .value_counts()
)