# Notebook to scrape data about law school faculty members

Requirements:
    
    !pip install pandas
    !pip install bs4
    !pip install requests
    !pip install lxml  # HTML parser required by pandas.read_html

Currently scrapes:

    (1) Osgoode Hall Law School, York University (osgoode_bios.json)
    (2) Faculty of Law, University of Toronto (u_toronto_bios.json)
    (3) Lincoln Alexander School of Law, Toronto Metropolitan University (tmu_bios.json)
    (4) Faculty of Law, Queen's University (queens_bios.json)
    (5) Faculty of Law, Western University (western_bios.json)
    (6) Faculty of Law, University of Windsor (windsor_bios.json)
    (7) Bora Laskin Faculty of Law, Lakehead University (lakehead_bios.json)

Creates combined scraped file:
    all_bios.json

Scraped bios are saved as JSON files in the `data/` subfolder

    
License: [CC BY-NC 3.0](https://creativecommons.org/licenses/by-nc/3.0/)

### Setup

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re

# set paths
json_outpath = 'data/'

### (1) Scrape Osgoode website

In [73]:
# Get the profile-page link for every faculty member listed on the main faculty page
from io import StringIO  # imported here because only this cell needs it

# load main faculty page
url = 'https://www.osgoode.yorku.ca/faculty/'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

# get the main table on faculty page, convert to dataframe and clean
tables = soup.find_all('table')
# wrap the markup in StringIO: newer pandas releases deprecate passing a
# literal HTML string to read_html
df = pd.read_html(StringIO(str(tables[0])))[0]
df.columns = df.columns.str.lower()
df.drop('unnamed: 0', axis=1, inplace=True)

# get all the profile links (hrefs) on faculty page
# href=True skips anchors without an href attribute; for those,
# link.get('href') is None and the substring test raised a TypeError
profile_prefix = 'https://www.osgoode.yorku.ca/faculty-and-staff/'
links = soup.find_all('a', href=True)
results = [link['href'] for link in links if profile_prefix in link['href']]

# keep every second link (b/c the links are duplicated)
results = results[::2]

# add links to the dataframe (pandas raises ValueError if the number of
# links does not match the number of table rows)
df['href'] = results

df

Unnamed: 0,name,title,email,telephone,office,href
0,Rabiat Akande,Assistant Professor,rakande@osgoode.yorku.ca,416-650-8422,3048,https://www.osgoode.yorku.ca/faculty-and-staff...
1,Harry Arthurs,Professor Emeritus,harthurs@osgoode.yorku.ca,,3015,https://www.osgoode.yorku.ca/faculty-and-staff...
2,Saptarishi Bandopadhyay,Associate Professor,sbandopadhyay@osgoode.yorku.ca,416-736-5488,4053,https://www.osgoode.yorku.ca/faculty-and-staff...
3,Stephanie Ben-Ishai,Professor and York University Distinguished Re...,sbenishai@osgoode.yorku.ca,416-650-8239,3043,https://www.osgoode.yorku.ca/faculty-and-staff...
4,Benjamin L. Berger,Professor & York Research Chair in Pluralism a...,bberger@osgoode.yorku.ca,416-736-5867,3030,https://www.osgoode.yorku.ca/faculty-and-staff...
...,...,...,...,...,...,...
73,Emily Kidd White,Assistant Professor,ekwhite@osgoode.yorku.ca,416-736-5826,3033,https://www.osgoode.yorku.ca/faculty-and-staff...
74,J. Scott Wilkie,Distinguished Professor of Practice,swilkie@osgoode.yorku.ca,416-736-2100 ext. 22189,4065,https://www.osgoode.yorku.ca/faculty-and-staff...
75,Cynthia Williams,Professor Emeritus,cwilliams@osgoode.yorku.ca,416-736-5545,4021,https://www.osgoode.yorku.ca/faculty-and-staff...
76,Alan N. Young,Professor Emeritus,ayoung@osgoode.yorku.ca,,3015,https://www.osgoode.yorku.ca/faculty-and-staff...


In [74]:
# Scrape bios from individual faculty member webpages

# function to parse faculty member page
def parse_faculty_page(url):
    """Fetch a faculty member's profile page and return its raw bio text.

    Args:
        url: Address of the individual faculty member's page.

    Returns:
        The text of the first <div> whose class includes 'entry-content'.

    Raises:
        AttributeError: if the page has no such <div> (``find`` returns None).
    """
    page = requests.get(url)
    parsed_page = BeautifulSoup(page.content, 'html.parser')
    content_div = parsed_page.find('div', {'class': 'entry-content'})
    return content_div.text

# download the bio text for every faculty member (one request per profile link)
df['bio'] = df['href'].map(parse_faculty_page)

# get cleaned bios
def clean_bio(row):
    """Tidy one row's scraped bio and extract its listed research interests.

    The row is modified in place and also returned.

    Args:
        row: Mapping-like row whose 'bio' entry holds the raw page text.

    Returns:
        The same row with 'listed_research_areas' set (a ';'-separated string,
        or None when the bio has no 'Research Interests:' label) and 'bio'
        replaced by a single whitespace-normalised line.
    """
    interests_label = 'Research Interests:'

    # drop everything from 'Graduate Research Supervision (LLM, PhD):' onwards
    text = row['bio'].split('Graduate Research Supervision (LLM')[0]

    if interests_label in text:
        # text following the label, cut at the first newline
        interests = text.split(interests_label)[1].split('\n')[0]
        # non-breaking spaces -> spaces, commas -> semicolons, periods removed
        interests = interests.replace('\xa0', ' ').replace(',', ';').replace('.', '')
        # collapsing whitespace runs also trims both ends
        row['listed_research_areas'] = ' '.join(interests.split())
    else:
        row['listed_research_areas'] = None

    # non-breaking spaces -> spaces, then collapse all whitespace (newlines included)
    row['bio'] = ' '.join(text.replace('\xa0', ' ').split())
    return row

# Clean every row's bio and extract its listed research areas (row-wise apply)
df = df.apply(clean_bio, axis=1)

df['faculty'] = 'osgoode'

# reorder columns & drop unnecessary columns
df = df[['faculty', 'name', 'title', 'email', 'href', 'bio', 'listed_research_areas']]

# Remove non current faculty
# na=False treats a missing title as "no match"; without it str.contains
# yields NaN for such rows and the `~` negation fails
df = df[~df['title'].str.contains('Emerit', na=False)]
df = df[~df['title'].str.contains('Distinguished Professor of Practice', na=False)]

# Save to json for future use
df.to_json(json_outpath+'osgoode_bios.json', orient='records', indent = 2)

df


Unnamed: 0,faculty,name,title,email,href,bio,listed_research_areas
0,osgoode,Rabiat Akande,Assistant Professor,rakande@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,Professor Rabiat Akande works in the fields of...,legal history; law and religion; constitutiona...
2,osgoode,Saptarishi Bandopadhyay,Associate Professor,sbandopadhyay@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,I am an Associate Professor at Osgoode Hall La...,Law; history; and politics of Disasters; Inter...
3,osgoode,Stephanie Ben-Ishai,Professor and York University Distinguished Re...,sbenishai@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,Professor Stephanie Ben-Ishai is a Distinguish...,Corporate/Commercial Law
4,osgoode,Benjamin L. Berger,Professor & York Research Chair in Pluralism a...,bberger@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,Professor Benjamin L. Berger is Professor and ...,Law and Religion; Criminal and Constitutional ...
5,osgoode,Kate Glover Berger,Associate Professor,kgberger@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,Professor Kate Glover Berger joined the facult...,
...,...,...,...,...,...,...,...
69,osgoode,Estair Van Wagner,Associate Professor,evanwagner@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,Professor Estair Van Wagner researches and tea...,
70,osgoode,David Vaver,Professor of Intellectual Property Law,dvaver@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,David Vaver is a member of IP Osgoode and Emer...,
71,osgoode,Robert S. Wai,Associate Professor,rwai@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,Robert Wai has been a member of the faculty at...,
72,osgoode,Janet Walker,Professor,jwalker@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,Janet Walker is a full professor and past Asso...,International Litigation and Arbitration; Comp...


### (2) Scrape University of Toronto Website

In [None]:
# Define the base url for the faculty website
base_url = "https://www.law.utoronto.ca"

# Get the html content of the faculty page
response = requests.get(base_url+ '/faculty-staff/full-time-faculty')
soup = BeautifulSoup(response.content, "html.parser")

# Find all tables in soup
tables = soup.find_all("table")

# Iterate through all tables, getting 'name', 'email' and profile link for each faculty member
results = []
for table in tables:
    for row in table.find_all("tr"):
        cols = row.find_all("td")
        # cells are read as: name (with profile link), phone, email.
        # Skip header rows (no <td>) and rows too short to hold an email
        # cell, which previously raised IndexError.
        if len(cols) < 3:
            continue
        link = cols[0].find("a")
        # Skip rows whose name cell has no profile link (previously an
        # AttributeError); without a link there is no page to scrape.
        if link is None or not link.get("href"):
            continue
        results.append({
            'name': cols[0].text.strip(),
            'email': cols[2].text.strip(),
            'href': base_url + link.get("href"),
        })

# convert result to df
df = pd.DataFrame(results)

df

In [None]:
# get faculty bios from each faculty member's page

def get_bio(row):
    """Fetch one faculty member's page and add 'bio' and 'listed_research_areas'.

    Args:
        row: Mapping-like row whose 'href' entry is the profile page URL.

    Returns:
        The same row, with 'bio' set to the newline-joined text of every
        <div class="field"> and 'listed_research_areas' set to a
        '; '-separated string, or None when the page has no
        'Research areas' heading in its 'bottom-right' block.
    """
    response = requests.get(row['href'])
    soup = BeautifulSoup(response.content, "html.parser")

    fields = soup.find_all("div", class_="field")
    row['bio'] = '\n'.join(field.text.strip() for field in fields)

    row['listed_research_areas'] = None
    sidebar = soup.find("div", class_="bottom-right")
    if not sidebar:
        return row

    # a comma separator keeps adjacent text nodes from being concatenated
    text = sidebar.get_text(separator= ',')
    if 'Research areas,' not in text and 'Research Areas,' not in text:
        return row

    # keep what follows the heading, cut at the first newline
    text = text.split('Research areas,')[-1].split('Research Areas,')[-1]
    text = text.split('\n')[0]
    # drop one trailing separator; endswith() is safe on an empty string,
    # whereas indexing text[-1] raised IndexError when nothing followed the
    # heading on that line
    if text.endswith(','):
        text = text[:-1]
    row['listed_research_areas'] = '; '.join(text.split(','))

    return row

# Run get_bio on each row (axis=1); get_bio issues one HTTP request per row
df = df.apply(get_bio, axis=1)

# get cleaned bios
def clean_bio(bio):
    """Cut a raw bio at the first trailing section and flatten it to one line.

    Args:
        bio: Raw bio text with sections separated by newlines.

    Returns:
        The text before the earliest of the section markers below, with
        non-breaking spaces and newlines replaced by spaces and the ends
        stripped.
    """
    section_markers = (
        '\nEducation',
        '\nSelected Publications',
        '\nSelected publications',
        '\nSee also Professor',
    )
    # each pass keeps only what precedes the marker, if the marker is present
    for marker in section_markers:
        bio = bio.split(marker)[0]
    return bio.replace('\xa0', ' ').replace('\n', ' ').strip()

# replace each raw bio with its cleaned, single-line version
df['bio'] = df['bio'].map(clean_bio)

# append research areas if any
def append_research_areas(row):
    """Return the row's bio, followed by its listed research areas if any.

    Args:
        row: Mapping-like row with 'bio' and 'listed_research_areas' entries.

    Returns:
        The bio unchanged when 'listed_research_areas' is empty or None;
        otherwise the bio plus ' Research Interests: ' and the listed areas.
    """
    areas = row['listed_research_areas']
    if not areas:
        return row['bio']
    return row['bio'] + ' Research Interests: ' + areas

# Row-wise apply (axis=1): append_research_areas returns the final bio string for each row
df['bio'] = df.apply(append_research_areas, axis=1)

# tag every row with this faculty's identifier
df['faculty'] = 'u_toronto'

# revise names to go from Last, first to First Last
def clean_name(name):
    """Convert a 'Last, First' name to 'First Last'.

    Args:
        name: Name string as listed in the directory, normally 'Last, First'.

    Returns:
        'First Last', built from the first two ', '-separated pieces. A name
        with no ', ' separator is returned unchanged (it previously raised
        IndexError).
    """
    parts = name.split(', ')
    if len(parts) < 2:
        # nothing to reorder
        return name
    return parts[1] + ' ' + parts[0]

# convert every listed name from 'Last, First' to 'First Last'
df['name'] = df['name'].map(clean_name)

# add a title column filled with None
df['title'] = None

# keep only the output columns, in this order
output_columns = ['faculty', 'name', 'title', 'email', 'href', 'bio', 'listed_research_areas']
df = df[output_columns]

# Save to json for future use
output_file = json_outpath + 'u_toronto_bios.json'
df.to_json(output_file, orient='records', indent=2)

df

### (3) Scrape Toronto Metropolitan University website

In [None]:
# Get api data for faculty and for cross-appointed faculty, then combine them

faculty_api_urls = [
    # faculty
    'https://www.torontomu.ca/law/faculty-and-research/faculty/jcr:content/content/resbiographystack.data.1.json',
    # cross-appointed faculty
    'https://www.torontomu.ca/law/faculty-and-research/faculty/jcr:content/content/resbiographystack_1397589177.data.1.json',
]

# one dataframe per endpoint, built from the 'data' entry of its JSON payload
frames = []
for url in faculty_api_urls:
    response = requests.get(url)
    json_data = response.json()
    frames.append(pd.DataFrame(json_data['data']))

# Combine faculty data
df = pd.concat(frames, ignore_index=True)

# replace the '/content/ryerson/' path prefix with the public site address
df['page'] = df['page'].str.replace('/content/ryerson/', 'https://www.torontomu.ca/')

df

In [None]:
# Get faculty bios from faculty pages

def get_bio(page):
    """Fetch a faculty page and return its bio as one whitespace-normalised line.

    Args:
        page: URL of the faculty member's page.

    Returns:
        Text of the first <div> whose class includes 'resText', with every
        run of whitespace (newlines included) collapsed to a single space.

    Raises:
        AttributeError: if the page has no such <div> (``find`` returns None).
    """
    html = requests.get(page).text
    res_text = BeautifulSoup(html, 'html.parser').find('div', {'class': 'resText'})

    # split() with no argument breaks on any whitespace, so joining with
    # single spaces removes newlines and trims both ends in one step
    bio = ' '.join(res_text.text.split())

    # short pause between requests
    time.sleep(0.25)

    return bio

# download the bio for every faculty page
df['bio'] = df['page'].map(get_bio)

# the page column holds the profile link; use the column name 'href'
df = df.rename(columns={'page': 'href'})

# combine first and last name
first_names = df['firstname']
last_names = df['lastname']
df['name'] = first_names + ' ' + last_names

# Append specializations to bio
def append_specialization(row):
    """Return the row's bio with its specialization appended as research interests.

    Args:
        row: Mapping-like row with 'bio' and 'specialization' entries.

    Returns:
        The bio unchanged when the specialization is missing or empty;
        otherwise the bio plus ' Research Interests: ' and the specialization.
    """
    bio = row['bio']
    specialization = row['specialization']
    # Only append real, non-empty text. A missing value can arrive as NaN,
    # which is a truthy float: it previously reached the string
    # concatenation below and raised TypeError.
    if isinstance(specialization, str) and specialization:
        bio = bio + ' Research Interests: ' + specialization
    return bio

# Append each row's specialization to its bio (row-wise apply)
df['bio'] = df.apply(append_specialization, axis=1)

# rename specialization column
df = df.rename(columns={'specialization': 'listed_research_areas'})

df['faculty'] = 'tmu'

# reorder columns & drop unnecessary columns
df = df[['faculty', 'name', 'title', 'email', 'href', 'bio', 'listed_research_areas']]

# Manually add the dean
faculty = 'tmu'
name = 'Donna Young'
title = 'Dean'
email = 'deanoflaw@torontomu.ca'
href = 'https://www.torontomu.ca/law/about/our-dean/'
# NOTE: the line break before 'Research interests:' is written as an explicit
# \n escape; a raw line break inside a single-quoted string is a SyntaxError
bio = 'Donna E. Young is the Founding Dean of the Lincoln Alexander School of Law. Before assuming her deanship, she was the President William McKinley Distinguished Professor of Law and Public Policy at Albany Law School and a joint faculty member at the University at Albany\'s Department of Women\'s, Gender, and Sexuality Studies. Her teaching and scholarship focus on law and inequality, race and gender discrimination, and academic freedom and university governance. She has taught courses in Criminal Law, Employment Law; U.S. Federal Civil Procedure; Gender and Work; and Race, Rape Culture, and Law. Dean Young is much sought after as a speaker and has been invited to present her work at conferences and other venues around the world. She has been a staff member at the American Association of University Professors\' (AAUP) Department of Academic Freedom, Tenure, and Governance, in Washington, D.C. and was a member of the AAUP\'s Committee A, the preeminent national body setting standards and investigating academic freedom disputes in the United States. She has been a Fellow at Cornell Law School\'s Gender, Sexuality, and Family Project; a Visiting Scholar at Osgoode Hall Law School\'s Institute of Feminist Legal Studies; an Associate in Law at Columbia Law School; a Visiting Scholar at the Faculty of Law at Roma Tre University in Rome, Italy; and a consultant to the International Development Law Organization for whom she traveled to Uganda to conduct field research on the relationship between gender inequality and law in the context of the HIV/AIDS crisis. Dean Young\'s previous professional experiences include articling at Cornish Roland - a labour law firm in Toronto; serving as a consultant with the Ontario Human Rights Commission; and working as a researcher with the NYC Office of Labor Relations. She is admitted to practice in New York State. \nResearch interests: Criminal Law; Employment Law; US Federal Civil Procedure; Antidiscrimination Law and Civil Rights; Critical Race Theory and Feminist Legal Theory; Academic freedom and due process, and university governance; Title IX'
listed_research_areas = 'Criminal Law; Employment Law; US Federal Civil Procedure; Antidiscrimination Law and Civil Rights; Critical Race Theory and Feminist Legal Theory; Academic freedom and due process, and university governance; Title IX'
# one-row dataframe with the same columns as the scraped data
df2 = pd.DataFrame([[faculty, name, title, email, href, bio, listed_research_areas]], columns=['faculty', 'name', 'title', 'email', 'href', 'bio', 'listed_research_areas'])
df = pd.concat([df, df2], ignore_index=True)

# Save to json for future use
df.to_json(json_outpath + 'tmu_bios.json', orient='records', indent = 2)

df

### (4) Scrape Queen's University Faculty Bios

In [None]:
# Build the list of faculty members and their profile pages

# Define the base url for the faculty website
base_url = "https://law.queensu.ca"

# Get the html content of the faculty page
directory_url = base_url+ '/directory'
response = requests.get(directory_url)
soup = BeautifulSoup(response.content, "html.parser")

# Find all divs whose class matches 'person-type-1 views-row' (regex match, so
# the class string may carry other information around it)
profs = soup.find_all('div', {'class': re.compile('person-type-1 views-row')})

results = []
for prof in profs:
    # heading text, keeping only the part before the first comma
    name = prof.find('h2').text.strip().split(',')[0]
    # the title heading is optional
    title_tag = prof.find('h3')
    title = title_tag.text.strip() if title_tag else None
    results.append({
        'faculty': 'queens',
        'name': name,
        'title': title,
        'href': base_url + prof.find('a')['href'],
    })

# convert to df
df = pd.DataFrame(results)
df

In [None]:
def get_bio(row):
    """Fetch one profile page and add 'email', 'bio' and 'listed_research_areas'.

    Args:
        row: Mapping-like row whose 'href' entry is the profile page URL.

    Returns:
        The same row with:
        - 'email': the text preceding '@queensu.ca' on its line plus the
          domain, or None when no such address is found;
        - 'bio': whitespace-normalised page text, or None when neither a
          'node__content' nor a 'people-about' div exists;
        - 'listed_research_areas': '; '-separated, title-cased topics, or None.
    """
    response = requests.get(row['href'])
    soup = BeautifulSoup(response.text, 'html.parser')

    content_divs = soup.find_all('div', {'class': 'node__content'})

    # main bio: first 'node__content' div, else the 'people-about' div, else None
    if content_divs:
        bio = content_divs[0].text
    else:
        about = soup.find('div', {'class': 'people-about'})
        bio = about.text if about else None

    # if there are two divs with class 'node__content', append the second one
    bio2 = content_divs[1].text if len(content_divs) > 1 else None
    if bio2:
        bio = bio + ' ' + bio2

    # get email: read it from the second block when there is one, otherwise
    # from the bio itself. (The fallback previously called bio2.split(...)
    # although bio2 is None or empty on that path.)
    email_source = bio2 if bio2 else bio
    if email_source and '@queensu.ca' in email_source:
        email = email_source.split('@queensu.ca')[0].split('\n')[-1] + '@queensu.ca'
        email = email.strip()
    else:
        email = None

    if bio:
        bio = bio.replace('\n', ' ')

        # remove non-breaking spaces
        bio = bio.replace(u'\xa0', u' ')

        # remove multiple spaces
        bio = ' '.join(bio.split())

        bio = bio.strip()

    # get areas of research expertise if listed by looking for div class: "field field--name-field-teaching-and-research field--type-entity-reference field--label-above"
    research_areas = soup.find('div', {'class': 'field field--name-field-teaching-and-research field--type-entity-reference field--label-above'})
    if research_areas:
        research_areas = research_areas.text
        research_areas = research_areas.replace('Teaching and Research Topics', ' ')
        # non-breaking spaces, newlines and slashes all act as topic separators
        research_areas = research_areas.replace(u'\xa0', '; ')
        research_areas = research_areas.replace('\n', '; ')
        research_areas = research_areas.replace('/', '; ')
        research_areas = [x.strip() for x in research_areas.split(';') if x.strip()]
        research_areas = '; '.join(research_areas).lstrip(';')
        # normalise apostrophe variants before title-casing
        research_areas = research_areas.replace('’', "'")
        research_areas = research_areas.replace("`", "'")
        research_areas = research_areas.title().strip()
        # title() capitalises the letter after an apostrophe; undo it for this case
        research_areas = research_areas.replace('Children\'S', 'Children\'s')
    else:
        research_areas = None

    row['email']=email
    row['bio']=bio
    row['listed_research_areas']=research_areas

    return row

# Run get_bio on each row (axis=1) to add email, bio and research areas
df = df.apply(get_bio, axis=1)

# Change order of columns
output_columns = ['faculty', 'name', 'title', 'email', 'href', 'bio', 'listed_research_areas']
df = df[output_columns]

# Save to json for future use
output_file = json_outpath + 'queens_bios.json'
df.to_json(output_file, orient='records', indent=2)

df

### (5) Scrape Western University Faculty Bios


In [None]:
# get list of faculty and their pages

# Define the base url for the faculty website
# (the trailing slash is needed: profile hrefs are appended to it directly)
base_url = "https://law.uwo.ca/about_us/faculty/"

# Get the html content of the faculty page
# base_url already ends with '/', so no leading slash here (the request
# previously went to '.../faculty//index.html')
response = requests.get(base_url + 'index.html')
soup = BeautifulSoup(response.content, "html.parser")

# Find all divs that are class 'teamgrid'
profs = soup.find_all('div', {'class': 'teamgrid'})

results = []
for prof in profs:
    result = {}
    result['faculty'] = 'western'
    infoleft = prof.find('div', {'class': 'infoleft'})
    name_link = infoleft.find('a')
    result['name'] = name_link.text.strip()
    result['name'] = result['name'].replace('(On Sabbatical Leave)', '').strip()
    # the title is the node immediately after the name link
    result['title'] = name_link.next_sibling.strip()
    inforight = prof.find('div', {'class': 'inforight'})
    result['email'] = inforight.find('a').text.strip()
    result['href'] = base_url + name_link['href']
    # research areas: the node right after the first tag that follows the
    # third sibling of the name link
    sibling = name_link.next_sibling.next_sibling.next_sibling.find_next_sibling()
    areas = sibling.next_sibling if sibling else None
    # only a text node (a str subclass) can be cleaned up; a tag or None here
    # means no research areas are listed
    if isinstance(areas, str):
        result['listed_research_areas'] = areas.replace(',', ';').strip()
    else:
        result['listed_research_areas'] = None

    results.append(result)


# convert to df
df = pd.DataFrame(results)

# drop rows where not research faculty
df = df[~df['title'].str.startswith('Director of Clinics')] # unclear whether tenure stream research faculty
df = df[~df['title'].str.startswith('Assistant Dean')]

df


In [None]:
def get_bio(row):
    """Fetch one profile page and store its flattened bio text in row['bio'].

    Args:
        row: Mapping-like row with 'href' (profile URL) and
            'listed_research_areas' entries.

    Returns:
        The same row. 'bio' is the text of the first <div> whose class
        includes 'grid_9', followed by ' Research areas: ...' when the row
        lists any, with whitespace collapsed; it is None when the div is
        missing.
    """
    page = requests.get(row['href'])
    content = BeautifulSoup(page.text, 'html.parser').find('div', {'class': 'grid_9'})

    if not content:
        row['bio']=None
    else:
        text = content.text
        if row['listed_research_areas']:
            text = text + ' Research areas: ' + row['listed_research_areas']
        # split() breaks on any whitespace, so this flattens newlines,
        # collapses repeated spaces and trims both ends
        row['bio'] = ' '.join(text.split())

    # short pause between requests
    time.sleep(0.25)

    return row

# Run get_bio on each row (axis=1) to add the bio text
df = df.apply(get_bio, axis=1)

# Change order of columns
output_columns = ['faculty', 'name', 'title', 'email', 'href', 'bio', 'listed_research_areas']
df = df[output_columns]

# Save to json for future use
output_file = json_outpath + 'western_bios.json'
df.to_json(output_file, orient='records', indent=2)

df

### (6) Scrape U Windsor Faculty Bios

In [16]:
base_url = 'https://www.uwindsor.ca/law/382/full-time-faculty'

response = requests.get(base_url)
soup = BeautifulSoup(response.text, 'html.parser')

# get all div class 'btgrid', leaving out the first one
initial_profs = soup.find_all('div', {'class': 'btgrid'})[1:]

# collect entries until the first one that mentions Professors Emeriti
profs = []
for prof in initial_profs:
    if 'Professors Emeriti' in prof.text:
        break
    profs.append(prof)

results = []
for prof in profs:
    first_link = prof.find('a')
    name = first_link.text

    # title: first line of the first paragraph, non-breaking spaces -> spaces
    title = prof.find('div', {'class': 'col col-md-10'}).find('p').text
    title = title.split('\n')[0].replace(u'\xa0', u' ')

    # the second link, when present, carries the email address
    all_links = prof.find_all('a')
    email = all_links[1].text if len(all_links) > 1 else None

    result = {
        'faculty': 'windsor',
        'name': name,
        'title': title,
        'email': email,
        'href': first_link['href'],
    }

    # Correct entries with no faculty bio link: the first link's text is then
    # an email address rather than a name
    if '@' in result['name']:
        result['email'] = result['name']
        result['href'] = None
        # take the name from the first h3 instead
        result['name'] = prof.find('h3').text

    results.append(result)

df = pd.DataFrame(results)
df

Unnamed: 0,faculty,name,title,email,href
0,windsor,Paul Ocheje,Associate Dean (Research and Graduate Studies),pocheje@uwindsor.ca,http://www.uwindsor.ca/law/pocheje/
1,windsor,Chris Fredette,Associate Dean (Academic),fredette@uwindsor.ca,https://www.uwindsor.ca/business/430/chris-fre...
2,windsor,Wissam Aoun,Assistant Professor,waoun@uwindsor.ca,https://www.uwindsor.ca/law/Wissam-Aoun
3,windsor,Jeff Berryman,Distinguished University Professor,jberrym@uwindsor.ca,http://www.uwindsor.ca/law/berryman/
4,windsor,Irina Ceric,Assistant Professor,Irina.Ceric@uwindsor.ca,https://www.uwindsor.ca/law/3049/irina-ceric-a...
5,windsor,Pascale Chapdelaine,Associate Professor,chapdel@uwindsor.ca,http://www.uwindsor.ca/law/chapdel/
6,windsor,Patricia Galvão Ferreira,Assistant Professor,Patricia.Galvao@uwindsor.ca,https://www.uwindsor.ca/law/Patr%C3%ADcia-Galv...
7,windsor,Beverly Jacobs,Currently on secondment to President's Office,beverly.jacobs@uwindsor.ca,https://www.uwindsor.ca/law/Beverly-Jacobs
8,windsor,Laverne Jacobs,Professor,ljacobs@uwindsor.ca,http://www.uwindsor.ca/law/ljacobs/
9,windsor,Danardo Jones,Assistant Professor,Danardo.Jones@uwindsor.ca,https://www.uwindsor.ca/law/Danardo-Jones


In [41]:
### Add faculty bios

def get_bio(row):
    """Fetch one profile page and add 'bio' and 'listed_research_areas' to the row.

    Args:
        row: Mapping-like row whose 'href' entry is the profile URL (or a
            falsy value when the faculty member has no profile page).

    Returns:
        The same row. 'bio' is the page text with newlines replaced by
        spaces (None when there is no page, no content container, or the
        text is empty), extended with ' Research areas: ...' when the sidebar
        has an 'Expertise' section. 'listed_research_areas' is that section
        as a '; '-separated, title-cased string, or None.
    """
    if not row['href']:
        row['bio'] = None
        # set both output fields so every returned row has the same keys
        row['listed_research_areas'] = None
        return row

    response = requests.get(row['href'])
    soup = BeautifulSoup(response.text, 'html.parser')

    # try the main content block first, then fall back to the content region
    container = soup.find('div', {'id': 'block-system-main'})
    if container is None:
        container = soup.find('div', {'class': 'region region-content'})

    # neither container present -> no bio (previously .text was read from
    # None, raising AttributeError)
    if container is not None:
        bio = container.text.replace('\n', ' ').strip()
    else:
        bio = None
    row['bio'] = bio if bio else None

    # try to get listed_research_areas
    areas = None
    sidebar = soup.find('div', {'class': 'region region-sidebar-second'})
    if sidebar and 'Expertise' in sidebar.text:
        # keep what follows 'Expertise', cut at the first '..'
        areas = sidebar.text.split('Expertise')[1]
        areas = areas.split('..')[0]
        areas = areas.strip()
        areas = areas.replace('\n', '; ').strip()
        areas = areas.title()
        # `bio or ''` keeps the concatenation valid when no bio text was found
        row['bio'] = (bio or '') + ' Research areas: ' + areas
    row['listed_research_areas'] = areas

    return row

# Run get_bio on each row (axis=1) to add the bio and research areas
df = df.apply(get_bio, axis=1)

# Change order of columns
output_columns = ['faculty', 'name', 'title', 'email', 'href', 'bio', 'listed_research_areas']
df = df[output_columns]

# Save to json for future use
output_file = json_outpath + 'windsor_bios.json'
df.to_json(output_file, orient='records', indent=2)

df


Unnamed: 0,faculty,name,title,email,href,bio,listed_research_areas
0,windsor,Paul Ocheje,Associate Dean (Research and Graduate Studies),pocheje@uwindsor.ca,http://www.uwindsor.ca/law/pocheje/,Professor Paul D. Ocheje joined the Faculty in...,International Development Law; Public Internat...
1,windsor,Chris Fredette,Associate Dean (Academic),fredette@uwindsor.ca,https://www.uwindsor.ca/business/430/chris-fre...,"Chris Fredette, Ph.D. Associate Dean Academic,...",
2,windsor,Wissam Aoun,Assistant Professor,waoun@uwindsor.ca,https://www.uwindsor.ca/law/Wissam-Aoun,Wissam Aoun joined the Faculty of Law in July ...,Intellectual Property Law; Patent Law ; Profes...
3,windsor,Jeff Berryman,Distinguished University Professor,jberrym@uwindsor.ca,http://www.uwindsor.ca/law/berryman/,Jeff Berryman is a Distinguished University Pr...,
4,windsor,Irina Ceric,Assistant Professor,Irina.Ceric@uwindsor.ca,https://www.uwindsor.ca/law/3049/irina-ceric-a...,Irina Ceric (she/her) joined the Faculty of La...,Law And Social Movements; Access To Justice; M...
5,windsor,Pascale Chapdelaine,Associate Professor,chapdel@uwindsor.ca,http://www.uwindsor.ca/law/chapdel/,Pascale Chapdelaine is Associate Professor at ...,Contracts; Intellectual Property; Consumer Law
6,windsor,Patricia Galvão Ferreira,Assistant Professor,Patricia.Galvao@uwindsor.ca,https://www.uwindsor.ca/law/Patr%C3%ADcia-Galv...,Patrícia Galvão Ferreira joined the Faculty of...,Law And Society; Transnational Climate Law; Cl...
7,windsor,Beverly Jacobs,Currently on secondment to President's Office,beverly.jacobs@uwindsor.ca,https://www.uwindsor.ca/law/Beverly-Jacobs,Dr. Beverly Jacobs is the Senior Advisor to th...,Indigenous Legal Orders; Indigenous Wholistic ...
8,windsor,Laverne Jacobs,Professor,ljacobs@uwindsor.ca,http://www.uwindsor.ca/law/ljacobs/,"Laverne Jacobs, PhD Full Professor Laverne Ja...",Administrative Law; Disability Rights; Socio-L...
9,windsor,Danardo Jones,Assistant Professor,Danardo.Jones@uwindsor.ca,https://www.uwindsor.ca/law/Danardo-Jones,Danardo Jones joined the Faculty of Law as an ...,Criminal Law; Criminal Procedure; Criminal Sen...


### (7) Scrape Bora Laskin Faculty of Law (Lakehead)


In [62]:
base_url = 'https://www.lakeheadu.ca'

response = requests.get(base_url + '/programs/departments/law/faculty')
soup = BeautifulSoup(response.text, 'html.parser')

# get all div class 'staff-listing'
profs = soup.find_all('div', {'class': 'staff-listing'})

results = []
for prof in profs:
    result = {}
    result['faculty'] = 'lakehead'
    # the h2 holds the first name; the last name is read from the node two
    # siblings after it
    first_name = prof.find('h2')
    last_name = first_name.next_sibling.next_sibling
    result['name'] = first_name.text.strip() + ' ' + last_name.text.strip()
    result['name'] = result['name'].replace('Dr. ', '').replace('Prof. ', '')
    result['title'] = prof.find('h3').text.strip()
    # first email-looking token in the listing text, or None when there is
    # none (indexing findall(...)[0] raised IndexError in that case)
    email_match = re.search(r'[\w\.-]+@[\w\.-]+', prof.text)
    result['email'] = email_match.group(0) if email_match else None
    # get link with the text that starts with "Read more"
    # (`string=` is the current name of this filter; `text=` is deprecated
    # in Beautiful Soup)
    result['href'] = base_url + prof.find('a', string=re.compile('^Read more'))['href']

    results.append(result)

# create df
df = pd.DataFrame(results)
df


Unnamed: 0,faculty,name,title,email,href
0,lakehead,Ryan Alford,Professor,ralford@lakeheadu.ca,https://www.lakeheadu.ca/users/A/ralford/node/...
1,lakehead,Joan Braun,Assistant Professor,jbraun2@lakeheadu.ca,https://www.lakeheadu.ca/users/B/jbraun2/node/...
2,lakehead,Mariette Brennan,Associate Professor,mbrennan@lakeheadu.ca,https://www.lakeheadu.ca/users/B/mbrennan/node...
3,lakehead,Tenille E. Brown,Assistant Professor,tenille.brown@lakeheadu.ca,https://www.lakeheadu.ca/users/B/tbrown5/node/...
4,lakehead,Frances E. Chapman,Professor,fchapman@lakeheadu.ca,https://www.lakeheadu.ca/users/C/fchapman/node...
5,lakehead,Daniel Dylan,Associate Professor,dwdylan@lakeheadu.ca,https://www.lakeheadu.ca/users/D/dwdylan/node/...
6,lakehead,Martin-Joe Ezeudu,Assistant Professor,mezeudu@lakeheadu.ca,https://www.lakeheadu.ca/users/E/mezeudu/node/...
7,lakehead,Jula Hughes,"Dean, Professor",jhughe11@lakeheadu.ca,https://www.lakeheadu.ca/users/H/jhughe11/node...
8,lakehead,Phil Lord,Assistant Professor,plord@lakeheadu.ca,https://www.lakeheadu.ca/users/L/plord/node/66558
9,lakehead,Wendy Parkes,Assistant Professor,wparkes@lakeheadu.ca,https://www.lakeheadu.ca/users/P/wparkes/node/...


In [71]:
# get bios from each href
def get_bio(row):
    """Fetch one profile page and add 'bio' and 'listed_research_areas' to the row.

    Args:
        row: Mapping-like row whose 'href' entry is the profile page URL.

    Returns:
        The same row. 'bio' is the whitespace-normalised text of the
        'region region-content' div, starting at 'Academic Qualifications:'
        when that heading is present. 'listed_research_areas' is always None.

    Raises:
        AttributeError: if the page has no 'region region-content' div.
    """
    response = requests.get(row['href'])
    soup = BeautifulSoup(response.text, 'html.parser')
    bio = soup.find('div', {'class': 'region region-content'}).text
    bio = bio.replace('\n', ' ').strip()
    bio = bio.replace(u'\xa0', u' ')
    bio = ' '.join(bio.split()).strip()
    # Test for the same marker (with colon) that the split uses: testing for
    # 'Academic Qualifications' alone made split(...)[1] raise IndexError on
    # pages where the heading has no colon.
    marker = 'Academic Qualifications:'
    if marker in bio:
        bio = 'Academic Qualifications: '+ bio.split(marker)[1]

    row['bio'] = bio
    row['listed_research_areas'] = None
    # NOTE: Some of the bios do have research areas listed, but they are mostly in narrative
    # rather than keyword form. Need to use NLP to extract.

    return row

# Run get_bio on each row (axis=1) to add the bio text
df = df.apply(get_bio, axis=1)

# save to json
output_file = json_outpath + 'lakehead_bios.json'
df.to_json(output_file, orient='records', indent=2)
df

Unnamed: 0,faculty,name,title,email,href,bio,listed_research_areas
0,lakehead,Ryan Alford,Professor,ralford@lakeheadu.ca,https://www.lakeheadu.ca/users/A/ralford/node/...,Academic Qualifications: Professor Alford rec...,
1,lakehead,Joan Braun,Assistant Professor,jbraun2@lakeheadu.ca,https://www.lakeheadu.ca/users/B/jbraun2/node/...,Academic Qualifications: PhD Candidate - Pete...,
2,lakehead,Mariette Brennan,Associate Professor,mbrennan@lakeheadu.ca,https://www.lakeheadu.ca/users/B/mbrennan/node...,Academic Qualifications: Dr. Brennan complete...,
3,lakehead,Tenille E. Brown,Assistant Professor,tenille.brown@lakeheadu.ca,https://www.lakeheadu.ca/users/B/tbrown5/node/...,"Academic Qualifications: LLB (Scots, Honours)...",
4,lakehead,Frances E. Chapman,Professor,fchapman@lakeheadu.ca,https://www.lakeheadu.ca/users/C/fchapman/node...,Academic Qualifications: Dr. Chapman was admi...,
5,lakehead,Daniel Dylan,Associate Professor,dwdylan@lakeheadu.ca,https://www.lakeheadu.ca/users/D/dwdylan/node/...,Academic Qualifications: LL.M. (University of...,
6,lakehead,Martin-Joe Ezeudu,Assistant Professor,mezeudu@lakeheadu.ca,https://www.lakeheadu.ca/users/E/mezeudu/node/...,Academic Qualifications: Dr. Martin-Joe Ezeud...,
7,lakehead,Jula Hughes,"Dean, Professor",jhughe11@lakeheadu.ca,https://www.lakeheadu.ca/users/H/jhughe11/node...,"Academic Qualifications: M.A., Ph.D. (Erlange...",
8,lakehead,Phil Lord,Assistant Professor,plord@lakeheadu.ca,https://www.lakeheadu.ca/users/L/plord/node/66558,Phil Lord Assistant Professor Department: Bora...,
9,lakehead,Wendy Parkes,Assistant Professor,wparkes@lakeheadu.ca,https://www.lakeheadu.ca/users/P/wparkes/node/...,Academic Qualifications: Professor Parkes has...,


### Combine scraped data into a single file

In [75]:
# Per-faculty files, in the order they are combined
bio_files = [
    'osgoode_bios.json',    # Osgoode
    'u_toronto_bios.json',  # U of T
    'tmu_bios.json',        # TMU
    'queens_bios.json',     # Queens
    'western_bios.json',    # Western
    'windsor_bios.json',    # Windsor
    'lakehead_bios.json',   # Lakehead
]

# start from the first file, then append each remaining file in turn
df = pd.read_json(json_outpath + bio_files[0])
for bio_file in bio_files[1:]:
    tempdf = pd.read_json(json_outpath + bio_file)
    df = pd.concat([df, tempdf], ignore_index=True)

# Save to json for future use
df.to_json(json_outpath + 'all_bios.json', orient='records', indent=2)

df

Unnamed: 0,faculty,name,title,email,href,bio,listed_research_areas
0,osgoode,Rabiat Akande,Assistant Professor,rakande@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,Professor Rabiat Akande works in the fields of...,legal history; law and religion; constitutiona...
1,osgoode,Saptarishi Bandopadhyay,Associate Professor,sbandopadhyay@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,I am an Associate Professor at Osgoode Hall La...,Law; history; and politics of Disasters; Inter...
2,osgoode,Stephanie Ben-Ishai,Professor and York University Distinguished Re...,sbenishai@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,Professor Stephanie Ben-Ishai is a Distinguish...,Corporate/Commercial Law
3,osgoode,Benjamin L. Berger,Professor & York Research Chair in Pluralism a...,bberger@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,Professor Benjamin L. Berger is Professor and ...,Law and Religion; Criminal and Constitutional ...
4,osgoode,Kate Glover Berger,Associate Professor,kgberger@osgoode.yorku.ca,https://www.osgoode.yorku.ca/faculty-and-staff...,Professor Kate Glover Berger joined the facult...,
...,...,...,...,...,...,...,...
267,lakehead,Wendy Parkes,Assistant Professor,wparkes@lakeheadu.ca,https://www.lakeheadu.ca/users/P/wparkes/node/...,Academic Qualifications: Professor Parkes has...,
268,lakehead,David Rosenberg,Assistant Professor,drosenbe@lakeheadu.ca,https://www.lakeheadu.ca/users/R/drosenbe/node...,Academic Qualifications: Hons. B.A. from the ...,
269,lakehead,Semie Sama,Assistant Professor,ssama@lakeheadu.ca,https://www.lakeheadu.ca/users/S/ssama/node/57002,Academic Qualifications: Ph.D. (University of...,
270,lakehead,Larissa Speak,Assistant Professor,lspeak@lakeheadu.ca,https://www.lakeheadu.ca/users/S/lspeak/node/7...,Larissa Speak Assistant Professor Department: ...,
