In [1]:
import pickle as pl
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

In [2]:
# Source pages for the IC2S2 programmes, grouped by conference year.

# 2019: oral presentations and posters are listed on separate pages.
url_2019_oral = 'https://2019.ic2s2.org/oral-presentations/'
url_2019_poster = 'https://2019.ic2s2.org/posters/'

# 2020: programme page plus the published Google Sheet that is scraped below.
# NOTE(review): the query string contains literal '&amp;' separators -- confirm
# this is the intended URL and not an HTML-escaping artefact.
url_2020 = 'https://ic2s2.mit.edu/program'
url2_2020 = (
    'https://docs.google.com/spreadsheets/d/e/'
    '2PACX-1vTX9_1Xftn7D-nSI8X9b7tafr_Z0kAbphKdfZ8qUSU9p-syXNsGPdhHl5ZyTnKKL-T6dCEJqtsrn3wy/'
    'pubhtml?gid=181378784&amp;single=true&amp;widget=true&amp;headers=false'
)

# 2021: EasyChair author index of the talks.
url_2021 = 'https://easychair.org/smart-program/IC2S2-2021/talk_author_index.html'

In [3]:
# Oral presentations (IC2S2 2019): scrape the programme page and collect the
# speaker names into `oral_speakers`.
oral_link = 'https://2019.ic2s2.org/oral-presentations/'
# Bound the wait and fail loudly on an HTTP error instead of silently parsing
# an error page as if it were the programme.
oral_r = requests.get(oral_link, timeout=30)
oral_r.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which
# optional parser libraries happen to be installed on this machine.
oral_soup = BeautifulSoup(oral_r.text, 'html.parser')

# Find the chair persons for each section (only their count is used below)
chairs = [i.text.replace('<em>Chair:','').replace('Chair: ','') for i in oral_soup.find_all("em")]

# Define list for speakers and edge case wording found through quality check of final list
oral_speakers = []
edge_cases = ['No Presentation', 'No presentation', 'No Presentation (cancelled)',  'No presentation (cancelled)',
              'Analyzing and Countermeasures', 'Policy Documents','and Twitter', 'UK and German Elections', 'collaboration',
              'and impact', 'cultural and scientific careers','Places and People: strategies',
              'limitations and trade-offs in the physical and digital worlds', 'which nodes should have the last word in multiplayer ultimatum bargaining?',
              'Stable Exchange Networks through Quenched Merchant Location and Idiosyncratic Trading Costs','Connecting Degree Programs from Individuals’ Choices',
              '000 Leagues Under the City','family and neighbor network','and Security']

# Known mis-parsed names (found by inspecting the final list) -> corrected name
name_corrections = {
    'James P': 'James P. Gleeson',
    '(Moved to 3D Text Analysis) Ivan Smirnov': 'Ivan Smirnov',
    'Alex Pentland. Fair': 'Alex Pentland',
}

# Every talk is introduced by a time slot such as "09:00 – 09:15 – ";
# compile the pattern once instead of on every paragraph.
time_slot = re.compile(r'[0-9]{2}:[0-9]{2} – [0-9]{2}:[0-9]{2} – ')

# NOTE(review): assumes the first three <p> elements precede the session
# listings and that there is one <p> per chair -- confirm against the page.
for paragraph in oral_soup.find_all('p')[3:len(chairs)+3]:

    # Split the program string into talks at the time stamps; the text before
    # the first time stamp is not a talk and is dropped
    program = time_slot.split(paragraph.text)[1:]

    # Loop through the program list to find and split the speakers
    for presentation in program:
        for speaker in presentation.split(','):
            # Drop leading whitespace. Empty fragments (e.g. from a trailing
            # comma) are skipped; indexing them used to raise IndexError.
            speaker = speaker.lstrip()
            if not speaker:
                continue

            # A string longer than 20 characters most likely still includes
            # the paper title: keep only the part before the first '.' / '–'
            if len(speaker) > 20:
                speaker = speaker.split('.')[0]
                speaker = speaker.split('–')[0]

            # Edge case handling: keep only what follows the last ': '
            if ': ' in speaker:
                speaker = speaker.split(': ')[-1]

            # Drop trailing whitespace; the steps above may leave nothing
            speaker = speaker.rstrip()
            if not speaker:
                continue

            # Last edge case handling: skip known non-names, fix known typos
            if speaker not in edge_cases:
                # Save to speakers list
                oral_speakers.append(name_corrections.get(speaker, speaker))

In [4]:
# Posters (IC2S2 2019): scrape the poster page and collect the presenter
# names into `poster_speakers`.
poster_link = 'https://2019.ic2s2.org/posters/'
# Bound the wait and fail loudly on an HTTP error instead of silently parsing
# an error page as if it were the poster list.
poster_r = requests.get(poster_link, timeout=30)
poster_r.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which
# optional parser libraries happen to be installed on this machine.
poster_soup = BeautifulSoup(poster_r.text, 'html.parser')

# Define list for speakers and edge case titles found through quality check of final list
poster_speakers = []
edge_cases = ['Structure and Evolution of the Network of Countries Signing Global Environmental Treaties',
              'Evolution of the Network of Countries Signing Global Environmental Treaties',
              'Social capital at IC2S2: A network analysis of the conference participants between 2015-2018.',
              'Invisible college in large co-authorship networks – iterative sampling approach',
              'Founders Dynamics: Interpersonal Relationships and Between-Team Interaction in Early Startups',
              'Bio, psycho or social – Discursive framing of depression in online health communities',
              'Evolution of Employment in the United States: A Longitudinal Study of Job Polarisation',
              'Tipping Points in Polarized Networks']

# Locate the content area explicitly so that a changed page layout produces a
# clear error rather than an AttributeError on None.
poster_container = poster_soup.find('div', class_="col-md-8 page-content-wrap")
if poster_container is None:
    raise RuntimeError(f"Poster list container not found on {poster_link}")

# Loop through the posters found on the webpage
for poster in poster_container.find_all('li'):

    # Cut the titles: keep the text before the first non-breaking space / newline
    speakers = poster.text.split('\xa0')[0]
    speakers = speakers.split('\n')[0]

    # Skip entries where the "found speaker" is actually one of the known titles
    if speakers in edge_cases:
        continue

    # Split the speakers string into individual names: first on commas, then
    # on ' and '. (The previous `'and' in speaker` guard also matched names
    # such as "Alexander"; splitting on ' and ' already leaves those intact.)
    for speaker in speakers.split(','):
        for sp in speaker.split(' and '):
            # Trim surrounding whitespace so the same person is not counted
            # twice, and skip empty fragments (e.g. from a trailing comma),
            # which used to raise IndexError.
            sp = sp.strip()
            if sp:
                # Save to speakers list
                poster_speakers.append(sp)

In [8]:
# Get data from 2020: the programme is a published Google Sheet rendered as an
# HTML table; load it into `df2020` and collect the presenters.
# Bound the wait and fail loudly on an HTTP error instead of parsing an error page.
r = requests.get(url2_2020, timeout=30)
r.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which
# optional parser libraries happen to be installed on this machine.
soup = BeautifulSoup(r.content, 'html.parser')
table = soup.find("table",{"class":"waffle"})
if table is None:
    raise RuntimeError("2020 programme table (class 'waffle') not found")
# Narrow to <tbody> when present; otherwise fall back to the table itself
table_body = table.find("tbody")
if table_body is not None:
    table = table_body
table_rows = table.find_all("tr")
if not table_rows:
    raise RuntimeError("2020 programme table contains no rows")

# Get header (the first row holds the column names)
ths = table_rows[0].find_all("td")
header = [th.text for th in ths]

# Get data (one list of cell texts per remaining row)
data = [[td.text for td in tr.find_all("td")] for tr in table_rows[1:]]

# Convert to dataframe
df2020 = pd.DataFrame(data, columns=header)

# Add 2020 data: a cell may list several presenters separated by ', '
speakers_2020 = []
for presenters in df2020['Presenters']:
    # Short rows are padded with missing values; there is nothing to split
    if not isinstance(presenters, str):
        continue
    for researcher in presenters.split(', '):
        # Blank cells used to contribute '' as a "speaker"; skip them
        researcher = researcher.strip()
        if researcher:
            speakers_2020.append(researcher)


In [10]:
# Get data from 2021: read the EasyChair author index and collect the author
# names into `speakers_2021`.
# Bound the wait and fail loudly on an HTTP error instead of parsing an error page.
r = requests.get(url_2021, timeout=30)
r.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which
# optional parser libraries happen to be installed on this machine.
soup = BeautifulSoup(r.content, 'html.parser')
table = soup.find("table",{"class":"index"})
if table is None:
    raise RuntimeError("2021 author index table (class 'index') not found")
rows = table.find_all("tr")

speakers_2021 = []
for row in rows[1:]:
    # Rows without a name cell are skipped. This is checked explicitly rather
    # than with a bare `except`, which would also hide unrelated errors.
    name_cell = row.find("td",{"class":"name"})
    if name_cell is None:
        continue

    raw_name = name_cell.text.strip()
    if not raw_name:
        continue

    # Names are listed as "Last, First"; reorder to "First Last"
    name = raw_name.split(", ")
    if len(name) > 1:
        speakers_2021.append(f"{name[1]} {name[0]}")
    else:
        # No ", " separator: keep the name as written (this used to raise
        # IndexError on name[1])
        speakers_2021.append(raw_name)

In [12]:
# Pool the names from the 2019 oral and poster lists with the 2020 and 2021
# lists; duplicates are kept at this stage.
all_speakers = [*oral_speakers, *poster_speakers, *speakers_2020, *speakers_2021]
print(f"Total number of speakers: {len(all_speakers)}")

Total number of speakers: 3287


In [17]:
# De-duplicate the pooled names. The result is sorted because plain set
# iteration order for strings changes between kernel runs (hash
# randomisation), which would make the saved list differ on every re-run.
all_unique_speakers = sorted(set(all_speakers))
print(f"Total number of unique speakers: {len(all_unique_speakers)}")

Total number of unique speakers: 2211


In [28]:
# Persist the de-duplicated speaker list for later use.
# `pl` is the pickle module, imported with the other imports at the top of the
# notebook so that all dependencies are visible in one place.
with open('all_unique_speakers.pkl', 'wb') as f:
    pl.dump(all_unique_speakers, f)

TypeError: 'NoneType' object is not iterable

In [None]:
for