In [119]:
import csv
import json
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

In [113]:
# Site root; prepended to the relative links scraped from the pages.
domain = "https://ki.se"
# Starting page of the crawl (the research-groups index).
url0 = "https://ki.se/en/research/research-areas-centres-and-networks/research-groups"
# One shared HTTP session, reused by every plain (non-browser) request.
session = requests.Session()

In [58]:
# Chrome options for the Selenium browser: headless, with a desktop user agent.
options = Options()
for chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.121 Safari/537.36",
):
    options.add_argument(chrome_flag)

# Launch Chrome through the chromedriver binary located at this fixed path.
service = Service('/usr/local/bin/chromedriver')
driver = webdriver.Chrome(service=service, options=options)

In [None]:
def get_text_or_default(element, default=""):
    """Return the stripped text of ``element``, or ``default`` when it is falsy.

    Args:
        element: Object exposing ``get_text(strip=True)``, or ``None``.
        default: Value handed back when ``element`` is falsy.

    Returns:
        ``element.get_text(strip=True)`` for a truthy ``element``, otherwise
        ``default``.
    """
    if not element:
        return default
    return element.get_text(strip=True)

In [114]:
def try_request(url):
    """Fetch ``url`` with the shared ``session`` and return the response.

    Args:
        url: URL to request with HTTP GET.

    Returns:
        The response object when the request succeeds with a non-error status,
        otherwise ``None`` (the failure is printed, not raised).
    """
    try:
        response = session.get(url, timeout=10)
        # Turn 4xx/5xx answers into exceptions so they take the error path below.
        response.raise_for_status()
        return response
    except requests.RequestException as e:
        # Report the URL that actually failed, not the crawl's start URL.
        print(f"Error fetching {url}: {e}")
        return None

## Subject groups

In [116]:
# Download the index page of the crawl.
response = try_request(url0)
if response is None:
    # try_request returns None after printing the cause; stop with a clear
    # message instead of an AttributeError on ``None.content`` below.
    raise RuntimeError(f"Could not download the research-groups index page: {url0}")
soup = BeautifulSoup(response.content, 'html.parser')

In [117]:
# One record per link found inside the "promo" link cards of the index page.
subject_groups = [
    {
        "name": anchor.get_text(strip=True),
        "link": domain + anchor.get('href'),
    }
    for promo in soup.select('article.promo.type--link')
    for anchor in promo.select('a')
]

subject_groups

[{'name': 'Anesthesiology and intensive care',
  'link': 'https://ki.se/en/research/research-areas-centres-and-networks/research-groups/research-groups-in-anesthesiology-and-intensive-care'},
 {'name': 'Artificial intelligence',
  'link': 'https://ki.se/en/research/research-areas-centres-and-networks/research-groups/research-groups-in-artificial-intelligence'},
 {'name': 'Biochemistry',
  'link': 'https://ki.se/en/research/research-areas-centres-and-networks/research-groups/research-groups-in-biochemistry'},
 {'name': 'Bioinformatics and systems biology',
  'link': 'https://ki.se/en/research/research-areas-centres-and-networks/research-groups/research-groups-in-bioinformatics-and-systems-biology'},
 {'name': 'Biomaterials science',
  'link': 'https://ki.se/en/research/research-areas-centres-and-networks/research-groups/research-groups-in-biomaterials-science'},
 {'name': 'Biophysics',
  'link': 'https://ki.se/en/research/research-areas-centres-and-networks/research-groups/research-grou

In [120]:
# Persist the subject-group list; create the output folder if it is missing
# so the cell also works on a fresh checkout.
os.makedirs("data", exist_ok=True)
with open("data/subject_groups.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII characters as real UTF-8 text
    # instead of \uXXXX escapes (the file is opened as UTF-8).
    json.dump(subject_groups, f, indent=4, ensure_ascii=False)

## Subgroups

In [None]:
# Load every subject-group page in the Selenium browser and collect the
# entries matched by "div.rims-filter-group" as that group's subgroups.
for group in subject_groups:
    url = group["link"]
    subgroups = []
    # Set the key up front so it exists even when the page cannot be loaded.
    group["subgroups"] = subgroups

    try:
        driver.get(url)
        # Block (up to 10 s) until at least one subgroup entry is in the DOM.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.rims-filter-group"))
        )
        html = driver.page_source
    except (TimeoutException, WebDriverException) as e:
        print(f"Error fetching {url}: {e}")
        # Skip this group: otherwise ``html`` is undefined on a first failure,
        # or still holds the previous group's page and yields wrong subgroups.
        continue

    soup = BeautifulSoup(html, 'html.parser')
    for subgroup in soup.select("div.rims-filter-group"):
        anchor = subgroup.select_one("a")
        href = anchor.get("href") if anchor is not None else None
        if not href:
            # Entry without a usable link: there is no page to follow later.
            continue
        subgroups.append({
            "name": anchor.get_text(strip=True),
            "link": domain + href,
        })

In [None]:
# Persist the groups together with their subgroups; create the output folder
# if it is missing so the cell also works on a fresh checkout.
os.makedirs("data", exist_ok=True)
with open("data/subgroups.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII characters as real UTF-8 text
    # instead of \uXXXX escapes (the file is opened as UTF-8).
    json.dump(subject_groups, f, indent=4, ensure_ascii=False)

## Researchers

In [108]:
# For every subgroup, download its page and collect the staff profile cards.
for group in tqdm(subject_groups, desc="Processing subject groups", leave=True):
    # The inner description is built here, where ``group`` is bound. Building
    # it in the outer ``tqdm(...)`` call read the variable before the loop
    # assigned it, which showed a stale name (or failed on a fresh kernel).
    for subgroup in tqdm(
        group.get("subgroups", []),
        desc=f"Processing subgroups in {group['name']}",
        leave=False,  # drop finished inner bars so the output stays readable
    ):
        researchers = []
        # Set the key up front so it exists even when the request fails.
        subgroup["researchers"] = researchers

        url = subgroup["link"] + "#tab-staff-and-contact"
        time.sleep(.15)  # brief pause between requests (presumably rate limiting)
        # try_request applies a timeout and returns None on failure, so a
        # single bad page no longer aborts or hangs the whole crawl.
        response = try_request(url)
        if response is None:
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        for researcher in soup.select("div.profile-card.profile-card--small"):
            # Look each element up once and reuse it for every derived field.
            name_link = researcher.select_one("a.profile-card__name")
            title_tag = researcher.select_one("span.profile-card__title")
            contact_box = researcher.select_one("div.profile-card__contact")
            researchers.append({
                "name": get_text_or_default(name_link),
                "title": get_text_or_default(title_tag),
                "contact": [a.get_text(strip=True) for a in contact_box.select("a")] if contact_box else [],
                "link": name_link.get('href') if name_link else "",
            })


Processing researchers in Urology – Olof Akre's research group: 100%|██████████| 17/17 [00:20<00:00,  1.22s/it]
Processing researchers in Traumatic brain injuries and neuro-monitoring – Eric Thelin's research group: 100%|██████████| 9/9 [00:09<00:00,  1.00s/it]
Processing researchers in Structural studies of fertilisation and zona pellucida module proteins – Luca Jovine's research group: 100%|██████████| 7/7 [00:07<00:00,  1.12s/it]
Processing researchers in Structural studies of fertilisation and zona pellucida module proteins – Luca Jovine's research group: 100%|██████████| 28/28 [00:26<00:00,  1.05it/s]
Processing researchers in Ziegenhain lab: 100%|██████████| 6/6 [00:06<00:00,  1.08s/it]26s/it]
Processing researchers in Spider silk biology for biomedical applications – Anna Rising's research group: 100%|██████████| 8/8 [00:05<00:00,  1.52it/s]
Processing researchers in Structural studies of fertilisation and zona pellucida module proteins – Luca Jovine's research group:  29%|██▊  

KeyboardInterrupt: 

In [None]:
# Persist the full nested structure (groups, subgroups, researchers); create
# the output folder if it is missing so the cell works on a fresh checkout.
os.makedirs("data", exist_ok=True)
with open("data/researchers.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII characters as real UTF-8 text
    # instead of \uXXXX escapes (the file is opened as UTF-8).
    json.dump(subject_groups, f, indent=4, ensure_ascii=False)

## Dataset

In [104]:
# Flatten the nested group -> subgroup -> researcher structure into flat rows.
data = []
for group in subject_groups:
    # .get(..., []) tolerates entries whose scrape was skipped or interrupted
    # and therefore never received the key.
    for subgroup in group.get("subgroups", []):
        for researcher in subgroup.get("researchers", []):
            contact = researcher["contact"]
            if len(contact) == 2:
                # Two contact links are read as (number, email), in page order.
                number, email = contact
            else:
                # Otherwise the first link is taken as the e-mail. An empty
                # contact list previously raised IndexError here.
                number = ""
                email = contact[0] if contact else ""
            data.append({
                "research subject": group["name"],
                "research group": subgroup["name"],
                "name": researcher["name"],
                "title": researcher["title"],
                "number": number,
                "email": email,
                # "address": researcher.get("address", "N/A")
            })

# Show a small sample only; echoing the whole list floods the notebook output.
data[:5]

[{'research subject': 'Anesthesiology and intensive care',
  'research group': "Anaesthesia and Intensive care – Rebecka Rubenson Wahlin/Anna Schandl's research group",
  'name': 'Anca Balintescu',
  'title': 'Affiliated to Research',
  'number': '',
  'email': 'anca.balintescu@ki.se'},
 {'research subject': 'Anesthesiology and intensive care',
  'research group': "Anaesthesia and Intensive care – Rebecka Rubenson Wahlin/Anna Schandl's research group",
  'name': 'Karin Berggren',
  'title': 'Phd Student',
  'number': '',
  'email': 'karin.berggren@ki.se'},
 {'research subject': 'Anesthesiology and intensive care',
  'research group': "Anaesthesia and Intensive care – Rebecka Rubenson Wahlin/Anna Schandl's research group",
  'name': 'Jacob Broms',
  'title': 'Phd Student',
  'number': '',
  'email': 'jacob.broms@ki.se'},
 {'research subject': 'Anesthesiology and intensive care',
  'research group': "Anaesthesia and Intensive care – Rebecka Rubenson Wahlin/Anna Schandl's research group",

In [None]:
# Persist the flat rows; create the output folder if it is missing so the
# cell also works on a fresh checkout.
os.makedirs("data", exist_ok=True)
with open("data/data.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII characters as real UTF-8 text
    # instead of \uXXXX escapes (the file is opened as UTF-8).
    json.dump(data, f, indent=4, ensure_ascii=False)

In [105]:
# Build a table from the flat row dicts; the bare name on the last line makes
# the notebook render the frame.
df = pd.DataFrame(data=data)
df

Unnamed: 0,research subject,research group,name,title,number,email
0,Anesthesiology and intensive care,Anaesthesia and Intensive care – Rebecka Ruben...,Anca Balintescu,Affiliated to Research,,anca.balintescu@ki.se
1,Anesthesiology and intensive care,Anaesthesia and Intensive care – Rebecka Ruben...,Karin Berggren,Phd Student,,karin.berggren@ki.se
2,Anesthesiology and intensive care,Anaesthesia and Intensive care – Rebecka Ruben...,Jacob Broms,Phd Student,,jacob.broms@ki.se
3,Anesthesiology and intensive care,Anaesthesia and Intensive care – Rebecka Ruben...,Jens Christensen,Phd Student,,jens.christensen@ki.se
4,Anesthesiology and intensive care,Anaesthesia and Intensive care – Rebecka Ruben...,Maria Cronhjort,Affiliated to Research,,maria.cronhjort@ki.se
...,...,...,...,...,...,...
24438,Urology and nephrology,Urology – Olof Akre's research group,Kalle Svennersten,Affiliated to Research,,karl.svennersten@ki.se
24439,Urology and nephrology,Urology – Olof Akre's research group,Per Henrik Vincent,Affiliated to Research,,per.henrik.vincent@ki.se
24440,Urology and nephrology,Urology – Olof Akre's research group,N Peter Wiklund,Professor/Senior Physician,,peter.wiklund@ki.se
24441,Urology and nephrology,Urology – Olof Akre's research group,Helena Zander Ögren,Administrator,,helena.zander.ogren@ki.se


In [107]:
# Export the table to an Excel workbook, leaving out the row index.
df.to_excel(
    "Karolinska_researchers.xlsx",
    engine='openpyxl',
    index=False,
)