In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time

In [2]:
# prepare all the urls needed to be scraped
urls = """Browse dictionary by letter:<br><a href="/medical-dictionary-of-health-terms/a-through-c#A-terms">A</a> | <a href="/medical-dictionary-of-health-terms/a-through-c#B-terms">B</a> | <a href="/medical-dictionary-of-health-terms/a-through-c#C-terms">C</a> | <a href="/medical-dictionary-of-health-terms/d-through-i#D-terms">D</a> | <a href="/medical-dictionary-of-health-terms/d-through-i#E-terms">E</a> | <a href="/medical-dictionary-of-health-terms/d-through-i#F-terms">F</a> | <a href="/medical-dictionary-of-health-terms/d-through-i#G-terms">G</a> | <a href="/medical-dictionary-of-health-terms/d-through-i#H-terms">H</a> | <a href="/medical-dictionary-of-health-terms/d-through-i#I-terms">I</a> | <a href="/medical-dictionary-of-health-terms/j-through-p#J-terms">J</a> | <a href="/medical-dictionary-of-health-terms/j-through-p#K-terms">K</a> | <a href="/medical-dictionary-of-health-terms/j-through-p#L-terms">L</a> | <a href="/medical-dictionary-of-health-terms/j-through-p#M-terms">M</a> | <a href="/medical-dictionary-of-health-terms/j-through-p#N-terms">N</a> | <a href="/medical-dictionary-of-health-terms/j-through-p#O-terms">O</a> | <a href="/medical-dictionary-of-health-terms/j-through-p#P-terms">P</a> | <a href="/medical-dictionary-of-health-terms/q-through-z#Q-terms">Q</a> | <a href="/medical-dictionary-of-health-terms/q-through-z#R-terms">R</a> | <a href="/medical-dictionary-of-health-terms/q-through-z#S-terms">S</a> | <a href="/medical-dictionary-of-health-terms/q-through-z#T-terms">T</a> | <a href="/medical-dictionary-of-health-terms/q-through-z#U-terms">U</a> | <a href="/medical-dictionary-of-health-terms/q-through-z#V-terms">V</a> | <a href="/medical-dictionary-of-health-terms/q-through-z#W-terms">W</a> | <a href="/medical-dictionary-of-health-terms/q-through-z#X-terms">X</a> | <a href="/medical-dictionary-of-health-terms/q-through-z#Y-terms">Y</a> | <a href="/medical-dictionary-of-health-terms/q-through-z#Z-terms">Z</a>"""
soup = BeautifulSoup(urls, 'html.parser')
links = soup.find_all("a", href=True)
part_links = set()
for link in links:
    part_links.add(link["href"].split('/')[2].split('#')[0])

In [3]:
# scrape all the terminology-explanation pairs
terminology_pairs = {}
root_link = "https://www.health.harvard.edu/"

for part_link in part_links:
    print(f"Extracting {part_link}")
    start_len = (len(terminology_pairs))
    
    # Fetch the webpage content
    url = root_link + part_link
    response = requests.get(url)
    content = response.content
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(content, 'html.parser')
    
    div_class = "content-repository-content prose max-w-md-lg mx-auto flow-root getShouldDisplayAdsAttribute"
    target_div = soup.find("div", class_=div_class)

    if target_div:
        # extract terminology-explanation pairs
        strong_elements = target_div.find_all("strong")
        for strong in strong_elements:
            term = strong.get_text(strip=True)
            # exclude the ":" at the end
            term = term[:-1]
            explanation = strong.next_sibling.strip()
            terminology_pairs[term] = explanation

    end_len = (len(terminology_pairs))
    print(f"Previous items: {start_len}, Current items: {end_len}, Increases: {end_len - start_len}\n")
    time.sleep(5)

Extracting j-through-p
Previous items: 0, Current items: 539, Increases: 539

Extracting a-through-c
Previous items: 539, Current items: 1069, Increases: 530

Extracting d-through-i
Previous items: 1069, Current items: 1591, Increases: 522

Extracting q-through-z
Previous items: 1591, Current items: 2052, Increases: 461



In [4]:
# Save the data dictionary as a JSON file
with open("harvard_medical_dictionary_test.json", "w", encoding="utf-8") as json_file:
    json.dump(terminology_pairs, json_file, indent=4, ensure_ascii=False)