In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random

CSV_PATH = "./data/drugsComTrain_raw.csv"
DRUG_COLUMN = "drugName"

# read drug names from CSV file
df = pd.read_csv(CSV_PATH)
drug_names = df[DRUG_COLUMN].unique().tolist()
print(f"find {len(drug_names)} unique drug names in the {CSV_PATH} file.")


find 3436 unique drug names in the ./data/drugsComTrain_raw.csv file.


In [2]:
import re

def reconstruct(drug_name):
    """
    Reconstruct the drug name by removing special characters and spaces.
    """
    # Remove special characters and spaces
    drug_name = re.split(r'[^a-zA-Z0-9]', drug_name)
    return '_'.join(filter(None, drug_name))

# example usage
drug_name = "Aspirin 81 mg (Enteric Coated) (Bayer)"
reconstructed_name = reconstruct(drug_name)
print(f"Original drug name: {drug_name}")
print(f"Reconstructed drug name: {reconstructed_name}")

Original drug name: Aspirin 81 mg (Enteric Coated) (Bayer)
Reconstructed drug name: Aspirin_81_mg_Enteric_Coated_Bayer


In [None]:
import pdfkit
import os
import tempfile

path_to_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_to_wkhtmltopdf)

output_dir = './test1/'
os.makedirs(output_dir, exist_ok=True)

def generate_pdf(url, output_name, output_dir):
    """
    抓取药物网页信息，清洗内容，并生成PDF文件
    
    Args:
        url (str): 药物信息页面的URL
        output_name (str): 输出PDF文件名
        output_dir (str): 输出目录
        
    Returns:
        int: 成功返回1，失败返回None
    """
    # 创建HTML子目录用于保存中间文件
    html_dir = os.path.join(output_dir, 'html')
    os.makedirs(html_dir, exist_ok=True)
    
    # 检查URL是否存在
    try:
        response = requests.get(url)
        if response.status_code != 200:
            print(f"URL {url} 不存在或无法访问 (状态码: {response.status_code})，跳过PDF生成")
            return None
    except Exception as e:
        print(f"访问 {url} 时出错: {str(e)}，跳过PDF生成")
        return None
    
    # 解析HTML内容
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # 清洗内容
    
    # 1. 移除广告相关内容
    for ad_div in soup.select('.display-ad, .display-ad-leaderboard, .ddc-content-promo-interactions'):
        if ad_div:
            ad_div.decompose()
    
    # 2. 移除Related/similar drugs部分
    related_drugs_header = soup.find('h2', id='related-drugs')
    if related_drugs_header:
        # 移除标题及之后的轮播内容
        carousel = related_drugs_header.find_next('div', class_='ddc-carousel-scroll')
        if related_drugs_header:
            related_drugs_header.decompose()
        if carousel:
            carousel.decompose()
    
    # 3. 移除More about药物部分及其后内容
    more_resources = soup.find('div', id='more-resources')
    if more_resources:
        # 移除该div及之后的所有兄弟元素
        current = more_resources
        while current:
            next_sibling = current.find_next_sibling()
            current.decompose()
            current = next_sibling
    
    # 4. 移除页脚
    footer = soup.find('footer')
    if footer:
        footer.decompose()
    
    # 5. 移除头部导航栏
    header = soup.find('header')
    if header:
        header.decompose()
    
    # 6. 移除侧边栏
    sidebar = soup.find('div', id='sidebar')
    if sidebar:
        sidebar.decompose()
    
    # 7. 移除社交分享按钮
    social_share = soup.find('div', class_='ddc-social-share')
    if social_share:
        social_share.decompose()
    
    # 8. 移除页面图标按钮
    page_icons = soup.find('div', class_='page-icons')
    if page_icons:
        page_icons.decompose()
        
    # 9. 移除Play pronunciation按钮
    pronunciation_button = soup.find('button', id='pronunciation')
    if pronunciation_button:
        pronunciation_button.decompose()
        
    # 10. 移除发音音频元素
    audio_element = soup.find('audio', id='pronounce-audio')
    if audio_element:
        audio_element.decompose()
        
    # 保存原始HTML (可选，用于调试)
    html_filename = output_name.replace('.pdf', '.html')
    original_html_path = os.path.join(html_dir, f"original_{html_filename}")
    with open(original_html_path, 'wb') as f:
        f.write(response.content)
    
    # 保存清洗后的HTML
    cleaned_html_path = os.path.join(html_dir, html_filename)
    with open(cleaned_html_path, 'w', encoding='utf-8') as f:
        f.write(str(soup))
    
    try:
        # 设置wkhtmltopdf配置
        path_to_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
        config = pdfkit.configuration(wkhtmltopdf=path_to_wkhtmltopdf)
        
        # 设置PDF生成选项，提高稳定性
        options = {
            'quiet': '',
            'encoding': 'UTF-8',
            'no-outline': None
        }
        
        # 从本地清洗后的HTML文件生成PDF
        output_path = os.path.join(output_dir, output_name)
        pdfkit.from_file(cleaned_html_path, output_path, configuration=config, options=options)
        print(f"PDF saved as {output_name}")
        
        return 1
    except Exception as e:
        print(f"生成PDF时出错: {str(e)}")
        return None

# drug_names = drug_names[:10]  # 仅处理前10个药物名称以进行测试
# 将找不到的药物名称存储在一个列表中
not_found_drugs = []
found_drugs = []
for drug_name in drug_names:
    # 生成药物信息页面的 URL
    new_drug_name = drug_name.replace(" ", "-").lower()
    url = f"https://www.drugs.com/{new_drug_name}.html"
    print(f"Fetching {url}")
    # 将所有的非字母数字下划线的符号转为下划线
    new_drug_name = reconstruct(drug_name)
    output_name = f"{new_drug_name}.pdf"
    result = generate_pdf(url, output_name,output_dir)
    if not result:
        not_found_drugs.append(drug_name)
    else:
        found_drugs.append(drug_name)
print(not_found_drugs)

print(f'found {len(found_drugs)} drugs, not found {len(not_found_drugs)} drugs.')
# 将找不到的药物名称保存到 CSV 文件
pd.DataFrame(not_found_drugs, columns=["not_found_1"]).to_csv(os.path.join(output_dir, "not_found_drugs.csv"), index=False)
# 将找到的药物名称保存到 CSV 文件
pd.DataFrame(found_drugs, columns=["found_1"]).to_csv(os.path.join(output_dir, "found_drugs.csv"), index=False)


Fetching https://www.drugs.com/valsartan.html
生成PDF时出错: wkhtmltopdf reported an error:
Exit with code 1 due to network error: ProtocolUnknownError

Fetching https://www.drugs.com/guanfacine.html
生成PDF时出错: wkhtmltopdf reported an error:
Exit with code 1 due to network error: ProtocolUnknownError

Fetching https://www.drugs.com/lybrel.html


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_driver_path = "C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe"
# 设置Chrome选项
chrome_options = Options()
chrome_options.add_argument("--headless")  # 无头模式，不显示浏览器窗口
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--enable-unsafe-swiftshader")
chrome_options.add_argument("--ignore-certificate-errors")
chrome_options.add_argument("--ignore-ssl-errors")

# 创建一个单独的WebDriver实例，避免多次创建和销毁
driver = webdriver.Chrome(service=Service(executable_path=chrome_driver_path), options=chrome_options)

def get_first_drug_result_url(drug_name):
    """
    根据药品名称在drugs.com搜索并获取第一个结果的URL
    """
    try:
        search_url = f"https://www.drugs.com/search.php?searchterm={drug_name}"
        print(f"正在搜索: {drug_name} - {search_url}")
        driver.get(search_url)
        
        # 基于实际HTML结构更新选择器列表
        selectors = [
            "a.ddc-search-result-link-wrap",  # 主要目标：直接链接包装元素
        ]
        
        result_url = None
        for selector in selectors:
            try:
                # 短暂等待元素加载
                WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                if elements:
                    for element in elements:
                        url = element.get_attribute("href")
                    # 如果没找到精确匹配，返回第一个结果
                    result_url = elements[0].get_attribute("href")
                    print(f"使用选择器 '{selector}' 找到结果: {result_url}")
                    break
            except Exception as e:
                print(f"选择器 '{selector}' 查找失败")
        
        if not result_url:
            print(f"未找到 {drug_name} 的搜索结果")
            return None
            
        return result_url
        
    except Exception as e:
        print(f"搜索 {drug_name} 时发生错误")
        return None

not_found_drugs = pd.read_csv(os.path.join(output_dir, "not_found_drugs.csv"))["not_found_1"].tolist()
output_dir = './test2/'

os.makedirs(output_dir, exist_ok=True)
not_found_drugs_2 = []
found_drugs_2 = []
for drug_name in not_found_drugs:
    result_url = get_first_drug_result_url(drug_name)

    if result_url:
        print(f"最终获取的URL是: {result_url}")
        new_drug_name = reconstruct(drug_name)
        output_name = f"{new_drug_name}.pdf"
        generate_pdf(result_url, output_name,output_dir)
        found_drugs_2.append(drug_name)
    else:
        print(f"未能获取 {drug_name} 的URL")
        not_found_drugs_2.append(drug_name)

driver.quit()
print(f'found {len(found_drugs_2)} drugs, not found {len(not_found_drugs_2)} drugs.')
pd.DataFrame(not_found_drugs_2, columns=["not_found_2"]).to_csv(os.path.join(output_dir, "not_found_drugs_2.csv"), index=False)
pd.DataFrame(found_drugs_2, columns=["found_2"]).to_csv(os.path.join(output_dir, "found_drugs_2.csv"), index=False)