In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random

# Input dataset and the column in it that holds the drug names.
CSV_PATH = "./data/drugsComTest_raw.csv"
DRUG_COLUMN = "drugName"

# Read the CSV and collect the distinct drug names.
# Missing values are dropped before de-duplicating: Series.unique() keeps a
# NaN entry when the column has gaps, and NaN is a float, so the string
# operations applied to each name further down (.replace / .lower) would
# raise AttributeError on it.
df = pd.read_csv(CSV_PATH)
drug_names = df[DRUG_COLUMN].dropna().unique().tolist()
print(f"find {len(drug_names)} unique drug names in the {CSV_PATH} file.")


In [None]:
import re

def reconstruct(drug_name):
    """
    Turn a drug name into an underscore-separated alphanumeric token.

    The name is cut at every character that is not an ASCII letter or digit;
    the non-empty pieces that remain are joined with single underscores.
    A name without any ASCII letters or digits yields an empty string.
    """
    # Splitting on each separator character produces empty pieces between
    # adjacent separators; those are discarded before joining.
    pieces = [piece for piece in re.split(r'[^a-zA-Z0-9]', drug_name) if piece]
    return '_'.join(pieces)

# Demonstration: run reconstruct() on a name containing spaces and parentheses
# and show the input next to the result.
drug_name = "Aspirin 81 mg (Enteric Coated) (Bayer)"
reconstructed_name = reconstruct(drug_name)
print(
    f"Original drug name: {drug_name}",
    f"Reconstructed drug name: {reconstructed_name}",
    sep="\n",
)

In [None]:
import pdfkit
import os

# Location of the wkhtmltopdf executable handed to pdfkit.
# A non-empty WKHTMLTOPDF_PATH environment variable takes precedence, so the
# notebook can run on machines where the binary is not installed at the
# default Windows location used as the fallback below.
path_to_wkhtmltopdf = os.environ.get("WKHTMLTOPDF_PATH") or r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_to_wkhtmltopdf)

# Directory that receives the generated PDFs; created if it does not exist.
output_dir = './drug_info_pdfs/'
os.makedirs(output_dir, exist_ok=True)


def generate_pdf(url, output_name, output_dir, timeout=30):
    """
    Render the page at `url` to a PDF file, skipping pages that cannot be fetched.

    The URL is first probed with a plain GET request; only when that probe
    answers with HTTP 200 is the URL handed to pdfkit for conversion. Uses the
    module-level pdfkit `config`.

    Args:
        url: Address of the page to convert.
        output_name: File name of the PDF (joined onto `output_dir`).
        output_dir: Directory the PDF is written into.
        timeout: Seconds to wait for the probe request. Without a timeout,
            `requests.get` can block indefinitely on an unresponsive server.

    Returns:
        1 when the PDF was written; None when the URL could not be fetched,
        answered with a non-200 status, or the conversion failed.
    """
    # Check that the URL exists before invoking the converter.
    try:
        response = requests.get(url, timeout=timeout)
    except Exception as e:
        # Deliberately broad: any failure while probing just means "skip this URL".
        print(f"访问 {url} 时出错: {str(e)}，跳过PDF生成")
        return None

    if response.status_code != 200:
        print(f"URL {url} 不存在或无法访问 (状态码: {response.status_code})，跳过PDF生成")
        return None

    # The URL is reachable: generate the PDF.
    output_path = os.path.join(output_dir, output_name)
    try:
        pdfkit.from_url(url, output_path, configuration=config)
    except OSError as e:
        # A converter failure on one page must not abort the caller's batch
        # loop; report it and signal "no PDF" like the other failure paths.
        print(f"PDF generation failed for {url}: {str(e)}")
        return None

    print(f"PDF saved as {output_name}")
    return 1

# Only the first 10 drug names are processed, for testing.
drug_names = drug_names[:10]

# Names for which a PDF was produced, and names for which it was not.
found_drugs, not_found_drugs = [], []

for drug_name in drug_names:
    # Build the drug information page URL: spaces become hyphens, lower-cased.
    new_drug_name = drug_name.replace(" ", "-").lower()
    url = f"https://www.drugs.com/{new_drug_name}.html"
    print(f"Fetching {url}")

    # File name: every non-alphanumeric separator is turned into an underscore.
    new_drug_name = reconstruct(drug_name)
    output_name = f"{new_drug_name}.pdf"

    # generate_pdf returns a truthy value only when the PDF was saved.
    result = generate_pdf(url, output_name, output_dir)
    if result:
        found_drugs.append(drug_name)
    else:
        not_found_drugs.append(drug_name)

print(not_found_drugs)

# Save both lists as single-column CSV files: the not-found names first,
# then the found names.
for column, names, csv_name in (
    ("not_found_1", not_found_drugs, "not_found_drugs.csv"),
    ("found_1", found_drugs, "found_drugs.csv"),
):
    pd.DataFrame(names, columns=[column]).to_csv(os.path.join(output_dir, csv_name), index=False)


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Path to the chromedriver executable passed to the Selenium Service.
chrome_driver_path = "C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe"

# Chrome command-line switches, added in the order listed.
chrome_options = Options()
for chrome_flag in (
    "--headless",  # headless mode: no browser window is shown
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--enable-unsafe-swiftshader",
    "--ignore-certificate-errors",
    "--ignore-ssl-errors",
):
    chrome_options.add_argument(chrome_flag)

# A single shared WebDriver instance, so a browser is not created and
# destroyed for every lookup.
chrome_service = Service(executable_path=chrome_driver_path)
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

def get_first_drug_result_url(drug_name, wait_seconds=3):
    """
    Search drugs.com for a drug name and return the URL of the first result.

    The search page is loaded through the module-level Selenium `driver`, and
    the first element matching one of the known result-link selectors
    supplies the URL.

    Args:
        drug_name: Name to search for. It is URL-encoded before being placed
            in the query string.
        wait_seconds: How long to wait for a result link to appear for each
            selector before moving on.

    Returns:
        The `href` of the first result link, or None when no result link was
        found or any error occurred during the search.
    """
    # Imported locally so the function does not rely on an import made in
    # another notebook cell.
    from urllib.parse import quote_plus

    try:
        # Encode the term: a raw name containing spaces, "/", "&" or "+"
        # would otherwise corrupt the query string.
        search_url = f"https://www.drugs.com/search.php?searchterm={quote_plus(str(drug_name))}"
        print(f"正在搜索: {drug_name} - {search_url}")
        driver.get(search_url)

        # CSS selectors for result links, tried in order.
        selectors = [
            "a.ddc-search-result-link-wrap",  # primary target: the link wrapper element
        ]

        result_url = None
        for selector in selectors:
            try:
                # Wait briefly for the element to be present.
                WebDriverWait(driver, wait_seconds).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                if elements:
                    # Take the first result on the page.
                    result_url = elements[0].get_attribute("href")
                    print(f"使用选择器 '{selector}' 找到结果: {result_url}")
                    break
            except Exception as e:
                # Best-effort per selector (typically a wait timeout); name the
                # error type so failures can be told apart, then try the next.
                print(f"选择器 '{selector}' 查找失败 ({type(e).__name__})")

        if not result_url:
            print(f"未找到 {drug_name} 的搜索结果")
            return None

        return result_url

    except Exception as e:
        # Deliberately broad: a failed lookup is reported and treated as
        # "no result" so the caller's loop can continue.
        print(f"搜索 {drug_name} 时发生错误: {type(e).__name__}: {e}")
        return None

# Second pass: retry the names the direct-URL pass could not resolve, this
# time via the site search.
# NOTE: on the next line `output_dir` still refers to the first-pass
# directory; it is rebound to the second-pass directory right after, so this
# code expects the first-pass cell to have run before it.
not_found_drugs = pd.read_csv(os.path.join(output_dir, "not_found_drugs.csv"))["not_found_1"].tolist()
output_dir = './drug_info_pdfs2/'

os.makedirs(output_dir, exist_ok=True)
not_found_drugs_2 = []
found_drugs_2 = []
try:
    for drug_name in not_found_drugs:
        result_url = get_first_drug_result_url(drug_name)

        if not result_url:
            print(f"未能获取 {drug_name} 的URL")
            not_found_drugs_2.append(drug_name)
            continue

        print(f"最终获取的URL是: {result_url}")
        new_drug_name = reconstruct(drug_name)
        output_name = f"{new_drug_name}.pdf"
        # Count the drug as found only when the PDF was actually produced;
        # generate_pdf returns a falsy value when it skipped or failed.
        if generate_pdf(result_url, output_name, output_dir):
            found_drugs_2.append(drug_name)
        else:
            not_found_drugs_2.append(drug_name)
finally:
    # Always shut the browser down, even if the loop raised, so no Chrome
    # process is left behind.
    driver.quit()

pd.DataFrame(not_found_drugs_2, columns=["not_found_2"]).to_csv(os.path.join(output_dir, "not_found_drugs_2.csv"), index=False)
pd.DataFrame(found_drugs_2, columns=["found_2"]).to_csv(os.path.join(output_dir, "found_drugs_2.csv"), index=False)