## 第三周作业

#### 提交人学号：10245501425
#### 提交人姓名：刘至晗

#### 作业题目：
ESI学科数据获取及分析


In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


ESI_SUBJECTS = [
    "Agricultural Sciences",
    "Biology & Biochemistry",
    "Chemistry",
    "Clinical Medicine",
    "Computer Science",
    "Economics & Business",
    "Engineering",
    "Environment/Ecology",
    "Geosciences",
    "Immunology",
    "Materials Science",
    "Mathematics",
    "Microbiology",
    "Molecular Biology & Genetics",
    "Multidisciplinary",
    "Neuroscience & Behavior",
    "Pharmacology & Toxicology",
    "Physics",
    "Plant & Animal Science",
    "Psychiatry/Psychology",
    "Social Sciences, General",
    "Space Science"
]
# 数据保存路径
OUTPUT_EXCEL = "ESI各学科机构排名数据.xlsx"
ESI_PAGE_URL = "https://esi.clarivate.com/IndicatorsAction.action?app=esi&Init=Yes&authCode=null&SrcApp=IC2LS&SID=H4-x2BZzGy70LpHfaWqsn34C92MkUkG5J5onM-18x2duCbH4Zb2x2F2xxTPTpUU0DGKQx3Dx3DRmRYSQaxxQ1S5U5pyJ5a2vgx3Dx3D-deDoSViHIQYUGXyhfV4d4Ax3Dx3D-ucx2FlMPFCLJrFFs0K4gTuzQx3Dx3D"  # 替换为实际可访问的页面URL


def init_browser():
    options = Options()
    options.add_argument("--start-maximized")
    options.add_argument("--disable-gpu")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    driver.implicitly_wait(10)
    return driver

def crawl_subject_data(driver, subject):
    """
    按指定学科筛选数据，爬取机构排名
    :param driver: 浏览器驱动对象
    :param subject: 目标学科
    :return: 学科排名数据（DataFrame）
    """
    print(f"\n开始爬取【{subject}】学科数据...")
    
    try:
        add_filter_btn = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Add Filter')]"))
        )
        add_filter_btn.click()
        time.sleep(2)
        
        research_field_filter = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//div[contains(text(), 'Research Fields')]"))
        )
        research_field_filter.click()
        time.sleep(2)
        
        subject_option = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, f"//label[contains(text(), '{subject}')]"))
        )
        subject_option.click()
        time.sleep(3)  
        
    except Exception as e:
        print(f"筛选学科{subject}失败：{str(e)}")
        return None
    
    try:
        results_list_select = Select(WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.ID, "results-list-select"))  
        ))
        if results_list_select.first_selected_option.text != "Institutions":
            results_list_select.select_by_visible_text("Institutions")
            time.sleep(3)
        print(f"已确认结果维度为：Institutions")
    
    except Exception as e:
        print(f"切换Results List至Institutions失败：{str(e)}")
        return None
    

    try:
        table = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, "//table[contains(@class, 'esi-ranking-table')]"))  # 假设表格类名为esi-ranking-table
        )
        
        headers = []
        th_elements = table.find_elements(By.TAG_NAME, "th")
        for th in th_elements:
            headers.append(th.text.strip())
        if len(headers) < 6:
            headers = ["Rank", "Institution", "Country/Region", "Web of Science Documents", "Cites", "Cites/Paper"]
        
        rows = []
        tr_elements = table.find_elements(By.TAG_NAME, "tr")
        for tr in tr_elements[1:]: 
            td_elements = tr.find_elements(By.TAG_NAME, "td")
            if len(td_elements) < 6:
                continue  
            row = [td.text.strip() for td in td_elements]
            rows.append(row)
        
        df = pd.DataFrame(rows, columns=headers)
        df = df.dropna(subset=["Institution"])  
        numeric_cols = ["Web of Science Documents", "Cites", "Cites/Paper"]
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col].str.replace(",", ""), errors="coerce")
        
        print(f"【{subject}】学科爬取完成，共{len(df)}条机构数据")
        return df
    
    except Exception as e:
        print(f"爬取{subject}学科表格数据失败：{str(e)}")
        return None

def main_crawl():
    driver = init_browser()
    try:
        driver.get(ESI_PAGE_URL)
        print(f"已访问ESI页面：{ESI_PAGE_URL}")
        time.sleep(5) 
        
        with pd.ExcelWriter(OUTPUT_EXCEL, engine="openpyxl") as writer:
            for subject in ESI_SUBJECTS:
                subject_df = crawl_subject_data(driver, subject)
                if subject_df is not None and not subject_df.empty:
                    subject_df.to_excel(writer, sheet_name=subject, index=False)
                    print(f"已保存【{subject}】学科数据到Excel")
                else:
                    print(f"【{subject}】学科无有效数据，跳过保存")
                
                try:
                    clear_filter_btn = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Clear')]"))
                    )
                    clear_filter_btn.click()
                    time.sleep(2)
                except Exception as e:
                    print(f"重置筛选器失败：{str(e)}，将刷新页面重试")
                    driver.refresh()
                    time.sleep(5)
        
        print(f"\n所有学科爬取完成！数据已保存至：{OUTPUT_EXCEL}")
    
    finally:
        driver.quit()
        print("浏览器已关闭")

# 执行爬虫
if __name__ == "__main__":
    main_crawl()

已访问ESI页面：https://esi.clarivate.com/IndicatorsAction.action?app=esi&Init=Yes&authCode=null&SrcApp=IC2LS&SID=H4-x2BZzGy70LpHfaWqsn34C92MkUkG5J5onM-18x2duCbH4Zb2x2F2xxTPTpUU0DGKQx3Dx3DRmRYSQaxxQ1S5U5pyJ5a2vgx3Dx3D-deDoSViHIQYUGXyhfV4d4Ax3Dx3D-ucx2FlMPFCLJrFFs0K4gTuzQx3Dx3D

开始爬取【Agricultural Sciences】学科数据...
筛选学科Agricultural Sciences失败：Message: 
Stacktrace:
	GetHandleVerifier [0x0x12dfea3+66515]
	GetHandleVerifier [0x0x12dfee4+66580]
	(No symbol) [0x0x10cdc48]
	(No symbol) [0x0x1118704]
	(No symbol) [0x0x1118aab]
	(No symbol) [0x0x115f482]
	(No symbol) [0x0x113b214]
	(No symbol) [0x0x115cba7]
	(No symbol) [0x0x113afc6]
	(No symbol) [0x0x110c2ca]
	(No symbol) [0x0x110d154]
	GetHandleVerifier [0x0x15373d3+2521347]
	GetHandleVerifier [0x0x1532353+2500739]
	GetHandleVerifier [0x0x1307cf4+229924]
	GetHandleVerifier [0x0x12f8258+165768]
	GetHandleVerifier [0x0x12fed0d+193085]
	GetHandleVerifier [0x0x12e81b8+100072]
	GetHandleVerifier [0x0x12e8350+100480]
	GetHandleVerifier [0x0x12d260a+11066]
	

IndexError: At least one sheet must be visible