In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import base64
from PIL import Image
from io import BytesIO
import numpy as np
import time
from io import StringIO
import requests
import requests_cache

# 初始化缓存（如果需要）
requests_cache.install_cache('demo_cache', expire_after=36000)

options = Options()
options.add_argument('user-data-dir=C:\\Users\\20586\\AppData\\Local\\Google\\Chrome\\User Data')
options.add_argument('profile-directory=Default')
service = Service('chromedriver.exe')

driver = webdriver.Chrome(options=options, service=service)

In [2]:

driver.get('https://www.morningstar.cn/quickrank/default.aspx')
wait = WebDriverWait(driver, 30)



In [3]:

# 定义获取 base64 编码的后端方法
def get_image_base64(img_src) -> str:
    base64_img = driver.execute_script("""
    function getGifAsBase64(targetSrc) {
  // Find all image elements on the page
  const images = document.getElementsByTagName('img');
  
  // Initialize a variable to store the target image
  let targetImage = null;
  
  // Loop through the images to find the first one with the matching src
  for (let img of images) {
    if (img.src === targetSrc) {
      targetImage = img;
      break;
    }
  }
  
  // If no matching image is found, return null
  if (!targetImage) {
    return null;
  }
  
  // Create a canvas to draw the image
  const canvas = document.createElement('canvas');
  const context = canvas.getContext('2d');
  
  // Set canvas dimensions to the image dimensions
  canvas.width = targetImage.width;
  canvas.height = targetImage.height;
  
  // Draw the image onto the canvas
  context.drawImage(targetImage, 0, 0);
  
  // Get the data URL of the image in Base64 format
  const dataUrl = canvas.toDataURL('image/gif');
  
  // Extract the Base64 string from the data URL
  const base64String = dataUrl.split(',')[1];
  
  return base64String;
}
return getGifAsBase64(arguments[0]);
    """, img_src)
    print(base64_img)
    return base64_img



In [8]:
def readhtml(driver):
    return pd.read_html(StringIO(driver.page_source))[-1]


# 解析表格
def parse_table(driver) -> DataFrame:
    try:
        page_source = driver.page_source
        table = BeautifulSoup(page_source, 'html.parser').find_all('table')[-1]
        rows = []
        for row in table.find_all('tr'):
            cols = []
            for col in row.find_all(['td', 'th']):
                img_tag = col.find('img')
                if img_tag:
                    img_src = img_tag['src']
                    cols.append(img_src)
                else:
                    cols.append(col.text.strip())
            rows.append(cols)

        df = DataFrame(rows[1:], columns=rows[0])
        return df[['代码', '基金名称', '基金分类', '晨星评级(三年)', '晨星评级(五年)', '净值日期', '单位净值(元)',
                   '净值日变动(元)', '今年以来回报(%)']]
    except Exception as e:
        print(f"Error parsing table: {e}")
        return pd.DataFrame()


df_origin = parse_table(driver)
df_origin

Unnamed: 0,代码,基金名称,基金分类,晨星评级(三年),晨星评级(五年),净值日期,单位净值(元),净值日变动(元),今年以来回报(%)
0,21620,天弘中证油气产业指数发起C,行业股票 - 其它,https://www.morningstar.cn/sitedataapi/Cryptog...,https://www.morningstar.cn/sitedataapi/Cryptog...,-,-,-,-
1,21728,兴业福益债券C,普通债券,https://www.morningstar.cn/sitedataapi/Cryptog...,https://www.morningstar.cn/sitedataapi/Cryptog...,2024-07-04,1.1263,-0.0018,-
2,21741,嘉实新财富混合 C,灵活配置,https://www.morningstar.cn/sitedataapi/Cryptog...,https://www.morningstar.cn/sitedataapi/Cryptog...,2024-07-04,0.7260,0.0000,-
3,159585,富国中证全指软件ETF,行业股票-科技、传媒及通讯,https://www.morningstar.cn/sitedataapi/Cryptog...,https://www.morningstar.cn/sitedataapi/Cryptog...,-,-,-,-
4,20530,汇安中债0-3年政金债指数A,短债,https://www.morningstar.cn/sitedataapi/Cryptog...,https://www.morningstar.cn/sitedataapi/Cryptog...,-,-,-,-
5,21187,摩根红利优选股票A,大盘价值股票,https://www.morningstar.cn/sitedataapi/Cryptog...,https://www.morningstar.cn/sitedataapi/Cryptog...,-,-,-,-
6,20531,汇安中债0-3年政金债指数C,短债,https://www.morningstar.cn/sitedataapi/Cryptog...,https://www.morningstar.cn/sitedataapi/Cryptog...,-,-,-,-
7,21188,摩根红利优选股票C,大盘价值股票,https://www.morningstar.cn/sitedataapi/Cryptog...,https://www.morningstar.cn/sitedataapi/Cryptog...,-,-,-,-
8,21241,永赢逸享债券A,积极债券,https://www.morningstar.cn/sitedataapi/Cryptog...,https://www.morningstar.cn/sitedataapi/Cryptog...,-,-,-,-
9,21242,永赢逸享债券C,积极债券,https://www.morningstar.cn/sitedataapi/Cryptog...,https://www.morningstar.cn/sitedataapi/Cryptog...,-,-,-,-


In [4]:

# 预加载 star 图片
pattern_base64s = {}
for i in range(6):
    base64_img = base64.b64encode(open(f'star/{i}.gif', 'rb').read()).decode('utf-8')
    pattern_base64s[base64_img] = i

pattern_hashs = {}

for i in range(6):
    img = Image.open(f'star/{i}.gif')
    img_hash = hash(img.tobytes())
    pattern_hashs[img_hash] = i

pd.DataFrame(pattern_base64s.items(), columns=['base64', 'star'])

Unnamed: 0,base64,star
0,R0lGODlhQgARAPcAAP//////zP//mf//Zv//M///AP/M//...,0
1,R0lGODlhQgARAPcAAP//////zP//mf//Zv//M///AP/M//...,1
2,R0lGODlhQgARAPcAAP//////zP//mf//Zv//M///AP/M//...,2
3,R0lGODlhQgARAPcAAP//////zP//mf//Zv//M///AP/M//...,3
4,R0lGODlhQgARAPcAAP//////zP//mf//Zv//M///AP/M//...,4
5,R0lGODlhQgARAPcAAP//////zP//mf//Zv//M///AP/M//...,5


In [5]:

def count_star_by_base64(image_base64: str) -> int:
    return pattern_base64s.get(image_base64, -1)


def count_star_by_hash(url: str) -> int:
    image = requests.get(url).content
    img = Image.open(BytesIO(image))
    img_hash = hash(img.tobytes())
    return pattern_hashs.get(img_hash, -1)




In [47]:

df = df_origin.copy()
df['晨星评级(三年)'] = df['晨星评级(三年)'].apply(count_star_by_hash)
df['晨星评级(五年)'] = df['晨星评级(五年)'].apply(count_star_by_hash)

df_star_counted = df

df_star_counted.to_csv('fund_list.csv', index=False, encoding='utf-8-sig', mode='w', header=True)


In [9]:
def save_to_csv(df: DataFrame, path: str):
    df.to_csv(path, index=False, encoding='utf-8-sig', mode='a', header=False)


while True:
    try:
        next_page_btn = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="ctl00_cphMain_AspNetPager1"]/a[8]')))
        next_page_btn.click()
        time.sleep(2)  # 等待页面加载

        df_tmp = parse_table(driver)
        if df_tmp.empty:
            break

        df_tmp['晨星评级(三年)'] = df_tmp['晨星评级(三年)'].apply(count_star_by_hash)
        df_tmp['晨星评级(五年)'] = df_tmp['晨星评级(五年)'].apply(count_star_by_hash)
        save_to_csv(df_tmp, 'fund__1_300.csv')
    except Exception as e:
        print(f"Error in pagination: {e}")
        break


KeyboardInterrupt: 

In [4]:
import pandas as pd

# 读取所有文件
df_fund_list_1 = pd.read_csv('fund_list_1.csv')
df_fund_list_2 = pd.read_csv('fund_list_2.csv')

df_fund_list = pd.concat([df_fund_list_2, df_fund_list_1], axis=0)
print(df_fund_list.shape)
df_replaced =df_fund_list.drop_duplicates()  # 去重

df_replaced.to_csv('fund_list.csv', index=False, encoding='utf-8-sig', mode='w', header=True)
df_replaced

(22230, 9)


Unnamed: 0,代码,基金名称,基金分类,晨星评级(三年),晨星评级(五年),净值日期,单位净值(元),净值日变动(元),今年以来回报(%)
0,519212,万家宏观择时多策略灵活配置混合A,积极配置 - 大盘平衡,5,4,2024-07-04,2.5697,-0.0365,10.75
1,004685,金元顺安元启灵活配置混合,灵活配置,5,5,2024-07-04,3.9589,-0.0696,-10.26
2,519191,万家新利灵活配置混合,积极配置 - 大盘平衡,5,4,2024-07-04,2.0499,-0.0276,10.85
3,519185,万家精选混合A,积极配置 - 大盘平衡,5,3,2024-07-04,1.8186,-0.0244,11.59
4,004475,华泰柏瑞富利灵活配置混合A,积极配置 - 大盘平衡,5,5,2024-07-04,2.0506,-0.0088,8.72
...,...,...,...,...,...,...,...,...,...
8943,21709.0,华泰紫金同存AAA指数7天持有发起,短债,0,0,-,-,-,-
8944,21785.0,国泰润利纯债债券C,信用债,0,0,-,-,-,-
8945,21808.0,国泰聚享纯债债券C,信用债,0,0,-,-,-,-
8946,20708.0,中加瑞利纯债债券D,纯债,0,0,-,-,-,-


|    |   代码 | 基金名称                        | 基金分类            |   晨星评级(三年) |   晨星评级(五年) | 净值日期   |   单位净值(元) |   净值日变动(元) |   今年以来回报(%) |\n|---:|-------:|:--------------------------------|:--------------------|-----------------:|-----------------:|:-----------|---------------:|-----------------:|------------------:|\n|  0 | 519212 | 万家宏观择时多策略灵活配置混合A | 积极配置 - 大盘平衡 |                5 |                4 | 2024-07-04 |         2.5697 |          -0.0365 |             10.75 |\n|  1 | 004685 | 金元顺安元启灵活配置混合        | 灵活配置            |                5 |                5 | 2024-07-04 |         3.9589 |          -0.0696 |            -10.26 |\n|  2 | 519191 | 万家新利灵活配置混合            | 积极配置 - 大盘平衡 |                5 |                4 | 2024-07-04 |         2.0499 |          -0.0276 |             10.85 |\n|  3 | 519185 | 万家精选混合A                   | 积极配置 - 大盘平衡 |                5 |                3 | 2024-07-04 |         1.8186 |          -0.0244 |             11.59 |\n|  4 | 004475 | 华泰柏瑞富利灵活配置混合A       | 积极配置 - 大盘平衡 |                5 |                5 | 2024-07-04 |         2.0506 |          -0.0088 |              8.72 |