# 思路

## 关于晨星评级图片的处理
- 晨星评级信息是一张图片，且 url 地址无规律，因此采用图片识别的办法判断星级
- 首先获取星级图片的 url，然后通过 `requests` 下载对应的图片（`gif` 格式）
- 将`gif`转化为`png`
- 将`png`读入为一个数组
- 星级图片像素：17\*66的黑白图片，因此直接对数组求和即可得到唯一值

# 代码实现

In [76]:
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By

## 公用函数

### 将`gif`转换为`png`

In [4]:
def gif2png(gifPath):
    """Convert a GIF file to a PNG file saved next to it.

    The output path is ``gifPath`` with its extension replaced by ``.png``
    (for example ``'4.gif'`` -> ``'4.png'``).

    Every frame of the GIF is written to that same output path in turn, so
    for a multi-frame GIF the file ends up holding the last frame.
    """
    # splitext copes with any extension length, unlike slicing off 4 chars.
    pngPath = os.path.splitext(gifPath)[0] + '.png'

    # The context manager closes the image file even if saving fails.
    with Image.open(gifPath) as im:
        try:
            while True:
                # Save the current frame, then advance to the next one.
                im.save(pngPath)
                im.seek(im.tell() + 1)
        except EOFError:
            # seek() past the final frame marks the end of the sequence.
            pass

### 计算`png`的`sum`值

In [1]:
def sum_png(png_path):
    """Return the sum of all array values of the image at ``png_path``.

    The image is opened with PIL, converted to a NumPy array, and the
    array is reduced to a single number.
    """
    return np.array(Image.open(png_path)).sum()

### 将星级图片的`url`转换为`sum`值

In [2]:
def url2sum(url):
    """Download the image at ``url`` and return its pixel sum.

    The response body is written to ``temp.png`` in the current working
    directory (overwriting any earlier download) and then handed to
    ``sum_png``.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests can block forever on a stalled server.
    response = requests.get(url, timeout=10)
    # Fail here instead of feeding an HTML error page to the image decoder.
    response.raise_for_status()
    with open("temp.png", "wb") as f:
        f.write(response.content)
    return sum_png('temp.png')

### 将列表中的数字文本转为`float`类型

In [59]:
def text2float(text_list):
    """Convert every item that can be parsed as a number to ``float``.

    Items that ``float()`` cannot convert (for example ``'-'`` or a name)
    are kept unchanged, so the result has the same length and order as
    ``text_list``.

    Args:
        text_list: iterable of values, typically strings scraped from a page.

    Returns:
        A new list with convertible items replaced by floats.
    """
    float_list = []
    for item in text_list:
        try:
            item = float(item)
        except (TypeError, ValueError, OverflowError):
            # Not a number: keep the original value. Only conversion errors
            # are caught so that Ctrl-C and SystemExit still propagate.
            pass
        float_list.append(item)
    return float_list

### 将字典转化为`series`
- 输入两个`list`：`keys`、`values`
- 输出一个 `series`

In [112]:
def dic2series(keys, values):
    """Build a ``pandas.Series`` from parallel sequences of labels and values.

    ``keys`` and ``values`` are paired positionally; pairing stops at the
    shorter of the two, and a repeated key keeps its last value.
    """
    mapping = {label: value for label, value in zip(keys, values)}
    return pd.Series(mapping)

## 函数形式获取某只基金的信息

In [20]:
def get_info_by_fund_id(fund_id, login_email='ryan_jin@sina.com',
                        login_password='pa_wo@17.ms'):
    """Open a fund's Morningstar quicktake page in Firefox and log in.

    The function starts a Firefox session, loads the page for ``fund_id``,
    submits the login form, waits 10 seconds and then closes the browser.
    It does not scrape or return any data yet.

    Args:
        fund_id: Morningstar fund id appended to the quicktake URL.
        login_email: account e-mail typed into the login form.
        login_password: account password typed into the login form.

    NOTE(review): the default credentials are hard-coded in source; they
    should be moved to configuration and rotated.
    """
    fund_url = 'https://cn.morningstar.com/quicktake/' + fund_id
    browser = webdriver.Firefox()
    try:
        # Give the page at most 10 s to load.
        browser.set_page_load_timeout(10)
        try:
            browser.get(fund_url)
        except Exception:
            # Loading raised (e.g. the timeout above): stop the page load and
            # continue with whatever has been rendered so far.
            browser.execute_script('window.stop()')

        # Fill in and submit the login form. find_element(By.ID, ...) is used
        # because the find_element_by_* helpers were removed in Selenium 4.3.
        email_box = browser.find_element(By.ID, 'emailTxt')
        password_box = browser.find_element(By.ID, 'pwdValue')
        email_box.send_keys(login_email)
        password_box.send_keys(login_password)
        browser.find_element(By.ID, 'loginGo').click()

        # Wait for the post-login page, then stop any remaining loading.
        time.sleep(10)
        browser.execute_script('window.stop()')
    finally:
        # Always shut the browser down, even if a step above failed.
        browser.quit()

## 按步骤调试

### 打开页面

In [23]:
# Morningstar fund id used to build the quicktake URL.
fund_id = 'F0000004AI'

In [24]:
# Open the fund's quicktake page in a new Firefox session.
fund_url = 'https://cn.morningstar.com/quicktake/' + fund_id
browser = webdriver.Firefox()

# Give the page at most 10 s to load.
browser.set_page_load_timeout(10)

try:
    browser.get(fund_url)
except Exception:
    # Loading raised (e.g. the timeout above): stop the page load and carry
    # on with whatever has been rendered so far.
    browser.execute_script('window.stop()')

# Simulate a user login. find_element(By.ID, ...) is used because the
# find_element_by_* helpers were removed in Selenium 4.3.
username = browser.find_element(By.ID, 'emailTxt')
password = browser.find_element(By.ID, 'pwdValue')

# NOTE(review): credentials are hard-coded in source; move them to
# configuration and rotate them.
username.send_keys('ryan_jin@sina.com')
password.send_keys('pa_wo@17.ms')
submit = browser.find_element(By.ID, 'loginGo')
# Label of the login button; not read by any cell visible in this notebook.
a = submit.text
submit.click()

# Wait for the post-login page, then stop any remaining loading.
time.sleep(10)
browser.execute_script('window.stop()')

### 基金基本信息

In [100]:
qt_base = browser.find_element(By.ID, 'qt_base')

# Fund code & name: the first 6 characters of the title are taken as the
# code and everything from offset 7 onwards as the name.
f_title = browser.find_element(By.ID, 'qt_fund').text
f_code = f_title[:6]
f_name = f_title[7:]

# Net asset value.
# NOTE(review): an XPath starting with '//' is evaluated from the document
# root, not from qt_base; use './/' if the search is meant to be scoped.
jingZhi = float(qt_base.find_element(By.XPATH, '//div/ul/li[2]/span').text)

# NAV date; the first 5 characters of the element text are dropped
# (presumably a label prefix - confirm against the page).
jingZhiDate = qt_base.find_element(By.CLASS_NAME, 'date').text[5:]

# Fund category.
f_type = qt_base.find_element(By.CLASS_NAME, 'category').text

# Inception date.
found_date = qt_base.find_element(By.CLASS_NAME, 'inception').text

# Style box.
f_style = qt_base.find_element(By.CLASS_NAME, 'sbdesc').text

# Total net assets (100 million yuan).
total_asset = float(qt_base.find_element(By.CLASS_NAME, 'asset').text)

# [code, name, category, style box, total net assets, NAV, NAV date, inception date]
# The two numeric fields are already floats. The list is deliberately NOT
# passed through text2float, which would turn the all-digit fund code into a
# float (e.g. '000001' -> 1.0) and lose its leading zeros.
base_info_values = [f_code, f_name, f_type, f_style, total_asset, jingZhi, jingZhiDate, found_date]
base_info_keys = ['代码','名称','分类','风格','总净资产(亿元)','净值','净值日期','创建时间']
base_info = dict(zip(base_info_keys, base_info_values))
base_info_series = pd.Series(base_info)

### 历史业绩

In [99]:
qt_per = browser.find_element(By.ID, 'qt_per')

# Performance figures are read from the elements with classes r0..r7
# (the original author's note: the last 8 years of performance).
# find_element(By.CLASS_NAME, ...) replaces find_element_by_class_name,
# which was removed in Selenium 4.3.
performance_8_values = text2float([qt_per.find_element(By.CLASS_NAME, 'r' + str(col)).text
                                   for col in range(8)])

this_year = datetime.now().year

# One label for the current year followed by the seven preceding years.
performance_8_keys = ['今年业绩%'] + [str(this_year - back) + "业绩(%)"
                                  for back in range(1, 8)]

performance_8 = dict(zip(performance_8_keys, performance_8_values))

performance_8_series = pd.Series(performance_8)

### 历史回报

In [113]:
qt_return1 = browser.find_element(By.ID, 'qt_return1')

# Query the <li> cells once instead of once per extracted value; every
# lookup is a round trip to the browser.
qt_return1_lists = qt_return1.find_elements(By.TAG_NAME, 'li')

# Historical return cells, in the order of history_return_keys below:
# 1 month, 3 months, 6 months, year to date, 1 year, 2 / 3 / 5 / 10 years.
history_return_values = text2float([qt_return1_lists[li_no].text
                                    for li_no in [6, 11, 16, 21, 26, 31, 36, 41, 46]])

history_return_keys = ['一个月回报',
                       '三个月回报',
                       '六个月回报',
                       '今年回报',
                       '一年年化',
                       '两年年化',
                       '三年年化',
                       '五年年化',
                       '十年年化']
history_return = dict(zip(history_return_keys, history_return_values))
history_return_series = pd.Series(history_return)

### 历史最差回报

In [127]:
qt_worst = browser.find_element(By.ID, 'qt_worst')

# Query the <li> cells once and index into the result, rather than
# re-querying the page for every value.
qt_worst_lists = qt_worst.find_elements(By.TAG_NAME, 'li')

# Worst 3-month and worst 6-month return (cells 1 and 3).
worst3_6_values = text2float([qt_worst_lists[li_no].text for li_no in [1, 3]])
worst3_6_keys = ['最差3个月回报','最差6个月回报']
worst3_6_series = dic2series(worst3_6_keys, worst3_6_values)

### 晨星评级

In [128]:
# Collect the URLs of the first three rating images.
# find_element / find_elements with By.* replace the find_element(s)_by_*
# helpers, which were removed in Selenium 4.3.
qt_star = browser.find_element(By.ID, 'qt_star')
star_elements = qt_star.find_elements(By.TAG_NAME, 'img')
star_urls = [star_elements[no_img].get_attribute("src") for no_img in [0, 1, 2]]

# Pixel sums of the local reference images 0.png .. 5.png; the position in
# this list is the star level.
stars_sum = [sum_png(str(level) + '.png') for level in range(6)]

# Pixel sums of this fund's 3-year / 5-year / 10-year rating images.
these_stars = [url2sum(url) for url in star_urls]

# Map each downloaded image to a star level by matching its pixel sum.
# list.index raises ValueError if an image matches none of the references.
stars_level_values = [stars_sum.index(sum_value) for sum_value in these_stars]
stars_level_keys = ['3年评级','5年评级','10年评级']
stars_level_series = dic2series(stars_level_keys, stars_level_values)

### 风险评估
输出如下形式的列表：【标准差，晨星风险系数，夏普比率，阿尔法系数，贝塔系数，R平方】

In [129]:
# find_element / find_elements with By.* replace the find_element(s)_by_*
# helpers, which were removed in Selenium 4.3.
qt_risk = browser.find_element(By.ID, 'qt_risk')
qt_risk_lists = qt_risk.find_elements(By.TAG_NAME, 'li')

# Standard deviation, Morningstar risk, Sharpe ratio (cells 15, 22, 29).
risks1 = [qt_risk_lists[li_no].text for li_no in (15, 22, 29)]

qt_riskstats = browser.find_element(By.ID, 'qt_riskstats')
qt_riskstats_lists = qt_riskstats.find_elements(By.TAG_NAME, 'li')

# Alpha, beta, R-squared (cells 4, 7, 10).
risks2 = [qt_riskstats_lists[li_no].text for li_no in (4, 7, 10)]

# Combined risk output, in the order of risks_keys.
risks_values = text2float(risks1 + risks2)
risks_keys = ['标准差',
              '晨星风险系数',
              '夏普比率',
              '阿尔法系数',
              '贝塔系数',
              'R平方']
risks_series = dic2series(risks_keys, risks_values)

### 风险评级

In [149]:
qt_rating = browser.find_element(By.ID, 'qt_rating')

# Query the <img> elements once instead of once per extracted value.
qt_rating_imgs = qt_rating.find_elements(By.TAG_NAME, 'img')

# The 10th character from the end of each image src is taken as the star
# digit (relies on the image file-name layout - confirm if the site changes).
these_rating_stars = [qt_rating_imgs[li_no].get_attribute("src")[-10] for li_no in [0, 1, 2, 3]]
risk_stars_values = text2float(these_rating_stars)
risk_stars_keys = ['2年风险',
                   '3年风险',
                   '5年风险',
                   '10年风险']
risk_stars_seriis = dic2series(risk_stars_keys, risk_stars_values)

### 资产分布
输出：`asset_distribution` 【现金，股票，债券】

In [131]:
# find_element(By.*, ...) replaces the find_element_by_* helpers, which
# were removed in Selenium 4.3.
qt_asset = browser.find_element(By.ID, 'qt_asset')

cash_pencent = qt_asset.find_element(By.CLASS_NAME, 'cash').text
stock_pencent = qt_asset.find_element(By.CLASS_NAME, 'stock').text
bonds_pencent = qt_asset.find_element(By.CLASS_NAME, 'bonds').text

# [cash, stock, bonds]
asset_distribution_values = text2float([cash_pencent, stock_pencent, bonds_pencent])
asset_distribution_keys = ['现金比例(%)',
                           '股票比例(%)',
                           '债券比例(%)']
asset_distribution_series = dic2series(asset_distribution_keys, asset_distribution_values)

### 数据汇总

In [150]:
# Stack every per-section Series into one long Series, in display order.
series_parts = [
    base_info_series,
    performance_8_series,
    history_return_series,
    worst3_6_series,
    stars_level_series,
    risks_series,
    risk_stars_seriis,
    asset_distribution_series,
]
data_series = pd.concat(series_parts)

In [151]:
# Timestamped output file name of the form YYYY-MM-DD_HH-MM-SS.csv.
now_time = '{:%Y-%m-%d_%H-%M-%S}'.format(datetime.now())
csv_path = '{}.csv'.format(now_time)

## 数据输出

In [154]:
# One-column table: row labels are the field names, the column is the fund id.
df = pd.DataFrame(data_series, columns=[fund_id])
# Comma separator, header row and index column are the to_csv defaults;
# utf_8_sig writes a BOM at the start of the file.
df.to_csv(csv_path, encoding='utf_8_sig')