## 0. 加载模型

In [1]:
from keras.models import load_model
from sklearn import metrics
import pandas as pd

model_path = './models/lstm_finetuning_model.h5'
model = load_model(model_path)

## 1. 数据预处理
### 1.1 根据股票代码划分数据

In [3]:
# 训练集
cols = [0,1,2,3,4,6,7,9,13]

import pandas as pd

df = pd.read_csv("./data/train.csv", usecols=cols)
stocks_code = df["kdcode"].unique()
stock_num = len(stocks_code)

print(stocks_code)

# 根据股票代码划分数据
for i, stock_i in enumerate(stocks_code):
    stock_i_data = df[df['kdcode'].isin([stock_i])]
    exec("train_df%s = stock_i_data" % i)
    
    
# 测试集
df2 = pd.read_csv("./data/test.csv", usecols=cols)
stocks_code2 = df2["kdcode"].unique()

# 根据股票代码划分数据
for i, stock_i in enumerate(stocks_code2):
    stock_i_data = df2[df2['kdcode'].isin([stock_i])]
    exec("test_df%s = stock_i_data" % i)

['000001.SZ' '000157.SZ' '000333.SZ' '000568.SZ' '000703.SZ' '000768.SZ'
 '002024.SZ' '002044.SZ' '002049.SZ' '002120.SZ' '002230.SZ' '002271.SZ'
 '002311.SZ' '002371.SZ' '002456.SZ' '002602.SZ' '002607.SZ' '002714.SZ'
 '002773.SZ' '300003.SZ' '300033.SZ' '300124.SZ' '300144.SZ' '300628.SZ'
 '600000.SH' '600009.SH' '600016.SH' '600019.SH' '600025.SH' '600028.SH'
 '600030.SH' '600031.SH' '600036.SH' '600048.SH' '600050.SH' '600061.SH'
 '600104.SH' '600115.SH' '600196.SH' '600276.SH' '600309.SH' '600340.SH'
 '600383.SH' '600489.SH' '600519.SH' '600547.SH' '600570.SH' '600585.SH'
 '600588.SH' '600690.SH' '600703.SH' '600745.SH' '600809.SH' '600837.SH'
 '600848.SH' '600886.SH' '600887.SH' '600926.SH' '600958.SH' '601006.SH'
 '601088.SH' '601166.SH' '601169.SH' '601238.SH' '601318.SH' '601319.SH'
 '601336.SH' '601398.SH' '601555.SH' '601601.SH' '601628.SH' '601788.SH'
 '601838.SH' '601857.SH' '601872.SH' '601899.SH' '601919.SH' '601990.SH'
 '601998.SH' '603233.SH' '603799.SH' '603833.SH']


共82支股票

训练集按股票分为`train_df0`~`train_df81`

测试集按股票分为`test_df0`~`test_df81`

### 1.2 将原始数据改造为LSTM网络的输入

In [4]:
feanum=6 # 一共有多少特征
window=10 # 时间窗设置

In [5]:
import numpy as np

trainResult = []
for i in range(stock_num): # 遍历训练集所有股票的DataFrame
    exec("trainData = train_df%s.values" % i)
    sequence_length = window + 1
    trainData = trainData[:,2:] # 去除股票代码、日期两字段
    for index in range(len(trainData) - sequence_length + 1):
        trainResult.append(trainData[index: index + sequence_length])

trainResult = np.array(trainResult)
trainResult.shape

(73741, 11, 7)

In [7]:
testResult = []
for i in range(stock_num): # 遍历训练集所有股票的DataFrame
    exec("testData = test_df%s.values" % i)
    sequence_length = window + 1
    testData = testData[:,2:] # 去除股票代码、日期两字段
    for index in range(len(testData) - sequence_length + 1):
        testResult.append(testData[index: index + sequence_length])

testResult = np.array(testResult)
testResult.shape

(19085, 11, 7)

In [8]:
X_train = trainResult[:, :-1, :-1]
X_test = testResult[:, :-1, :-1]

X_train = X_train.astype('float64')
X_test = X_test.astype('float64')

print("训练集X：" + str(X_train.shape))
print("测试集X：" + str(X_test.shape))

训练集X：(73741, 10, 6)
测试集X：(19085, 10, 6)


In [9]:
Y_train = trainResult[:, window, -1]
Y_test = testResult[:, window, -1]

In [10]:
Y_train = Y_train.astype('float64')
Y_test = Y_test.astype('float64')

## 2. 分析模型
### 2.1 计算模型在不同股票的准确率

In [11]:
for i in range(stock_num): # 遍历训练集所有股票的DataFrame
    exec("testData = test_df%s.values" % i)
    exec("stockData%s = []" % i)
    sequence_length = window + 1
    testData = testData[:,2:] # 去除股票代码、日期两字段
    for index in range(len(testData) - sequence_length + 1):
        exec("stockData%s.append(testData[index: index + sequence_length])" % i)
    exec("stockData%s = np.array(stockData%s)" % (i, i))
#     exec("print(stockData%s.shape)" % i)

In [12]:
accList = []
for i in range(stock_num):
    exec("x_stock%s = stockData%s[:, :-1, :-1]" % (i, i))
    exec("y_stock%s = stockData%s[:, window, -1]" % (i, i))
    exec("x_stock%s = x_stock%s.astype('float64')" % (i, i))
    exec("y_stock%s = y_stock%s.astype('float64')" % (i, i))
    exec("y_predict%s = model.predict(x_stock%s)[:,0]" % (i, i))
    exec("label%s = y_predict%s.copy()" % (i, i))
    exec("label%s[label%s <= 0.5] = 0" % (i, i))
    exec("label%s[label%s > 0.5] = 1" % (i, i))
    exec("res = metrics.accuracy_score(y_stock%s, label%s)" % (i, i))
    # print("股票%s的准确率为%s" % (i, res))
    accList.append(res)

len(accList)

82

## 4.2 获取股票名称及分类

In [14]:
import pandas as pd

df = pd.read_csv("./data/train.csv", usecols=cols)
stocks_code = df["kdcode"].unique()
stock_num = len(stocks_code)

# print(stocks_code)
# stock_num

codes = stocks_code.copy()
for i in range(stock_num):
    codes[i] = stocks_code[i][:-3]

In [15]:
import requests
#爬取沪深300股票代码
def getNameCategoryByID(id):
    page=1
    hs300=[]
    while(page<=6):
        url = 'https://datainterface.eastmoney.com/EM_DataCenter/js.aspx?'
        key_dict = {
            'cb': 'jQuery112305522721612895851_1626420027301',
            'st': '1',
            'sr': '-1',
            'ps': '50',
            'p': page,
            'type': 'SHSZZS',
            'sty': 'SHSZZS',
            'code': '000300'
        }
        data = requests.get(url, params=key_dict)
        res=data.text
        res = res[42:-1]
        List = eval(res)
        for i in range(len(List)):
            List[i] = List[i].split(',')
        
        for i in range(len(List)):
            if List[i][0] == id:
                return List[i][1:3]
            
        page+=1

stock_msg = []
for i in range(stock_num):
    msg = []
    msg.append(stocks_code[i])
    res = getNameCategoryByID(codes[i])
    msg.append(res[0])
    msg.append(res[1])
    stock_msg.append(msg)

# stock_msg

## 4.3 分析

In [16]:
print("预测较准确(acc > 0.55)的股票有")
tmpList = []
for i in range(stock_num):
    if accList[i] > 0.55:
        tmpList.append([stock_msg[i][0], stock_msg[i][1], stock_msg[i][2], accList[i]])

df3 = pd.DataFrame(tmpList, columns=('股票代码','股票名称','分类名称','分类平均准确率'))
df3

预测较准确(acc > 0.55)的股票有


Unnamed: 0,股票代码,股票名称,分类名称,分类平均准确率
0,000768.SZ,中航西飞,国防与装备,0.562232
1,002230.SZ,科大讯飞,信息技术,0.553648
2,002714.SZ,牧原股份,农林牧渔,0.55794
3,002773.SZ,康弘药业,医药生物,0.562232
4,600000.SH,浦发银行,金融,0.575107
5,600016.SH,民生银行,金融,0.60515
6,600028.SH,中国石化,化石能源,0.583691
7,600050.SH,中国联通,信息技术,0.639485
8,600104.SH,上汽集团,交运设备,0.553648
9,600309.SH,万华化学,基础化工,0.566524


In [17]:
print("预测较不准确(acc < 0.49)的股票有")
tmpList = []
for i in range(stock_num):
    if accList[i] < 0.49:
        tmpList.append([stock_msg[i][0], stock_msg[i][1], stock_msg[i][2], accList[i]])

df4 = pd.DataFrame(tmpList, columns=('股票代码','股票名称','分类名称','分类平均准确率'))
df4

预测较不准确(acc < 0.49)的股票有


Unnamed: 0,股票代码,股票名称,分类名称,分类平均准确率
0,000001.SZ,平安银行,金融,0.463519
1,002371.SZ,北方华创,电子设备,0.467811
2,300124.SZ,汇川技术,机械设备,0.454936
3,600519.SH,贵州茅台,食品饮料,0.476395
4,600809.SH,山西汾酒,食品饮料,0.437768
5,601088.SH,中国神华,化石能源,0.472103


In [18]:
category = []
for i in range(stock_num):
    category.append(stock_msg[i][2])
category = list(set(category))

categoryDict = dict()
for cate in category:
    tmp = []
    for i in range(stock_num):
        if stock_msg[i][2] == cate:
            tmp.append(i)
    categoryDict[cate] = tmp

# print(categoryDict)

In [19]:
meanAccList = []
dataList = []
for e in categoryDict:
    stockList = categoryDict[e]
    cnt = 0
    accSum = 0
    for i in stockList:
        if accList[i] > 0.5:
            cnt += 1
        accSum += accList[i]
    meanAcc = accSum / len(stockList)
    meanAccList.append([meanAcc, e])
    dataList.append([e, len(stockList), cnt, meanAcc])
df = pd.DataFrame(dataList, columns=('分类','分类股票数量','准确率大于0.5的数量','分类平均准确率'))
df

Unnamed: 0,分类,分类股票数量,准确率大于0.5的数量,分类平均准确率
0,交运设备,2,2,0.542918
1,轻工制造,1,0,0.497854
2,基础化工,2,2,0.553648
3,食品饮料,4,2,0.504292
4,国防与装备,1,1,0.562232
5,钢铁,1,1,0.549356
6,文化传媒,1,1,0.523605
7,电子设备,5,4,0.512539
8,房地产,4,4,0.5397
9,交通运输,6,6,0.538627


In [20]:
print("按照平均准确率对分类排序：")
meanAccList.sort(reverse = True)
meanAccList
df2 = pd.DataFrame(meanAccList, columns=('分类平均准确率','分类名称'))
df2

按照平均准确率对分类排序：


Unnamed: 0,分类平均准确率,分类名称
0,0.562232,国防与装备
1,0.559371,化石能源
2,0.554506,信息技术
3,0.553648,基础化工
4,0.549356,钢铁
5,0.54721,农林牧渔
6,0.546068,金融
7,0.545064,休闲、生活及专业服务
8,0.542918,交运设备
9,0.5397,房地产
