# 将通达信盘后数据转换为csv文件，每只股票对应一个csv文件

In [2]:
import numpy as np
import pandas as pd
import os
from hangqinghelper import *
import datetime
import time

## 上证股票

In [4]:
# sh_rootdir = 'vipdoc/sh/lday'
# sh_filelists = dayfilelists(sh_rootdir)
# for data in sh_filelists:
#     exactStock(data,'database')

## 深证股票

In [13]:
# for data in sz_filelists:
#     exactStock(data,'database')

## 获取沪深300股票列表，并读取行情数据

In [3]:
hs300s = pd.read_csv('hs300s.csv')
hs300s.tail()

Unnamed: 0,code,name,date,weight,abbr,symbol
295,1,平安银行,2017-09-01,0.89,payh,000001.sz
296,600019,宝钢股份,2017-09-01,0.7,bggf,600019.ss
297,601088,中国神华,2017-09-01,0.39,zgsh,601088.ss
298,826,启迪桑德,2017-09-01,0.16,qdsd,000826.sz
299,2142,宁波银行,2017-09-01,0.37,nbyh,002142.sz


In [4]:
symbols = [sym.split('.')[0] for sym in hs300s['symbol']]

In [5]:
basedir = 'database/'
hs300data = pd.DataFrame()
for sym in symbols:
    df = pd.read_csv(basedir+sym+'.csv',index_col=0)
    code = df['code'][0].split('.')[0]
    df['code'] = code
    df['pct_change'] = df['close'].pct_change()
    func = lambda x: x if abs(x)>0.0001 else 0.0001
    df['pct_change'] = df['pct_change'].apply(func)
    df['Direction'] = np.sign(df['pct_change'])
    hs300data = hs300data.append(df)

In [6]:
hs300data.shape

(856884, 9)

In [7]:
hs300data.tail()

Unnamed: 0,code,open,high,low,close,amount,vol,pct_change,Direction
2018-01-09,2142,17.76,17.89,17.51,17.71,4978331.52,28231358,-0.002815,-1.0
2018-01-10,2142,17.69,18.12,17.69,18.03,5056385.6,28209096,0.018069,1.0
2018-01-11,2142,18.0,18.47,17.93,18.28,4122906.88,22498507,0.013866,1.0
2018-01-12,2142,18.29,18.57,18.13,18.51,4505714.88,24368982,0.012582,1.0
2018-01-15,2142,18.37,19.46,18.37,19.17,9549434.88,50240732,0.035656,1.0


## T日给定一只股票，在沪深300股票历史行情中找近30天走势最相近的3只股票

In [8]:
target_symbol = '600340'
today = datetime.datetime.now()
delta = datetime.timedelta(days=1)
yesterday = today - delta
day_years_ago = today - datetime.timedelta(days=365)
today = today.strftime('%Y-%m-%d')
yesterday = yesterday.strftime('%Y-%m-%d')
day_years_ago = day_years_ago.strftime('%Y-%m-%d')
print today
print yesterday
print day_years_ago

2018-01-18
2018-01-17
2017-01-18


In [9]:
# 获取目标股票近30个交易日收盘价组成的数组
target_close_30 = hs300data[hs300data['code']==target_symbol][:yesterday]['close'].tail(30).values
# 数据归一化
target_max = target_close_30.max()
target_min = target_close_30.min()
target_close_30 = (target_close_30-target_min)/(target_max-target_min)

In [177]:
# 单只股票测试耗时，一年内的耗时还可以接受
symbol = '002049'
best_result = []    # 存放该股票最佳匹配数据
best_coef = 0
start1 = time.time()
data = hs300data[hs300data['code']==symbol][day_years_ago:yesterday]['close']
print 'First cost time:',round(time.time()-start1,2)
data_max = data.max()
data_min = data.min()
start2 = time.time()
data = (data-data_min)/(data_max-data_min)
print 'Second cost time:',round(time.time()-start2,2)
length = len(data)-29

start3 = time.time()
for i in range(0,length):
    # 取一组收盘价
    df = data.head(30)
    tmp_close = df.values
    # 这组收盘价中最后一个的日期
    lastdate = df.tail(1).index.tolist()[0]
    # 计算相关系数
    coef = np.corrcoef(target_close_30,tmp_close)[0][1]
    if coef > best_coef:
        best_coef = coef
        best_result = [coef,lastdate]
    # 删掉第一组元素
    data = data.drop(data.index[0])
    
print 'Third cost time:',round(time.time()-start3,2)    


First cost time: 0.19
Second cost time: 0.0
Third cost time: 0.17


In [144]:
# 历史数据总耗时，时间有点多
# 遍历沪深300股票，取每支股票历史上的30天数据，跟目标数据做相似度评估（此处用相关系数），取最大的相关系数以及对应的股票
result_dict = {}
start = time.time()
for symbol in symbols:
    best_result = []    # 存放该股票最佳匹配数据
    best_coef = 0
    data = hs300data[hs300data['code']==symbol][:yesterday]['close']
    data_max = data.max()
    data_min = data.min()
    data = (data-data_min)/(data_max-data_min)
    length = len(data)-29
    
    # 30天为一个窗口，步长为1天，遍历找出该股票最佳匹配窗口
    for i in range(0,length):
        # 取一组收盘价
        df = data.head(30)
        tmp_close = df.values
        # 这组收盘价中最后一个的日期
        lastdate = df.tail(1).index.tolist()[0]
        # 计算相关系数
        coef = np.corrcoef(target_close_30,tmp_close)[0][1]
        if coef > best_coef:
            best_coef = coef
            best_result = [coef,lastdate]
        # 删掉第一组元素
        data = data.drop(data.index[0])
    
    result_dict[symbol] = best_result
end = time.time()
print 'Total process time:',round(end-start,2),'s'

Total process time: 1252.41 s


In [136]:
result_dict

{'000001': [0.95822546094001226, '2017-11-17'],
 '000002': [0.97116995891571667, '2009-05-15'],
 '000008': [0.98148529083302172, '2014-10-09'],
 '000009': [0.96492299688523697, '2014-02-17'],
 '000060': [0.96992396872872133, '2002-07-02'],
 '000063': [0.9480756499426598, '2007-12-24'],
 '000069': [0.9622759511899357, '2004-09-29'],
 '000100': [0.96460417547538224, '2011-04-27'],
 '000156': [0.96973422395918796, '2013-01-15'],
 '000157': [0.97211904355620171, '2010-10-25'],
 '000166': [0.90095055020078774, '2015-03-25'],
 '000333': [0.93657032598279844, '2014-08-05'],
 '000338': [0.97184872701972658, '2010-10-28'],
 '000402': [0.96690406323683786, '2014-12-10'],
 '000413': [0.95800526278198783, '2015-03-12'],
 '000415': [0.97039169589015351, '2015-04-30'],
 '000423': [0.95953824210745697, '2010-04-14'],
 '000425': [0.96744273074421272, '2017-02-17'],
 '000503': [0.97463794887658228, '2004-09-27'],
 '000538': [0.95582694775599653, '2003-09-05'],
 '000540': [0.97057576569236548, '2004-02-

In [142]:
sort_dict = sorted(result_dict.items(), lambda x,y: cmp(x[1],y[1]),reverse=True)
sort_dict

[('600340', [1.0, '2018-01-15']),
 ('002008', [0.98585561214122919, '2006-01-19']),
 ('600109', [0.98518861463567553, '2006-11-07']),
 ('000895', [0.98475316842236338, '2010-12-13']),
 ('002174', [0.98324486957981383, '2013-11-07']),
 ('601800', [0.98314180317843081, '2014-11-11']),
 ('000783', [0.98250921154800908, '2010-10-22']),
 ('000008', [0.98148529083302172, '2014-10-09']),
 ('600157', [0.98092950820043545, '2015-04-22']),
 ('600050', [0.97995109172707151, '2016-10-21']),
 ('600030', [0.97968636309346357, '2010-10-22']),
 ('600372', [0.97939076171515327, '2016-07-11']),
 ('600023', [0.9784365266351539, '2015-06-05']),
 ('601939', [0.97820054209937979, '2010-10-22']),
 ('000876', [0.97791498828233314, '2010-09-30']),
 ('600685', [0.9777087191278111, '2016-07-12']),
 ('601186', [0.97762160520654129, '2016-10-25']),
 ('601628', [0.97760979593961783, '2010-10-22']),
 ('600535', [0.97748768705164246, '2004-09-28']),
 ('601688', [0.97739778999623894, '2010-10-22']),
 ('600028', [0.976

In [12]:
# 一年数据总耗时
result_dict = {}
start = time.time()
for symbol in symbols:
    best_result = []    # 存放该股票最佳匹配数据
    best_coef = 0
    data = hs300data[hs300data['code']==symbol][day_years_ago:yesterday]['close']
    data_max = data.max()
    data_min = data.min()
    data = (data-data_min)/(data_max-data_min)
    length = len(data)-29
    
    # 30天为一个窗口，步长为1天，遍历找出该股票最佳匹配窗口
    for i in range(0,length):
        # 取一组收盘价
        df = data.head(30)
        tmp_close = df.values
        # 这组收盘价中最后一个的日期
        lastdate = df.tail(1).index.tolist()[0]
        # 计算相关系数
        coef = np.corrcoef(target_close_30,tmp_close)[0][1]
        if coef > best_coef:
            best_coef = coef
            best_result = [coef,lastdate]
        # 删掉第一组元素
        data = data.drop(data.index[0])
    
    result_dict[symbol] = best_result
end = time.time()
print 'Total process time:',round(end-start,2),'s'

Total process time: 117.57 s


In [13]:
sort_dict = sorted(result_dict.items(), lambda x,y: cmp(x[1],y[1]),reverse=True)
sort_dict

[('600340', [1.0, '2018-01-15']),
 ('600028', [0.97694683458543297, '2018-01-11']),
 ('600383', [0.97251369424492717, '2018-01-12']),
 ('600008', [0.96949186820449129, '2017-04-19']),
 ('600606', [0.96780311729027113, '2018-01-12']),
 ('601006', [0.96560597328302999, '2018-01-12']),
 ('300072', [0.96534599243492847, '2017-03-20']),
 ('000961', [0.96506215429454112, '2018-01-11']),
 ('601992', [0.96481091789490037, '2017-04-19']),
 ('300251', [0.9640266231792044, '2017-09-18']),
 ('601111', [0.9639091245020126, '2017-11-06']),
 ('002304', [0.96295497671568053, '2017-10-09']),
 ('601211', [0.96289126557771643, '2017-06-05']),
 ('600816', [0.9610920371784385, '2017-10-20']),
 ('600196', [0.96064470353344189, '2017-09-27']),
 ('600038', [0.96060356379120204, '2017-10-19']),
 ('601766', [0.96049722541941684, '2017-11-02']),
 ('600029', [0.95982235591798559, '2017-11-08']),
 ('002146', [0.95923224034774335, '2017-04-17']),
 ('600256', [0.95882877027814339, '2017-12-11']),
 ('000001', [0.9582

In [14]:
similar_close_data = hs300data[hs300data['code']=='600028']['2018-01-11':].head(10)
similar_close_data

Unnamed: 0,code,open,high,low,close,amount,vol,pct_change,Direction
2018-01-11,600028,7.15,7.18,7.01,7.08,11588856.32,163552716,-0.00979,-1.0
2018-01-12,600028,7.1,7.23,6.99,7.15,13798700.8,193761782,0.009887,1.0
2018-01-15,600028,7.2,7.44,7.1,7.29,24148303.36,331951180,0.01958,1.0


In [None]:
similar_close_data = hs300data[hs300data['code']=='600383']['2018-01-12':].head(10)
similar_close_data