# tushare for python2

In [1]:
import tushare as ts

In [2]:
import pandas as pd

- 历史行情数据
- 复权历史数据
- 实时行情数据
- 历史分笔数据
- 实时报价数据
- 当日历史分笔
- 大盘指数列表
- 大单交易数据

In [1]:
# -*- coding:utf-8 -*- 
"""
交易数据接口 
Created on 2014/07/31
@author: Jimmy Liu
@group : waditu
@contact: jimmysoa@sina.cn
"""
from __future__ import division

import time
import json
import lxml.html
from lxml import etree
import pandas as pd
import numpy as np
import datetime
import re
from pandas.compat import StringIO
import os
try:
    from urllib.request import urlopen, Request
except ImportError:
    from urllib2 import urlopen, Request

In [6]:
from tushare.stock import cons as ct
from tushare.util.conns import get_apis, close_apis
from tushare.stock.fundamental import get_stock_basics
from tushare.util import dateu as du
from tushare.util.formula import MA

In [7]:
def get_hist_data(code=None, start=None, end=None,
                  ktype='D', retry_count=3,
                  pause=0.001):
    """
    Fetch the historical trading records of a single stock.

    Parameters
    ------
      code : string
          Stock code, e.g. 600848.
      start : string
          Start date, format YYYY-MM-DD.  When None no lower bound is
          applied, i.e. everything the API returns is kept.
      end : string
          End date, format YYYY-MM-DD.  When None no upper bound is applied.
      ktype : string
          Bar type: D=daily, W=weekly, M=monthly, 5/15/30/60=minute bars.
          Default D.
      retry_count : int, default 3
          Number of request attempts before giving up.
      pause : float, default 0.001
          Seconds slept before every request attempt.

    Returns
    -------
      DataFrame indexed by date, newest row first.  The original
      documentation lists the fields as: date, open, high, close, low,
      volume, price change, percent change, 5/10/20-day average price,
      5/10/20-day average volume, turnover rate.
      None is returned when the response body is shorter than 15 bytes.

    Raises
    ------
      TypeError : ktype is in neither ct.K_LABELS nor ct.K_MIN_LABELS.
      IOError   : every attempt raised while requesting the data.
    """
    symbol = ct._code_to_symbol(code)
    period = ktype.upper()
    is_kline = period in ct.K_LABELS
    if is_kline:
        url = ct.DAY_PRICE_URL % (ct.P_TYPE['http'], ct.DOMAINS['ifeng'],
                                  ct.K_TYPE[period], symbol)
    elif ktype in ct.K_MIN_LABELS:
        url = ct.DAY_PRICE_MIN_URL % (ct.P_TYPE['http'], ct.DOMAINS['ifeng'],
                                      symbol, ktype)
    else:
        raise TypeError('ktype input error.')

    for _attempt in range(retry_count):
        time.sleep(pause)
        try:
            raw = urlopen(Request(url), timeout=10).read()
            if len(raw) < 15:
                # a body this short carries no records
                return None
        except Exception as err:
            # report the failure and move on to the next attempt
            print(err)
            continue

        payload = json.loads(raw.decode('utf-8') if ct.PY3 else raw)
        is_index = code in ct.INDEX_LABELS
        records = payload['record']
        # 14-field rows use the index column layout whatever the code is
        fourteen_wide = len(records[0]) == 14
        if (is_index and is_kline) or fourteen_wide:
            columns = ct.INX_DAY_PRICE_COLUMNS
        else:
            columns = ct.DAY_PRICE_COLUMNS

        frame = pd.DataFrame(records, columns=columns)
        if period in ('D', 'W', 'M'):
            # drop thousands separators and turn blanks into 0 so that the
            # float conversion below succeeds
            frame = frame.applymap(lambda cell: cell.replace(u',', u''))
            frame[frame == ''] = 0
        for name in columns[1:]:
            frame[name] = frame[name].astype(float)
        if start is not None:
            frame = frame[frame.date >= start]
        if end is not None:
            frame = frame[frame.date <= end]
        if is_index and ktype in ct.K_MIN_LABELS:
            frame = frame.drop('turnover', axis=1)
        return frame.set_index('date').sort_index(ascending=False)

    raise IOError(ct.NETWORK_URL_ERROR_MSG)

In [10]:
def get_k_data(code=None, start='', end='',
                  ktype='D', autype='qfq', 
                  index=False,
                  retry_count=3,
                  pause=0.001):
    """
    Fetch K-line (candlestick) data for a stock or an index.

    Parameters
    ------
      code : string
          Stock code, e.g. 600848.  With index=True it is looked up in
          ct.INDEX_SYMBOL instead of being converted to a stock symbol.
      start : string
          Start date, format YYYY-MM-DD.  When start and end are both
          empty a single request without a date range is sent.
      end : string
          End date, format YYYY-MM-DD.  When start is given and end is
          empty, du.today() is used.
      ktype : string
          Bar type: D=daily, W=weekly, M=monthly, 5/15/30/60=minute bars.
          Default D.
      autype : string
          Price adjustment: qfq=forward adjusted, hfq=backward adjusted,
          None=unadjusted.  Default qfq.  Ignored for indexes and for
          codes starting with '1' or '5'.
      index : bool, default False
          Treat code as an index.
      retry_count : int, default 3
          Number of request attempts per URL.
      pause : float, default 0.001
          Seconds slept before every request attempt.

    Returns
    -------
      DataFrame with a fresh integer index; date is an ordinary column.
      Columns come from ct.KLINE_TT_COLS / ct.KLINE_TT_COLS_MINS plus
      'code' (the original documentation lists date, open, high, close,
      low, volume, amount, turnoverratio, code).  The frame is empty when
      no request produced data.

    Raises
    ------
      TypeError : ktype is in neither ct.K_LABELS nor ct.K_MIN_LABELS.
    """
    symbol = ct.INDEX_SYMBOL[code] if index else ct._code_to_symbol(code)
    autype = '' if autype is None else autype
    has_start = start is not None and start != ''
    if has_start:
        end = du.today() if end is None or end == '' else end
    has_end = end is not None and end != ''

    if ktype.upper() in ct.K_LABELS:
        tt_ktype = ct.TT_K_TYPE[ktype.upper()]
        # no price adjustment for indexes or codes starting with 1 / 5
        fq = '' if (code[:1] in ('1', '5') or index) else autype
        # autype has already been normalised from None to '', so the
        # original `'' if autype is None else 'fq'` always produced 'fq'.
        # NOTE(review): unadjusted requests presumably wanted '' here - confirm.
        kline = 'fq'
        if not has_start and not has_end:
            urls = [ct.KLINE_TT_URL % (ct.P_TYPE['http'], ct.DOMAINS['tt'],
                                       kline, fq, symbol,
                                       tt_ktype, start, end,
                                       fq, _random(17))]
        else:
            # one request per entry of du.tt_dates, each spanning from
            # Jan 1 of that year to Dec 31 of the following year
            urls = []
            for year in du.tt_dates(start, end):
                startdate = str(year) + '-01-01'
                enddate = str(year + 1) + '-12-31'
                urls.append(ct.KLINE_TT_URL % (ct.P_TYPE['http'],
                                               ct.DOMAINS['tt'],
                                               kline, fq + str(year), symbol,
                                               tt_ktype, startdate, enddate,
                                               fq, _random(17)))
        dataflag = '%s%s' % (fq, tt_ktype)
    elif ktype in ct.K_MIN_LABELS:
        urls = [ct.KLINE_TT_MIN_URL % (ct.P_TYPE['http'], ct.DOMAINS['tt'],
                                       symbol, ktype, ktype,
                                       _random(16))]
        dataflag = 'm%s' % ktype
    else:
        raise TypeError('ktype input error.')

    # Collect the per-URL frames and concatenate once; _get_k_data returns
    # None when a request yields nothing, and those results are skipped.
    frames = []
    for url in urls:
        frame = _get_k_data(url, dataflag,
                            symbol, code,
                            index, ktype,
                            retry_count, pause)
        if frame is not None:
            frames.append(frame)
    data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # The requests above cover whole years, so trim to the asked-for range.
    if ktype not in ct.K_MIN_LABELS and has_start and has_end and not data.empty:
        data = data[(data.date >= start) & (data.date <= end)]
    return data

In [12]:
def _random(n=13):
    """Return a random decimal string that is exactly ``n`` digits long.

    The value is drawn uniformly from 10**(n-1) .. 10**n - 1, so the first
    digit is never zero.
    """
    import random
    lowest = 10 ** (n - 1)
    highest = 10 ** n - 1
    number = random.randint(lowest, highest)
    return str(number)

In [14]:
def _get_k_data(url, dataflag='',
                symbol='',
                code = '',
                index = False,
                ktype = '',
                retry_count=3,
                pause=0.001):
    """
    Download one K-line response and convert it into a DataFrame.

    Parameters
    ------
      url : string
          Fully built request URL.
      dataflag : string
          Key of the wanted series inside js['data'][symbol]; when that
          key is absent ct.TT_K_TYPE[ktype.upper()] is used instead.
      symbol : string
          Key of the instrument inside js['data'].
      code : string
          Value written to the 'code' column when index is False.
      index : bool
          When True the 'code' column holds symbol instead of code.
      ktype : string
          Bar type; minute types get their date column reformatted.
      retry_count : int, default 3
          Number of request attempts.
      pause : float, default 0.001
          Seconds slept before every request attempt.

    Returns
    -------
      DataFrame, or None when the body is shorter than 100 bytes, the
      selected series is empty, or every attempt raised.
    """
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            lines = urlopen(request, timeout=10).read()
            if len(lines) < 100:  # no data
                return None
        except Exception as e:
            # report the failure and fall through to the next attempt
            print(e)
        else:
            text = lines.decode('utf-8') if ct.PY3 else lines
            # The body is parsed as `<name>=<json>`.  Split on the first
            # '=' only, so an '=' inside the JSON cannot truncate it.
            text = text.split('=', 1)[1]
            # Remove embedded `,{"nd...}` objects before parsing.
            # NOTE(review): presumably non-price annotations - confirm.
            text = re.sub(r',{"nd.*?}', '', text)
            js = json.loads(text)
            series = js['data'][symbol]
            if dataflag not in series:
                dataflag = ct.TT_K_TYPE[ktype.upper()]
            rows = series[dataflag]
            if len(rows) == 0:
                return None
            # six-field rows use the minute-bar column layout
            if len(rows[0]) == 6:
                columns = ct.KLINE_TT_COLS_MINS
            else:
                columns = ct.KLINE_TT_COLS
            df = pd.DataFrame(rows, columns=columns)
            df['code'] = symbol if index else code
            if ktype in ct.K_MIN_LABELS:
                # compact YYYYMMDDHHMM stamp -> 'YYYY-MM-DD HH:MM'
                df['date'] = df['date'].map(
                    lambda x: '%s-%s-%s %s:%s' % (x[0:4], x[4:6],
                                                  x[6:8], x[8:10],
                                                  x[10:12]))
            for col in df.columns[1:6]:
                df[col] = df[col].astype(float)
            return df
    # every attempt raised: report "no data" to the caller
    return None

In [8]:
# Historical quotes
get_hist_data('300676', start='2018-01-01', end='2018-07-31') # fetch all daily K-line data for the range in one call
# NOTE: this call failed when the cell was run

In [17]:
# get_k_data('300676')

In [2]:
# Adjusted-price data: look up the listing date of one stock.
df = ts.get_stock_basics()
# .ix is deprecated in pandas; label lookups go through .loc
date = df.loc['600848', 'timeToMarket']  # listing date as YYYYMMDD

In [15]:
# df.ix['600848']
df.index

Index([u'600379', u'002282', u'002691', u'600604', u'600158', u'000561',
       u'300570', u'300502', u'002662', u'300288',
       ...
       u'000950', u'000693', u'000629', u'603590', u'603583', u'603192',
       u'601577', u'601068', u'600680', u'600401'],
      dtype='object', name=u'code', length=3543)

In [22]:
stockList = list(df.index)

In [19]:
df.to_csv("stock_basics.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3543 entries, 600379 to 600401
Data columns (total 22 columns):
name                3543 non-null object
industry            3543 non-null object
area                3543 non-null object
pe                  3543 non-null float64
outstanding         3543 non-null float64
totals              3543 non-null float64
totalAssets         3543 non-null float64
liquidAssets        3543 non-null float64
fixedAssets         3543 non-null float64
reserved            3543 non-null float64
reservedPerShare    3543 non-null float64
esp                 3543 non-null float64
bvps                3543 non-null float64
pb                  3543 non-null float64
timeToMarket        3543 non-null int64
undp                3543 non-null float64
perundp             3543 non-null float64
rev                 3543 non-null float64
profit              3543 non-null float64
gpr                 3543 non-null float64
npr                 3543 non-null float64
holders       

In [5]:
# from tongHuaShun
2139+1115

3254

In [8]:
# problem
# 无法获取最新的两个季度的基本面数据

In [21]:
# Fetch the earnings report data for Q1 2018 (arguments are year, quarter)
ts.get_report_data(2018,1)

[Getting data:]###########################################################

Unnamed: 0,code,name,eps,eps_yoy,bvps,roe,epcf,net_profits,profits_yoy,distrib,report_date
0,300724,捷佳伟创,0.31,6.90,,7.72,,7499.90,8.70,,07-23
1,601869,长飞光纤,0.55,71.88,8.19,6.86,-0.39,37206.63,68.21,,07-19
2,603657,春光科技,0.15,0.00,,3.10,,1082.90,1.14,,07-10
3,000693,*ST华泽,-0.00,-97.22,,,,-107.43,-97.36,,06-29
4,000939,*ST凯迪,-0.07,-240.00,,-3.04,,-28370.99,-368.73,,06-29
5,002932,明德生物,0.27,,,5.51,,1362.27,,,06-27
6,600399,*ST抚钢,-0.02,-182.64,,,,-2194.45,-169.75,,06-26
7,603713,密尔克卫,0.29,130.65,,,,3313.11,130.66,,06-25
8,000409,*ST地矿,-0.16,166.67,,-12.12,,-8139.96,180.95,,06-22
9,603105,芯能科技,0.03,-200.00,,,,1087.80,-178.76,,06-19


In [None]:
# Fetch the profitability data for Q1 2018 (arguments are year, quarter)
ts.get_profit_data(2018,1)

In [None]:
# Fetch the operating-capability data for Q1 2018 (arguments are year, quarter)
ts.get_operation_data(2018,1)

In [None]:
# Fetch the growth data for Q1 2018 (arguments are year, quarter)
ts.get_growth_data(2018,1)

In [None]:
# Fetch the debt-paying ability data for Q1 2018 (arguments are year, quarter)
ts.get_debtpaying_data(2018,1)

In [None]:
# Fetch the cash-flow data for Q1 2018 (arguments are year, quarter)
ts.get_cashflow_data(2018,1)

## 获取某家公司的财务信息

In [4]:
# NOTE: the name `all` shadows Python's builtin all(); it is left unchanged
# because a later cell reads the same variable (all.loc["300676"]).
all = ts.get_stock_basics()
all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3538 entries, 600225 to 000950
Data columns (total 22 columns):
name                3538 non-null object
industry            3538 non-null object
area                3538 non-null object
pe                  3538 non-null float64
outstanding         3538 non-null float64
totals              3538 non-null float64
totalAssets         3538 non-null float64
liquidAssets        3538 non-null float64
fixedAssets         3538 non-null float64
reserved            3538 non-null float64
reservedPerShare    3538 non-null float64
esp                 3538 non-null float64
bvps                3538 non-null float64
pb                  3538 non-null float64
timeToMarket        3538 non-null int64
undp                3538 non-null float64
perundp             3538 non-null float64
rev                 3538 non-null float64
profit              3538 non-null float64
gpr                 3538 non-null float64
npr                 3538 non-null float64
holders       

In [11]:
all.loc["300676"]

name                    华大基因
industry                医疗保健
area                      深圳
pe                     72.59
outstanding             2.47
totals                     4
totalAssets           504989
liquidAssets          369130
fixedAssets          77748.9
reserved              296826
reservedPerShare        7.42
esp                    0.252
bvps                   10.54
pb                      6.93
timeToMarket        20170714
undp                 80426.1
perundp                 2.01
rev                    33.13
profit                  6.71
gpr                    53.68
npr                    19.51
holders                37289
Name: 300676, dtype: object

# 获取行业分类信息

In [19]:
ts.get_industry_classified()

Unnamed: 0,code,name,c_name
0,600051,宁波联合,综合行业
1,600209,罗顿发展,综合行业
2,600212,江泉实业,综合行业
3,600256,广汇能源,综合行业
4,600576,祥源文化,综合行业
5,600603,广汇物流,综合行业
6,600614,鹏起科技,综合行业
7,600620,天宸股份,综合行业
8,600622,光大嘉宝,综合行业
9,600624,复旦复华,综合行业


# 存储为csv格式，在R中做下游分析

## 我需要哪些数据？
- 表1：每个公司所属的行业信息，以及其他分类标准；（静态）
- 表2：每个公司的财务信息；（动态，季度）
- 表3：各种指数的信息（国内外各种指数，期货）；（动态，每日）
- 表4：每个公司的每日的收盘价和成交量；（动态，每日）
- 表5：

# 0.过滤掉上市不足2年的股票，过滤掉创业板，过滤掉小盘股

## 1.选出每只个股在0.25、0.5、1、2年内所处的价位比，即（现价-min）/（max-min）。比如某股在0.5年内的价位比为0%，说明它的现价处于半年内的最低点，即半年内基本一直在跌
我比较喜欢那种已经大跌过，但现在开始启动行情的个股。

# 2.每个股票之间的相关性分析，以及其与大盘、外汇、外盘的相关性分析。找出一些股票模块。 

# 3.根据财务指标对每个行业的个股进行排序，不碰行业的吊车尾。不碰有太多负债、恶性资产且没有盈利能力的公司。不碰即将解禁、做空势力大的股票（现在的华大）。

# 4.开发一款工具来检测启动/反势行情

# 5.最终肯定会筛选出一些个股，需要建立一套完善的个股评价标准，确定最终是否应该买入。

# 6.交易策略的历史验证

# 7.永远不要做纯粹主观的交易

# 8.个股的估值

[美股估值的三种常用方法 一看就懂](http://stock.qq.com/a/20160317/058536.htm)

# 基本统计

In [1]:
import tushare as ts

In [2]:
all_basics = ts.get_stock_basics()

In [3]:
all_basics.head()

Unnamed: 0_level_0,name,industry,area,pe,outstanding,totals,totalAssets,liquidAssets,fixedAssets,reserved,...,bvps,pb,timeToMarket,undp,perundp,rev,profit,gpr,npr,holders
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
509,华塑控股,其他建材,四川,0.0,8.25,8.25,24894.26,18130.7,1169.06,21226.06,...,0.05,68.29,19930507,-102324.15,-1.24,585.13,-0.01,0.55,-0.78,63186.0
600653,申华控股,汽车服务,上海,0.0,17.46,19.46,989894.88,492175.06,230977.7,34658.4,...,0.86,2.52,19901219,-61917.1,-0.32,42.54,-55.04,4.54,-8.24,195232.0
300240,飞力达,仓储物流,江苏,33.21,3.65,3.66,234576.22,147641.8,32642.69,21162.53,...,3.03,2.31,20110706,50005.6,1.37,8.82,-0.18,12.02,2.49,18171.0
600278,东方创业,商贸代理,上海,25.46,5.22,5.22,809394.06,476305.75,92595.86,96109.74,...,7.79,1.27,20000712,124624.3,2.39,27.15,16.99,4.42,1.1,29100.0
909,数源科技,综合类,浙江,121.77,3.11,3.12,376438.78,302609.0,1981.5,47205.76,...,3.33,2.7,19990507,22165.71,0.71,28.42,11.93,9.35,1.86,32651.0


In [15]:
all_basics.to_csv("all_basics.csv")

In [13]:
# all_basics.info()

In [14]:
# all_basics["timeToMarket"]//10000

In [None]:
# 沪深上市公司总数： 3538
# 上市时间




In [16]:
df = ts.get_index()

In [32]:
tmp = ts.get_hist_data(code="600379",start="2016-08-20",end="2018-08-20")

In [33]:
len(tmp)

474

In [6]:
# Load the filtered stock list.  dtype="string" applies to every column;
# presumably this keeps codes such as "002913" as text with their leading
# zeros - confirm.
df = pd.read_csv("filterCode.csv", dtype="string")

In [8]:
# df

In [10]:
stockList = list(df["code"])

In [11]:
"002913" in stockList

False

In [14]:
import os

# Download the K-line history of every stock in stockList and store one CSV
# per stock code under price/.
if not os.path.isdir("price"):
    os.makedirs("price")  # to_csv does not create missing directories
count = 0
for count, i in enumerate(stockList, 1):
    tmp = ts.get_k_data(code=i,start="2016-08-20",end="2018-08-21")
    if len(tmp) < 200:
        # fewer than 200 rows in the window: report it, but still save the file
        print("%s %s data deficiencies" % (i, len(tmp)))
    tmp.to_csv("price/"+i+".csv")
    # progress counter; single-argument print() behaves the same on
    # Python 2 and Python 3
    print(count)
print("done!")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
600725 198 data deficiencies
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
600890 156 data deficiencies
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
26

1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
002075 18 data deficiencies
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
000155 164 data deficiencies
1941
1942
1943
1944
1945
1946
1947
1948
000029 17 data deficiencies
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993


In [15]:
# tmp

In [44]:
# Save the Q1 2018 earnings report and show how many rows it contains.
tmp = ts.get_report_data(2018,1)
print(len(tmp))  # single-argument print() works on Python 2 and Python 3
tmp.to_csv("report_2018_1.csv", encoding='utf-8')

[Getting data:]###########################################################3528


In [45]:
# Save the quarterly earnings reports of 2017 and then 2016, one CSV per
# quarter, printing the row count of each report.
for year in (2017, 2016):
    for quarter in (1, 2, 3, 4):
        tmp = ts.get_report_data(year, quarter)
        print(len(tmp))  # single-argument print() works on Python 2 and 3
        tmp.to_csv("report_%d_%d.csv" % (year, quarter), encoding='utf-8')

[Getting data:]###########################################################3529
[Getting data:]###########################################################3523
[Getting data:]##########################################################3479
[Getting data:]###########################################################3534
[Getting data:]#######################################################3295
[Getting data:]#########################################################3402
[Getting data:]##########################################################3452
[Getting data:]###########################################################3536


In [28]:
# better than get_hist_data
ts.get_k_data(code="sh",start="2016-08-02",end="2018-08-21")

Unnamed: 0,date,open,close,high,low,volume,code
142,2016-08-02,2950.08,2971.28,2971.28,2946.64,115468900.0,sh
143,2016-08-03,2963.21,2978.46,2981.16,2956.79,141141332.0,sh
144,2016-08-04,2976.41,2982.43,2982.86,2958.93,133933301.0,sh
145,2016-08-05,2978.78,2976.70,2991.68,2971.56,141857101.0,sh
146,2016-08-08,2972.62,3004.28,3004.72,2959.05,155729833.0,sh
147,2016-08-09,3001.31,3025.68,3025.91,2998.68,169995446.0,sh
148,2016-08-10,3023.47,3018.75,3033.20,3017.09,164675165.0,sh
149,2016-08-11,3013.68,3002.64,3038.05,3001.17,161879481.0,sh
150,2016-08-12,3000.27,3050.67,3051.05,2999.04,168173657.0,sh
151,2016-08-15,3056.48,3125.20,3137.48,3053.87,297616506.0,sh
