# 練習 - 股票資料彙整_Yahoo股市 - 解答

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
import datetime

目標資料來源:  
https://tw.stock.yahoo.com/s/list.php?c=tse&pid=1

## 抓取網頁資料

In [3]:
import requests

def get_yahoo_page_html(url, timeout=10):
    """Download a page and return its body as text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        requests applies no timeout by default, so without this a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    str
        The response body as decoded by requests (``response.text``).

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    # A desktop-browser User-Agent is sent instead of the requests default.
    # NOTE(review): presumably the site rejects non-browser clients - unverified.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP errors here, with the status code, instead of handing an
    # error page to the table parser further down the pipeline.
    response.raise_for_status()
    return response.text

In [4]:
def getDataOnePage(html):
    """Extract the quote table from the HTML of one stock-list page.

    Parameters
    ----------
    html : str or file-like
        HTML text of the page (or an already opened file-like object).

    Returns
    -------
    pandas.DataFrame
        The first table whose attributes are exactly border="1",
        cellspacing="0", cellpadding="2" and bgcolor="#ffffff", with its
        first row used as the column header.

    Raises
    ------
    ValueError
        Raised by ``pd.read_html`` when no matching table is found.
    """
    from io import StringIO  # local import keeps this cell self-contained

    targetTableIndex = 0

    # Recent pandas versions deprecate passing literal HTML text to read_html;
    # wrapping the text in StringIO is the supported way to parse a string.
    source = StringIO(html) if isinstance(html, str) else html

    tables = pd.read_html(source,
                          attrs={'border': '1',
                                 'cellspacing': '0',
                                 'cellpadding': '2',
                                 'bgcolor': '#ffffff'},
                          header=0)
    return tables[targetTableIndex]

In [5]:
def getDataOnePageTSE(page):
    """Fetch page number ``page`` of the TSE stock list and return its quote table.

    The page number is appended to the list URL, the page is downloaded with
    ``get_yahoo_page_html`` and the table is extracted with ``getDataOnePage``.
    """
    page_url = ''.join(['https://tw.stock.yahoo.com/s/list.php?c=tse&pid=', str(page)])
    page_html = get_yahoo_page_html(page_url)
    return getDataOnePage(html=page_html)

In [6]:
# Fetch the data of the first page
df = getDataOnePageTSE(1)

In [7]:
# Show the last rows of the raw first-page table
df.tail()

Unnamed: 0,選擇,股票代號,時間,成交,買進,賣出,漲跌,張數,昨收,開盤,最高,最低,凱基證券下單
195,,4763 材料-KY,14:30,121.5,121.5,122.0,0.00,994,121.5,121.0,128.0,120.5,買 賣 張 零股交易
196,,1598 岱宇,13:30,46.05,46.0,46.05,▽0.50,252,46.55,46.5,46.95,46.0,買 賣 張 零股交易
197,,1701 中化,13:30,18.1,18.1,18.15,▽0.05,218,18.15,18.2,18.2,18.05,買 賣 張 零股交易
198,,1707 葡萄王,13:30,261.0,261.0,261.5,△3.5,537,257.5,261.0,262.5,258.0,買 賣 張 零股交易
199,,1720 生達,13:30,33.45,33.4,33.45,0.00,179,33.45,33.6,33.6,33.3,買 賣 張 零股交易


In [8]:
# Write the raw first-page table to 'stock.xlsx' (relative to the working directory)
df.to_excel('stock.xlsx')

## 修整 DataFrame 中的資料

In [9]:
def fixTable(marketType, table, theDate=None):
    """Clean one raw quote table into a typed, consistently ordered DataFrame.

    Parameters
    ----------
    marketType : object
        Value written to the '市場別' column of every row (e.g. 'TSE').
    table : pandas.DataFrame
        Raw table as returned by ``getDataOnePage``. It is NOT modified, so
        the calling cell can safely be re-run on the same frame.
    theDate : datetime.date, optional
        Value written to the '日期' column. Defaults to today's date,
        resolved on every call rather than once at definition time.

    Returns
    -------
    pandas.DataFrame
        Rows indexed 0..n-1 (index name '項次') with the columns
        市場別, 股票代號, 股票名稱, 日期, 時間, 成交, 買進, 賣出, 漲跌, 張數,
        昨收, 開盤, 最高, 最低. '時間' holds ``datetime.time`` values, the
        price/volume columns are float, and '漲跌' is recomputed as
        成交 - 昨收 rounded to two decimals.
    """
    if theDate is None:
        theDate = datetime.date.today()

    numericColumns = ['成交', '買進', '賣出', '張數', '昨收', '開盤', '最高', '最低']
    outputColumns = ['市場別', '股票代號', '股票名稱', '日期', '時間', '成交', '買進', '賣出',
                     '漲跌', '張數', '昨收', '開盤', '最高', '最低']

    # Remove the two non-data columns first so that rows which are otherwise
    # completely empty can be recognised and dropped. Everything is done on a
    # copy; the caller's frame is left untouched.
    fixedTable = (
        table.drop(columns=['選擇', '凱基證券下單'], errors='ignore')
             .dropna(axis=0, how='all')
             .copy()
    )

    # The raw '股票代號' cell holds "<code> <name>"; split it into two columns.
    # Using the .str accessor yields NaN (instead of an exception) for a
    # missing cell or a cell without a name part.
    codeAndName = fixedTable['股票代號'].str.split()
    fixedTable['股票代號'] = codeAndName.str[0]
    fixedTable['股票名稱'] = codeAndName.str[1]
    fixedTable['日期'] = theDate
    fixedTable['市場別'] = marketType

    # A full-width dash marks "no value" in the raw table; treat it as missing.
    fixedTable = fixedTable.replace('－', np.nan)

    # Parse "HH:MM" into datetime.time objects. astype(datetime.time) does not
    # parse anything and is rejected by current pandas; unparsable or missing
    # values become NaT.
    fixedTable['時間'] = pd.to_datetime(fixedTable['時間'], format='%H:%M',
                                      errors='coerce').dt.time
    fixedTable[numericColumns] = fixedTable[numericColumns].astype(float)

    # The raw change column carries arrow glyphs, so recompute it numerically.
    fixedTable['漲跌'] = (fixedTable['成交'] - fixedTable['昨收']).round(2)

    # Renumber the rows and fix the column order.
    fixedTable = fixedTable.reset_index(drop=True)
    fixedTable.index.name = '項次'
    return fixedTable.reindex(columns=outputColumns)

In [10]:
# Clean the raw first-page table, tagging every row with market type 'TSE'
df1 = fixTable('TSE', df)

In [11]:
# Show the last five rows of the cleaned table
df1.tail(5)

Unnamed: 0_level_0,市場別,股票代號,股票名稱,日期,時間,成交,買進,賣出,漲跌,張數,昨收,開盤,最高,最低
項次,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
195,TSE,4763,材料-KY,2016-10-04,14:30,121.5,121.5,122.0,0.0,994.0,121.5,121.0,128.0,120.5
196,TSE,1598,岱宇,2016-10-04,13:30,46.05,46.0,46.05,-0.5,252.0,46.55,46.5,46.95,46.0
197,TSE,1701,中化,2016-10-04,13:30,18.1,18.1,18.15,-0.05,218.0,18.15,18.2,18.2,18.05
198,TSE,1707,葡萄王,2016-10-04,13:30,261.0,261.0,261.5,3.5,537.0,257.5,261.0,262.5,258.0
199,TSE,1720,生達,2016-10-04,13:30,33.45,33.4,33.45,0.0,179.0,33.45,33.6,33.6,33.3


## 彙整 Yahoo 股市 page 1~5 的資料

In [12]:
# Fetch and clean pages 1 through 5.
# Built as a list rather than a lazy `map` object: a map iterator is consumed
# by its first use, so re-running a later cell that reads `dfs` would find it
# empty.
dfs = [fixTable('TSE', getDataOnePageTSE(p)) for p in range(1, 6)]

In [13]:
# Append the per-page tables into one DataFrame and show its row count
df = pd.concat(dfs)
len(df)

1000

In [14]:
# Renumber the rows 0..n-1, then keep only the rows whose stock code has at
# most four characters (this filters out the warrant rows).
df = (
    df.reset_index(drop=True)
      .loc[lambda frame: frame['股票代號'].str.len() <= 4]
)
df.tail()

Unnamed: 0,市場別,股票代號,股票名稱,日期,時間,成交,買進,賣出,漲跌,張數,昨收,開盤,最高,最低
902,TSE,9941,裕融,2016-10-04,13:30,71.4,71.3,71.4,0.1,171.0,71.3,71.3,71.4,71.2
903,TSE,9942,茂順,2016-10-04,13:18,86.9,85.9,86.4,1.6,37.0,85.3,85.8,86.9,85.2
904,TSE,9944,新麗,2016-10-04,13:30,24.2,24.2,24.5,-0.3,110.0,24.5,24.5,24.6,24.2
905,TSE,9945,潤泰新,2016-10-04,14:30,37.7,37.7,37.75,-1.1,7229.0,38.8,38.6,38.6,37.6
906,TSE,9955,佳龍,2016-10-04,13:30,17.6,17.6,17.65,0.0,73.0,17.6,17.8,17.9,17.4


## 抓取 類股 資料

In [15]:
# Load the stock-to-sector lookup table (extract 個股_類別.rar first).
# Forward slashes replace the original '..\\data\個...' literal: '\個' is an
# invalid escape sequence that modern Python warns about, and backslash
# separators only work on Windows.
df_類股 = pd.read_excel('../data/個股_類別.xls')
df_類股.tail()

Unnamed: 0,市場別_ID,類股別_ID,個股_代號,個股_名稱,類股_名稱
29345,2,72,72861P,國泰RG,櫃認售
29346,2,72,72863P,國泰RJ,櫃認售
29347,2,72,72895P,元大P3,櫃認售
29348,2,72,72901P,工銀QM,櫃認售
29349,2,72,72953P,日盛QW,櫃認售


## Merge

In [16]:
# Left-join the sector lookup onto the quotes (stock code = 個股_代號), then
# discard the lookup columns that are redundant after the join.
mdf = pd.merge(
    df,
    df_類股,
    how='left',
    left_on='股票代號',
    right_on='個股_代號',
).drop(columns=['市場別_ID', '個股_代號', '個股_名稱'])
mdf.tail()

Unnamed: 0,市場別,股票代號,股票名稱,日期,時間,成交,買進,賣出,漲跌,張數,昨收,開盤,最高,最低,類股別_ID,類股_名稱
885,TSE,9941,裕融,2016-10-04,13:30,71.4,71.3,71.4,0.1,171.0,71.3,71.3,71.4,71.2,6.0,其他
886,TSE,9942,茂順,2016-10-04,13:18,86.9,85.9,86.4,1.6,37.0,85.3,85.8,86.9,85.2,6.0,其他
887,TSE,9944,新麗,2016-10-04,13:30,24.2,24.2,24.5,-0.3,110.0,24.5,24.5,24.6,24.2,6.0,其他
888,TSE,9945,潤泰新,2016-10-04,14:30,37.7,37.7,37.75,-1.1,7229.0,38.8,38.6,38.6,37.6,6.0,其他
889,TSE,9955,佳龍,2016-10-04,13:30,17.6,17.6,17.65,0.0,73.0,17.6,17.8,17.9,17.4,6.0,其他


## GroupBy

In [17]:
# Number of individual stocks in each sector
mdf.groupby(['類股_名稱']).size().sort_index()

類股_名稱
光電       69
其他       46
其它電子     32
化工       25
半導體      64
塑膠       22
憑證        7
橡膠       10
水泥        7
汽車        6
油電燃氣      8
營建       48
玻璃        4
生技醫療     20
紡織       46
航運運輸     21
觀光       13
貿易百貨     11
資訊服務     13
通信網路     39
造紙        7
金融       33
鋼鐵       30
電器電纜     15
電子通路     23
電子零組件    81
電機       43
電腦週邊     60
食品       21
dtype: int64

In [18]:
# Mean traded price ('成交') of each sector
mdf.groupby(['類股_名稱'])['成交'].mean().sort_index()

類股_名稱
光電        75.766667
其他        56.844130
其它電子      42.225000
化工        27.510400
半導體       50.935156
塑膠        28.105909
憑證         3.641429
橡膠        34.415000
水泥        19.055714
汽車       128.300000
油電燃氣      38.787500
營建        18.451250
玻璃        10.752500
生技醫療      54.405000
紡織        27.713696
航運運輸      15.499524
觀光        69.121538
貿易百貨      40.754545
資訊服務      35.570000
通信網路      38.973846
造紙        14.381429
金融        15.520606
鋼鐵        16.662333
電器電纜      11.322000
電子通路      34.553478
電子零組件     38.924321
電機        67.288837
電腦週邊      48.827833
食品        39.721905
Name: 成交, dtype: float64