In [4]:
import pandas as pd
import requests
import numpy as np
from io import StringIO
import time
import datetime
import matplotlib.pyplot as plt
import html5lib

In [5]:
def raw_ctnt(year, season):
    url = 'https://mops.twse.com.tw/mops/web/t163sb04'
    r = requests.post(url, {
            'encodeURIComponent':1,
            'step':1,
            'firstin':1,
            'off':1,
            'TYPEK':'sii',
            'year':year,
            'season':season,
        })
    r.encoding = 'utf8'
    return r.text

In [6]:
def unify(x):
    dfs = x.copy()
    dfs[0]['Gross_Profit'] = dfs[0]['利息淨收益'] + dfs[0]['利息以外淨損益']
    dfs[0].rename(columns={'繼續營業單位稅前淨利（淨損）':'Operating_Income',
                              '本期稅後淨利（淨損）':'Net_Income'}, inplace=True)
    dfs[0]['Pre_Tax_Income'] = dfs[0]['Operating_Income']

    dfs[1].rename(columns={'收益':'Gross_Profit',
                              '營業利益':'Operating_Income',
                              '稅前淨利（淨損）':'Pre_Tax_Income',
                              '本期淨利（淨損）':'Net_Income'}, inplace=True)

    dfs[2].rename(columns={'營業收入':'Gross_Profit',
                              '營業利益（損失）':'Operating_Income',
                              '稅前淨利（淨損）':'Pre_Tax_Income',
                              '本期淨利（淨損）':'Net_Income'}, inplace=True)

    dfs[3].rename(columns={'淨收益':'Gross_Profit',
                              '繼續營業單位稅前損益':'Operating_Income',
                              '本期稅後淨利（淨損）':'Net_Income'}, inplace=True)
    dfs[3]['Pre_Tax_Income'] = dfs[3]['Operating_Income']

    dfs[4].rename(columns={'營業收入':'Gross_Profit',
                              '營業利益（損失）':'Operating_Income',
                              '繼續營業單位稅前純益（純損）':'Pre_Tax_Income',
                              '本期淨利（淨損）':'Net_Income'}, inplace=True)
    for i in dfs:
        i.rename(columns={'公司代號':'Id','公司名稱':'Name','基本每股盈餘（元）':'EPS'}, inplace=True)
        
    dfs = [i[['Id', 'Name', 'Gross_Profit', 'Operating_Income', 'Pre_Tax_Income', 'Net_Income', 'EPS']] for i in dfs]
    
    return dfs

In [31]:
def income_statement(year, season):

    if year >= 1000:
        year -= 1911
        
    dfs = pd.read_html(raw_ctnt(year, season))
    
    dfs = [i for i in dfs if i.shape[1] > 20]
    
    # 將不同欄位名統一
    df_all = pd.concat(unify(dfs), ignore_index=True)
    
    print(f'scrapping {year}, {season} done')
    
    # 偽停頓
    time.sleep(1)
    
    return df_all

In [21]:
a = list(range(103,108))
b = list(range(1, 5))
data_range = [(x, y) for x in a for y in b]

In [35]:
icmList = []
for t in data_range:
    df = income_statement(t[0], t[1])
    icmList.append(df)

scrapping 103, 1 done
scrapping 103, 2 done
scrapping 103, 3 done
scrapping 103, 4 done
scrapping 104, 1 done
scrapping 104, 2 done
scrapping 104, 3 done
scrapping 104, 4 done
scrapping 105, 1 done
scrapping 105, 2 done
scrapping 105, 3 done
scrapping 105, 4 done
scrapping 106, 1 done
scrapping 106, 2 done
scrapping 106, 3 done
scrapping 106, 4 done
scrapping 107, 1 done
scrapping 107, 2 done
scrapping 107, 3 done
scrapping 107, 4 done


In [44]:
list(zip(data_range, icmList))

[((103, 1),
         Id  Name  Gross_Profit  Operating_Income  Pre_Tax_Income  Net_Income  \
  0    2801    彰銀       6220401           3650902         3650902     3100347   
  1    2809   京城銀       1758428           1450177         1450177     1264937   
  2    2812   台中銀       2845289           1131757         1131757     1036839   
  3    2820    華票        519550            450362          450362      386635   
  4    2834   臺企銀       4482279           1218856         1218856     1212460   
  5    2836   高雄銀        770874            155990          155990      134782   
  6    2838   聯邦銀       2588111            880648          880648      727963   
  7    2845   遠東銀       2749993           1154644         1154644     1004915   
  8    2849   安泰銀       1695728            757602          757602      666405   
  9    2855   統一證       1503083            533635          529634      464136   
  10   6005   群益證       1657404            459301          563786      494907   
  11   6024   群益

In [48]:
for t, df in enumerate(icmList):
    df['Year'] = data_range[t][0]
    df['Season'] = data_range[t][1]
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]

In [49]:
icmList[0]

Unnamed: 0,Id,Name,Gross_Profit,Operating_Income,Pre_Tax_Income,Net_Income,EPS,Year,Season
0,2801,彰銀,6220401,3650902,3650902,3100347,0.40,103,1
1,2809,京城銀,1758428,1450177,1450177,1264937,1.05,103,1
2,2812,台中銀,2845289,1131757,1131757,1036839,0.40,103,1
3,2820,華票,519550,450362,450362,386635,0.29,103,1
4,2834,臺企銀,4482279,1218856,1218856,1212460,0.24,103,1
5,2836,高雄銀,770874,155990,155990,134782,0.19,103,1
6,2838,聯邦銀,2588111,880648,880648,727963,0.32,103,1
7,2845,遠東銀,2749993,1154644,1154644,1004915,0.38,103,1
8,2849,安泰銀,1695728,757602,757602,666405,0.40,103,1
9,2855,統一證,1503083,533635,529634,464136,0.35,103,1
