In [1]:
import akshare as ak 

import pandas as pd
import numpy as np

import os 
from datetime import datetime
from tqdm import tqdm

### load the stock list

In [2]:
# Ask akshare for the A-share code/name table; the rendered output of this
# cell shows one row per stock with `code` and `name` columns.
stocks = ak.stock_info_a_code_name()
# Bare last expression so the notebook renders the frame.
stocks

  0%|          | 0/15 [00:00<?, ?it/s]

Unnamed: 0,code,name
0,000001,平安银行
1,000002,万 科Ａ
2,000004,*ST国华
3,000006,深振业Ａ
4,000007,全新好
...,...,...
5452,920978,开特股份
5453,920981,晶赛科技
5454,920982,锦波生物
5455,920985,海泰新能


In [3]:
# Date stamp (YYYYMMDD) appended to the files this notebook writes.
today = pd.to_datetime("today").strftime("%Y%m%d")


def _exchange_prefix(code):
    """Return the exchange prefix ('sh', 'bj' or 'sz') for a six-digit stock code."""
    if code.startswith('6'):
        return 'sh'
    # Beijing Stock Exchange codes start with 4, 8 or 92; previously they fell
    # through to the 'sz' branch and were mislabelled as Shenzhen listings.
    if code.startswith(('4', '8', '92')):
        return 'bj'
    return 'sz'


# Restore the leading zeros that are lost whenever the code is parsed as a number.
stocks['code'] = stocks['code'].astype(str).str.zfill(6)
stocks['symbol'] = stocks['code'].map(_exchange_prefix) + stocks['code']

# Make sure the target directory exists so a fresh checkout can run this cell.
os.makedirs('../data/input', exist_ok=True)
stocks.to_csv(f'../data/input/stock_names_full_{today}.csv', index=False)

In [4]:
# Load the full A-share list written earlier in this notebook. Reading the
# dated file (instead of an undated `stock_names_full.csv` that no cell
# produces) keeps the notebook reproducible under Restart & Run All.
stocks_df = pd.read_csv(f'../data/input/stock_names_full_{today}.csv')

# Re-derive the six-digit code from the symbol: the `code` column may have been
# parsed as an integer by read_csv and lost its leading zeros.
stocks_df['code'] = stocks_df['symbol'].str[2:8]
stocks_df['init_digit'] = stocks_df['code'].str[0]

# Exclude the Beijing stocks. Their codes start with 9 (920xxx) and, for older
# listings, with 4 or 8; no Shanghai/Shenzhen A-share code starts with these.
stocks_df = stocks_df[~stocks_df['init_digit'].isin(['4', '8', '9'])]
stocks_df

Unnamed: 0,code,name,symbol,init_digit
0,000001,平安银行,sz000001,0
1,000002,万 科Ａ,sz000002,0
2,000004,*ST国华,sz000004,0
3,000006,深振业Ａ,sz000006,0
4,000007,全新好,sz000007,0
...,...,...,...,...
5166,688799,华纳药厂,sh688799,6
5167,688800,瑞可达,sh688800,6
5168,688819,天能股份,sh688819,6
5169,688981,中芯国际,sh688981,6


### load the financial data

In [5]:
# Every code that survived the filtering above; print the count before the
# manual exclusions so the two sizes can be compared.
STOCK_CODES = stocks_df['code'].tolist()
print(len(STOCK_CODES))

# Drop a hand-picked set of codes from the download list.
STOCK_CODES = [
    code
    for code in STOCK_CODES
    if code not in (
        "600519", "000858", "600938", "000333", "601088", "300866",
        "600900", "600036", "300750", "601899", "603993",
    )
]

len(STOCK_CODES)

5171


5160

In [6]:
# Codes whose download or processing failed, in the order they were attempted.
MISSED_CODE = []
# Failure reason per skipped code, so the skips can be diagnosed afterwards.
MISSED_REASON = {}

# Month-end calendar shared by every stock; it does not depend on the stock,
# so it is built once instead of on every iteration.
date_df = pd.DataFrame(
    pd.date_range(start='2010-12-31', end='2025-12-31', freq='ME'),
    columns=['report_date'],
)

for stock_code in tqdm(STOCK_CODES):
    try:
        # --- get the financial data ---
        raw_df = ak.stock_financial_abstract_ths(symbol=f"{stock_code}", indicator="按单季度")
        # select the key indicators; copy so the assignments below work on an
        # independent frame and do not raise SettingWithCopyWarning
        financial_df = raw_df[['报告期', '每股净资产', '基本每股收益', '净资产收益率']].copy()
        # rename the columns
        financial_df.columns = ['report_date', 'bps', 'eps', 'roe']
        # change the date format
        financial_df['report_date'] = pd.to_datetime(financial_df['report_date'])
        # keep reports dated 2010-01-01 or later, in chronological order: the
        # rolling windows below are only meaningful on ascending dates, so sort
        # explicitly rather than relying on the row order returned by the API
        financial_df = (
            financial_df[financial_df['report_date'] >= '2010-01-01']
            .sort_values('report_date')
            .copy()
        )
        # change the data format
        financial_df['eps'] = financial_df['eps'].astype(float)
        financial_df['roe'] = financial_df['roe'].str.replace('%', '').astype(float)
        financial_df['bps'] = financial_df['bps'].astype(float)
        # trailing-four-quarter values: average bps, summed eps and roe
        financial_df['bps_ttm'] = financial_df['bps'].rolling(window=4).mean()
        financial_df['eps_ttm'] = financial_df['eps'].rolling(window=4).sum()
        financial_df['roe_ttm'] = financial_df['roe'].rolling(window=4).sum()
        # drop rows with any null value (this includes the first three rows,
        # whose rolling windows are incomplete)
        financial_df = financial_df.dropna()

        # --- merge the financial data with standardized report dates ---
        # left-join onto the month-end calendar; validate="1:1" raises when a
        # report date appears more than once, which sends the stock to MISSED_CODE
        financial_date = pd.merge(date_df, financial_df, on='report_date', how='left', validate="1:1")
        financial_date.to_csv(f"../data/input/financial_indicators_{stock_code}_{today}.csv", index=False)
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt can still stop this long-running loop.
        MISSED_CODE.append(stock_code)
        MISSED_REASON[stock_code] = repr(exc)
        continue



  financial_df['report_date'] = pd.to_datetime(financial_df['report_date'])
100%|██████████| 5160/5160 [26:43:45<00:00, 18.65s/it]    
