In [1]:
import tushare as ts
import seaborn as sns
import keras
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
from keras.models import Model
from keras import backend as K
from keras.layers import Input, Dense, LSTM, Dropout, Bidirectional, BatchNormalization, Activation
from keras.callbacks import EarlyStopping, TensorBoard
from keras.optimizers import RMSprop, Adam
from keras.initializers import Orthogonal
from tcn import TCN
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    roc_auc_score, roc_curve, auc, f1_score,
    confusion_matrix, recall_score, 
    precision_recall_fscore_support)
# Ask the inline backend to render figures in the high-resolution 'retina' format.
# NOTE(review): newer IPython releases deprecate IPython.display.set_matplotlib_formats
# in favour of the matplotlib_inline package — confirm against the installed version.
set_matplotlib_formats('retina')
# Let wide DataFrames display up to 100 columns before pandas truncates them.
pd.set_option('display.max_columns', 100)

Using TensorFlow backend.


In [2]:
# Columns to load from the family-firm table: the 'Symbol' key, ownership
# proportions, and indicator-style columns (the suffixed names suggest one-hot
# encodings of family/board/management categories — verify against the source data).
use = ['Symbol', 'ContrshrProportion', 'IsRelatedTrading', 'ShareholderFirstProp',
       'ControlProportion', 'FamEntyp_1.0', 'FamEntyp_2.0', 'FamEntyp_3.0',
       'BoardCode_P3401', 'BoardCode_P3402', 'BoardCode_P3403', 'FamStyle_1',
       'FamStyle_2', 'ManGeneration_1.0', 'ManGeneration_2.0',
       'FamNameStatus_1', 'FamNameStatus_2', 'FamNameStatus_3',
       'FamNameStatus_4', 'FamNameStatus_5']
# index_col=0 turns the first loaded column into the index.
# NOTE(review): presumably that column is 'Symbol' (the merges below join on
# right_on='Symbol') — confirm against the CSV column order.
fam = pd.read_csv('../data/family_firm_clean.csv', index_col=0, usecols=use)

In [3]:
# Negative-class pledge records (per the helper docstrings further down: pledges
# whose position was not liquidated). Only start_date is parsed as a date here.
pledge_neg = pd.read_csv('../data/pledge/pledge_company_neg.csv', 
                         parse_dates=['start_date'])
# Positive-class pledge records (liquidated pledges); these additionally carry
# the close_date at which the position was closed.
pledge_pos = pd.read_csv('../data/pledge/pledge_company_pos.csv',
                        parse_dates=['start_date', 'close_date'])

In [4]:
# Order the negative-class pledges by pledge start date and renumber the rows
# from 0 so that later row-number based slicing follows chronological order.
pledge_neg = (
    pledge_neg
    .sort_values(by=['start_date'])
    .reset_index(drop=True)
)

In [5]:
# Order the positive-class pledges by pledge start date and renumber the rows
# from 0 so that later row-number based slicing follows chronological order.
pledge_pos = (
    pledge_pos
    .sort_values(by=['start_date'])
    .reset_index(drop=True)
)

In [6]:
# Sanity check: (rows, columns) of the positive-class table before the family-firm merge.
pledge_pos.shape

(15891, 29)

In [7]:
# Sanity check: (rows, columns) of the negative-class table before the family-firm merge.
pledge_neg.shape

(19753, 28)

In [8]:
# Left-join the family-firm features onto every negative-class pledge, then
# replace every missing value in the result with 0 (this covers pledges whose
# ts_code has no match in `fam` as well as any other NaN already in the table).
# NOTE(review): the cell overwrites its own input, so running it a second time
# would merge the family columns again — re-run from the load cell instead.
pledge_neg = (
    pledge_neg
    .merge(fam, how='left', left_on='ts_code', right_on='Symbol')
    .fillna(0)
)

In [9]:
# Left-join the family-firm features onto every positive-class pledge, then
# replace every missing value in the result with 0 (this covers pledges whose
# ts_code has no match in `fam` as well as any other NaN already in the table).
# NOTE(review): the cell overwrites its own input, so running it a second time
# would merge the family columns again — re-run from the load cell instead.
pledge_pos = (
    pledge_pos
    .merge(fam, how='left', left_on='ts_code', right_on='Symbol')
    .fillna(0)
)

In [11]:
# Preview the merged negative-class table (pledge fields, financial ratios, family features).
pledge_neg.head()

Unnamed: 0,ts_code,start_date,pledge_price,forcast_close_line,current_ratio,quick_ratio,cash_ratio,inv_turn,ar_turn,ca_turn,netprofit_margin,grossprofit_margin,roe,debt_to_assets,op_yoy,ebt_yoy,assets_yoy,tr_yoy,or_yoy,q_gr_yoy,q_gr_qoq,q_sales_yoy,q_sales_qoq,q_op_yoy,q_op_qoq,q_profit_yoy,q_profit_qoq,revenue,ContrshrProportion,IsRelatedTrading,ShareholderFirstProp,ControlProportion,FamEntyp_1.0,FamEntyp_2.0,FamEntyp_3.0,BoardCode_P3401,BoardCode_P3402,BoardCode_P3403,FamStyle_1,FamStyle_2,ManGeneration_1.0,ManGeneration_2.0,FamNameStatus_1,FamNameStatus_2,FamNameStatus_3,FamNameStatus_4,FamNameStatus_5
0,600998.SH,2016-01-04,17.4032,12.18224,1.3862,1.0302,0.9365,1.7793,1.5325,0.5284,0.9054,7.0569,1.4093,69.3711,16.355,18.608,12.8411,29.3125,29.3125,29.3125,17.6031,29.3125,17.6031,16.355,-56.016,20.5204,-56.8724,45232980000.0,50.19,1.0,23.05,50.19,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1,300071.SZ,2016-01-04,12.6686,8.86802,1.2334,1.2334,1.1324,0.0,0.7313,0.5061,2.8887,15.7362,1.8828,56.5039,-36.094,-18.308,-0.0642,-12.6251,-12.6251,-12.6251,-61.746,-12.6251,-61.746,-36.094,-73.6072,-16.6867,-54.4894,2530791000.0,31.24,1.0,31.24,31.24,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
2,300409.SZ,2016-01-04,14.3533,10.04731,3.5132,2.8557,2.8079,0.2906,0.4558,0.105,9.8997,36.0899,1.1187,24.1653,15.8836,13.5719,36.3601,33.6112,33.6112,33.6112,-31.7073,33.6112,-31.7073,15.8836,3277.5147,9.8855,752.0287,565076100.0,37.03,1.0,37.03,37.03,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
3,002706.SZ,2016-01-04,8.1208,5.68456,3.9828,3.4084,2.9236,0.9005,2.7326,0.2363,10.0936,35.6004,2.0447,19.3427,23.7769,27.165,44.0904,25.0799,25.0799,25.0799,-8.2441,25.0799,-8.2441,23.7769,29.9184,27.165,28.6224,1225771000.0,40.34,1.0,10.37,40.34,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
4,300203.SZ,2016-01-04,29.7155,20.80085,1.6633,1.3274,1.202,0.2894,0.2386,0.1043,2.7809,46.0856,0.4277,41.2935,58.1968,55.6273,1.6549,22.8735,22.8735,22.8735,-68.3647,22.8735,-68.3647,58.1968,-104.0298,123.4244,-93.3394,1315154000.0,37.41,1.0,24.65,37.41,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [12]:
# Sanity check after the merge: the row count should match the pre-merge table.
pledge_neg.shape

(19753, 47)

In [13]:
# Sanity check after the merge: the row count should match the pre-merge table.
pledge_pos.shape

(15891, 48)

In [14]:
# List the merged column names; the generators below drop the identifier/date
# columns and keep the rest as static per-pledge features.
pledge_pos.columns

Index(['ts_code', 'start_date', 'pledge_price', 'forcast_close_line',
       'close_date', 'current_ratio', 'quick_ratio', 'cash_ratio', 'inv_turn',
       'ar_turn', 'ca_turn', 'netprofit_margin', 'grossprofit_margin', 'roe',
       'debt_to_assets', 'op_yoy', 'ebt_yoy', 'assets_yoy', 'tr_yoy', 'or_yoy',
       'q_gr_yoy', 'q_gr_qoq', 'q_sales_yoy', 'q_sales_qoq', 'q_op_yoy',
       'q_op_qoq', 'q_profit_yoy', 'q_profit_qoq', 'revenue',
       'ContrshrProportion', 'IsRelatedTrading', 'ShareholderFirstProp',
       'ControlProportion', 'FamEntyp_1.0', 'FamEntyp_2.0', 'FamEntyp_3.0',
       'BoardCode_P3401', 'BoardCode_P3402', 'BoardCode_P3403', 'FamStyle_1',
       'FamStyle_2', 'ManGeneration_1.0', 'ManGeneration_2.0',
       'FamNameStatus_1', 'FamNameStatus_2', 'FamNameStatus_3',
       'FamNameStatus_4', 'FamNameStatus_5'],
      dtype='object')

In [15]:
# Build a {ts_code: price DataFrame} lookup for every stock in the pledge table.
# Only the ts_code column is needed to enumerate the stocks, so load just that
# column instead of parsing the whole pledge file.
codes = set(pd.read_csv('../data/pledge/pledge_full_clean.csv', usecols=['ts_code'])['ts_code'])
code_price_dict = dict()
for code in codes:
    # One CSV per stock; its first column becomes the index.
    price_df = pd.read_csv('../data/price/{}.csv'.format(code), index_col=0)
    # Convert the index to datetimes so the frames can be sliced by pledge dates.
    # NOTE(review): if the files store dates as YYYYMMDD integers, to_datetime
    # would read them as epoch offsets — confirm the on-disk date format.
    price_df.index = pd.to_datetime(price_df.index)
    code_price_dict[code] = price_df

In [16]:
def get_stock_price_by_pos_code(code, lookback,
                                delay, pledge_date,
                                close_position_date,
                                code_price_dict, forcast_close_line,
                                step=4, non_close_rate=4):
    """
    Build sliding price windows for a pledge that was eventually liquidated.

    The stock's price history is restricted to the rows between ``pledge_date``
    and ``close_position_date`` (label slice, both ends included). Windows of
    ``lookback`` consecutive rows are then cut from the end of that range; a
    window with offset ``i`` ends ``i`` rows before the last row, so the last
    row itself never appears in any window.

    Parameters
    ----------
    code : key into ``code_price_dict`` identifying the stock.
    lookback : number of rows in every window.
    delay : offsets ``1 .. delay + 1`` produce "close" windows; larger offsets
        produce "non-close" windows.
    pledge_date, close_position_date : bounds of the price slice.
    code_price_dict : mapping from stock code to a price DataFrame that has
        ``close`` and ``pre_close`` columns. The stored frame is not modified.
        Assumes a sorted date index — TODO confirm.
    forcast_close_line : price level subtracted from ``close`` to form the
        extra ``delta`` feature column.
    step : stride between successive window offsets (default 4, the value that
        was previously hard-coded).
    non_close_rate : at most ``non_close_rate * delay`` offsets are scanned for
        non-close windows (default 4, the value that was previously hard-coded).

    Returns
    -------
    (close_list, non_close_list) : two lists of 2-D arrays, each of shape
        ``(lookback, n_features)`` where the features are the price columns
        without ``pre_close`` plus ``delta``. Either list may be empty when the
        sliced history is too short.
    """
    price_df = code_price_dict[code].drop(['pre_close'], axis=1)
    # Column arithmetic instead of a per-row apply(): same values, no Python
    # call per row. drop() returned a new frame, so the cached one stays intact.
    price_df['delta'] = price_df['close'] - forcast_close_line
    price_df = price_df.loc[pledge_date:close_position_date]
    price_values = price_df.values
    n_rows = len(price_df)

    # Windows that end within delay + 1 rows of the close date ("close" samples).
    usable_close_len = min(n_rows - lookback, delay + 1)
    close_list = [price_values[-lookback - i: -i]
                  for i in range(1, usable_close_len + 1, step)]

    # Windows that end earlier than that ("non-close" samples), capped at
    # non_close_rate * delay offsets.
    usable_non_close_len = min(n_rows - delay - lookback, non_close_rate * delay)
    non_close_list = [price_values[-lookback - i: -i]
                      for i in range(delay + 1, delay + 1 + usable_non_close_len, step)]

    return close_list, non_close_list

In [17]:
def get_stock_price_by_neg_code(code, delay, lookback, pledge_date, code_price_dict, forcast_close_line, step=4):
    """
    Build sliding price windows for a pledge that was not liquidated.

    The stock's price history is restricted to the rows up to and including
    ``pledge_date`` (label slice). Windows of ``lookback`` consecutive rows are
    cut from the end of that range; a window with offset ``i`` ends ``i`` rows
    before the last row, so the last row itself never appears in any window.

    Parameters
    ----------
    code : key into ``code_price_dict`` identifying the stock.
    delay : offsets ``1 .. delay + 1`` are scanned for windows.
    lookback : number of rows in every window.
    pledge_date : upper bound of the price slice.
    code_price_dict : mapping from stock code to a price DataFrame that has
        ``close`` and ``pre_close`` columns. The stored frame is not modified.
        Assumes a sorted date index — TODO confirm.
    forcast_close_line : price level subtracted from ``close`` to form the
        extra ``delta`` feature column.
    step : stride between successive window offsets (default 4, the value that
        was previously hard-coded).

    Returns
    -------
    list of 2-D arrays, each of shape ``(lookback, n_features)`` where the
    features are the price columns without ``pre_close`` plus ``delta``.
    Empty when the sliced history has fewer than ``lookback + 1`` rows.
    """
    price_df = code_price_dict[code].drop(['pre_close'], axis=1)
    # Column arithmetic instead of a per-row apply(): same values, no Python
    # call per row. drop() returned a new frame, so the cached one stays intact.
    price_df['delta'] = price_df['close'] - forcast_close_line
    price_df = price_df.loc[:pledge_date]
    price_values = price_df.values

    # Pre-pledge windows, taken from the most recent rows backwards.
    usable_close_len = min(len(price_df) - lookback, delay + 1)
    return [price_values[-lookback - i: -i]
            for i in range(1, usable_close_len + 1, step)]

In [18]:
def pos_generator(pledge, lookback, delay, min_index, max_index, code_price_dict):
    """
    Collect labelled samples for the liquidated pledges in rows
    ``min_index`` (inclusive) to ``max_index`` (exclusive) of ``pledge``.

    For every pledge row the price windows come from
    ``get_stock_price_by_pos_code``; each window is paired with that pledge's
    static feature vector (all columns except ``ts_code``, ``start_date`` and
    ``close_date``) and a label: 1 for "close" windows, 0 for "non-close" ones.

    ``min_index`` / ``max_index`` are row positions. Rows are read by position
    for both the pledge record and its feature vector, so the two always refer
    to the same row even when the frame's index is not 0..n-1.

    Returns a list of ``(price_window, pledge_features, label)`` tuples.
    """
    all_zip = []
    # Static per-pledge features as a plain array, one row per pledge.
    pledge_values = pledge.drop(['ts_code', 'start_date', 'close_date'], axis=1).values
    for i in range(min_index, max_index):
        single_pledge = pledge.iloc[i]
        pledge_features = pledge_values[i]
        close_list, non_close_list = get_stock_price_by_pos_code(
            code=single_pledge['ts_code'],
            lookback=lookback,
            delay=delay,
            pledge_date=single_pledge['start_date'],
            close_position_date=single_pledge['close_date'],
            code_price_dict=code_price_dict,
            forcast_close_line=single_pledge['forcast_close_line'])
        # Close windows first, then non-close windows, as one flat sample list.
        all_zip.extend((prices, pledge_features, 1) for prices in close_list)
        all_zip.extend((prices, pledge_features, 0) for prices in non_close_list)
    return all_zip

In [19]:
def neg_generator(pledge, lookback, delay, min_index, max_index, code_price_dict):
    """
    Collect labelled samples for the non-liquidated pledges in rows
    ``min_index`` (inclusive) to ``max_index`` (exclusive) of ``pledge``.

    For every pledge row the price windows come from
    ``get_stock_price_by_neg_code``; each window is paired with that pledge's
    static feature vector (all columns except ``ts_code`` and ``start_date``)
    and the label 0.

    ``min_index`` / ``max_index`` are row positions. Rows are read by position
    for both the pledge record and its feature vector, so the two always refer
    to the same row even when the frame's index is not 0..n-1.

    Returns a list of ``(price_window, pledge_features, 0)`` tuples.
    """
    all_zip = []
    # Static per-pledge features as a plain array, one row per pledge.
    pledge_values = pledge.drop(['ts_code', 'start_date'], axis=1).values
    for i in range(min_index, max_index):
        single_pledge = pledge.iloc[i]
        pledge_features = pledge_values[i]
        # Keyword arguments are required here: the helper's positional order is
        # (code, delay, lookback, ...), which differs from this function's.
        price_list = get_stock_price_by_neg_code(
            code=single_pledge['ts_code'],
            lookback=lookback,
            delay=delay,
            pledge_date=single_pledge['start_date'],
            code_price_dict=code_price_dict,
            forcast_close_line=single_pledge['forcast_close_line'])
        all_zip.extend((prices, pledge_features, 0) for prices in price_list)
    return all_zip

In [20]:
# Report how many negative-class and positive-class pledge records are available.
print('neg: {}  pos: {}'.format(len(pledge_neg), len(pledge_pos)))

neg: 19753  pos: 15891


In [21]:
# Window hyper-parameters: rows per price window, and the offset range (in
# rows) that the helper functions scan when cutting windows.
lookback = 48
delay = 30

# The pledge tables were sorted by start_date above, so taking the first 80% of
# rows for training and the remaining 20% for testing is a chronological split.
train_neg_num = int(len(pledge_neg) * 0.8)
train_pos_num = int(len(pledge_pos) * 0.8)

# Each sample list is the negative-class samples followed by the positive-class ones.
train_zip = (
    neg_generator(pledge_neg, lookback, delay, 0, train_neg_num, code_price_dict)
    + pos_generator(pledge_pos, lookback, delay, 0, train_pos_num, code_price_dict)
)
test_zip = (
    neg_generator(pledge_neg, lookback, delay, train_neg_num, len(pledge_neg), code_price_dict)
    + pos_generator(pledge_pos, lookback, delay, train_pos_num, len(pledge_pos), code_price_dict)
)

In [23]:
def get_price_info_target_by_zip(all_zip):
    """
    Split a list of (price_window, pledge_features, label) samples into three
    stacked arrays, keeping the samples in their given order (no shuffling).

    Returns (price_time, pledge_info, targets), each with one leading entry per
    sample.
    """
    def stack_field(position):
        # Gather one component of every sample and stack along a new first axis.
        return np.stack([sample[position] for sample in all_zip])

    return stack_field(0), stack_field(1), stack_field(2)

In [24]:
# Turn the sample lists into stacked arrays: price windows, static pledge
# features, and 0/1 targets, for the training and the test portion respectively.
train_price, train_info, train_target = get_price_info_target_by_zip(train_zip)
test_price, test_info, test_target = get_price_info_target_by_zip(test_zip)

In [32]:
# Tabulate the per-sample pledge features and attach the 0/1 target as an extra column.
# NOTE(review): this cell's execution count (32) is higher than that of the
# MinMax scaling cell (31) located further down, and the preview shows values in
# [0, 1] — so train_info had already been scaled when this ran. A top-to-bottom
# rerun would build this table from unscaled features.
train = pd.DataFrame(train_info)
label = pd.DataFrame(train_target)
# NOTE(review): the label is stored under column name 46 although the preview
# shows feature columns 0-44, so name 45 is skipped — confirm this is intentional.
train[46] = label[0]
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46
0,0.040713,0.040713,0.044665,0.047307,0.062763,3.075524e-07,5e-05,0.014867,0.039876,0.986541,0.877902,0.103702,0.005488,0.165285,0.048398,0.000856,0.000856,0.011603,0.000386,0.011603,0.000386,0.580032,0.489082,0.153621,0.877379,0.004028,0.424978,1.0,0.453911,0.427022,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0
1,0.020103,0.020103,0.01772,0.01607,0.020665,6.260488e-07,2.8e-05,0.027492,0.039855,0.899863,0.877184,0.271962,0.005485,0.164983,0.045906,0.000675,0.000675,0.011423,0.000384,0.011423,0.000384,0.579816,0.488942,0.153596,0.877478,0.005678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.042216,0.042216,0.023471,0.023994,0.039308,7.423608e-07,5e-06,0.010256,0.039861,0.907961,0.876419,0.178934,0.005489,0.165431,0.04497,0.000905,0.000905,0.011651,0.000382,0.011651,0.000382,0.580096,0.488957,0.153634,0.87735,0.000474,0.292186,1.0,0.2414,0.293591,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0
3,0.017561,0.017561,0.012516,0.010812,0.017399,5.145216e-07,1.7e-05,0.020576,0.039854,0.872822,0.877618,0.447431,0.005498,0.166091,0.050085,0.001916,0.001916,0.012651,0.000534,0.012651,0.000534,0.580587,0.489344,0.153641,0.877343,0.019254,0.249332,1.0,0.345085,0.250531,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,0.074037,0.074037,0.018363,0.01476,0.020305,4.278857e-07,4.8e-05,0.026193,0.039859,0.885592,0.876835,0.160976,0.005503,0.166678,0.049243,0.002698,0.002698,0.013425,0.000414,0.013425,0.000414,0.580905,0.489014,0.153706,0.877373,0.003747,0.455142,1.0,0.380463,0.457331,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0


In [34]:
# Persist the training feature table (features plus label column) without the row index.
train.to_csv('train_info.csv', index = False)

In [33]:
# Tabulate the per-sample pledge features of the test portion and attach the 0/1
# target as an extra column. `label` is reused from the training cell.
# NOTE(review): this cell's execution count (33) is higher than that of the
# MinMax scaling cell (31) located further down, and the preview shows values in
# [0, 1] — so test_info had already been scaled when this ran. A top-to-bottom
# rerun would build this table from unscaled features.
test = pd.DataFrame(test_info)
label = pd.DataFrame(test_target)
# NOTE(review): the label is stored under column name 46 although the preview
# shows feature columns 0-44, so name 45 is skipped — confirm this is intentional.
test[46] = label[0]
# Consecutive rows can be identical because every price window cut from the same
# pledge shares that pledge's feature vector.
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46
0,0.019142,0.019142,0.011978,0.012214,0.010286,1.5e-05,0.000347,0.18209,0.039851,0.864152,0.881496,0.421369,0.005484,0.16487,0.045764,0.000729,0.000729,0.011287,0.0004,0.011287,0.0004,0.579587,0.489359,0.153574,0.877443,0.02091,0.578028,1.0,0.598115,0.714461,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0
1,0.019142,0.019142,0.011978,0.012214,0.010286,1.5e-05,0.000347,0.18209,0.039851,0.864152,0.881496,0.421369,0.005484,0.16487,0.045764,0.000729,0.000729,0.011287,0.0004,0.011287,0.0004,0.579587,0.489359,0.153574,0.877443,0.02091,0.578028,1.0,0.598115,0.714461,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0
2,0.019142,0.019142,0.011978,0.012214,0.010286,1.5e-05,0.000347,0.18209,0.039851,0.864152,0.881496,0.421369,0.005484,0.16487,0.045764,0.000729,0.000729,0.011287,0.0004,0.011287,0.0004,0.579587,0.489359,0.153574,0.877443,0.02091,0.578028,1.0,0.598115,0.714461,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0
3,0.019142,0.019142,0.011978,0.012214,0.010286,1.5e-05,0.000347,0.18209,0.039851,0.864152,0.881496,0.421369,0.005484,0.16487,0.045764,0.000729,0.000729,0.011287,0.0004,0.011287,0.0004,0.579587,0.489359,0.153574,0.877443,0.02091,0.578028,1.0,0.598115,0.714461,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0
4,0.019142,0.019142,0.011978,0.012214,0.010286,1.5e-05,0.000347,0.18209,0.039851,0.864152,0.881496,0.421369,0.005484,0.16487,0.045764,0.000729,0.000729,0.011287,0.0004,0.011287,0.0004,0.579587,0.489359,0.153574,0.877443,0.02091,0.578028,1.0,0.598115,0.714461,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0


In [35]:
# Persist the test feature table (features plus label column) without the row index.
test.to_csv('test_info.csv', index = False)

In [31]:
# Rescale every static pledge feature to [0, 1]. The scaler is fitted on the
# training features only and then applied unchanged to the test features.
# NOTE(review): this cell has execution count 31 but is placed after the cells
# numbered 32-35 that export train_info / test_info. Those exports were produced
# from the scaled arrays; on a top-to-bottom rerun this cell would run after the
# export instead. Move it above the export cells to make the notebook reproducible.
# NOTE(review): the cell overwrites its inputs, so running it twice transforms
# test_info a second time.
info_scaler = MinMaxScaler()
train_info = info_scaler.fit_transform(train_info)
test_info = info_scaler.transform(test_info)

In [45]:
# Peek at feature index 3 of the third time step in the first training window.
train_price[0][2][3]

20.2308

In [46]:
# Extract feature index 3 of every time step from each training window as a
# (n_samples, lookback) float matrix.
# Slicing takes the shape from train_price itself instead of a hard-coded
# (442680, 48) buffer, and replaces the element-by-element Python loop.
# NOTE(review): feature index 3 is presumably the close price — confirm against
# the column order of the price CSVs.
price = train_price[:, :, 3].astype(np.float64)

In [47]:
# Save the training price matrix, one row per sample, without the row index.
# Note that `p` and `price` are reused for the test export further down.
p = pd.DataFrame(price)
p.to_csv('train_price.csv', index = False)

In [49]:
# Extract feature index 3 of every time step from each test window as a
# (n_samples, lookback) float matrix.
# Slicing takes the shape from test_price itself instead of a hard-coded
# (66251, 48) buffer, and replaces the element-by-element Python loop.
# NOTE(review): feature index 3 is presumably the close price — confirm against
# the column order of the price CSVs.
price = test_price[:, :, 3].astype(np.float64)

In [50]:
# Sanity check: (samples, lookback rows, features per row) of the test windows.
test_price.shape

(66251, 48, 9)

In [51]:
# Save the test price matrix, one row per sample, without the row index.
# `price` here is the test matrix; it replaced the training matrix of the same name.
p = pd.DataFrame(price)
p.to_csv('test_price.csv', index = False)