In [1]:
%matplotlib qt
import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy import stats
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import gc
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.externals import joblib
from sklearn.svm import SVC
import sklearn.metrics as metrics
import xgboost as xgb
from xgboost import XGBClassifier
import warnings
from hyperopt import tpe,fmin,hp
from hpsklearn import random_forest,HyperoptEstimator
warnings.filterwarnings('ignore')

path = './data/Debt issuing company 2018 report/'

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


In [2]:
# 去掉异常值

def drop_out(frame,col,model='Confidence interval',t_alpha=0.95,alpha=2,IQR_rate=1.5,quantile=0.95):
    '''modle: 'gauss','box','quantile' '''
    
    if model == 'Confidence interval':
        u_ = frame[col].mean()
        v_ = frame[col].std()
        interval_ = stats.t.interval(t_alpha,frame[col].count()-1,u_,v_)
        cond_ = (frame[col]<interval_[1])&(frame[col]>interval_[0])
    
    elif model == 'gauss':
        u_ = frame[col].mean()
        v_ = frame[col].std()
        cond_ = (frame[col]-u_)/v_ < alpha
    
    elif model == 'box':
        q1 = frame[col].quantile(0.25)
        q3 = frame[col].quantile(0.75)
        IQR = (q3-q1)*IQR_rate
        q1 -= IQR ; q3 += IQR
        cond_ = (frame[col]<q3)&(frame[col]>q1)
    
    elif model == 'quantile':
        top_ = frame[col].quantile(quantile)
        bottom_ = frame[col].quantile(1-quantile)
        cond_ = (frame[col]<top_)&(frame[col]>bottom_)
    
    else:
        print('please try again')
        return frame
    
    index_ = np.where(frame[col]!=frame[col],True,
                                                  np.where(cond_,True,False))
    frame = frame.loc[index_,:]
    return frame

# 时间操作

def mms(frame,groupby,col,):
    max_ = dict(frame.groupby(by=groupby,)[col].max())
    min_ = dict(frame.groupby(by=groupby,)[col].min())
    max_ = frame[groupby].map(lambda x:max_[x])
    min_ = frame[groupby].map(lambda x:min_[x])
    result = (frame[col].values - min_)/(max_-min_)
    return [round(i,2) for i in result]

def moving(frame,groupby,time_col,col,wins=3,weight=[0.55,0.3,0.15]):
    '''wins是窗口数,weight是权重,这两个必须对应'''
    frame = frame.sort_values([groupby,time_col])
    for i in range(wins):
        if i == 0 :
            list_ = weight[i]*(frame[col].values)
        else :
            list_ += weight[i]*(frame.groupby(groupby)[col].rolling(i+1).mean().values)
    return list_

def bodonglv(frame,groupby,time_col,col,wins):
    frame = frame.sort_values([groupby,time_col])
    std_ = (frame.groupby(groupby)[col].rolling(wins).std().values)
    mean_ = (frame.groupby(groupby)[col].rolling(wins).mean().values)
    return [round(std_[i]/mean_[i],2) for i in range(len(std_))]

def tongbi(frame,groupby,time_col,col,diff):
    frame = frame.sort_values([groupby,time_col])
    diff_ = frame.groupby(groupby)[col].diff(diff).values
    ori_ = frame[col].values
    return [np.nan]*diff+[round(diff_[i]/abs(ori_[i-diff]),2) for i in range(diff,frame.shape[0])]

# 补充空白值

def diy_ss(frame,weight_dict,quantile=0.2):
    for i in frame.columns:
        if ('%' not in i) and ('/' not in i) and ('率' not in i) and ('倍数' not in i):
            if frame[i].min() >= 0:
                frame[i] = np.log1p(frame[i])
                weight_dict[i] = 'log1p'
            else :
                mean_ = frame[i][(frame[i]>=frame[i].quantile(quantile)) & (frame[i] <= frame[i].quantile(1-quantile))].mean()
                std_ = frame[i][(frame[i]>=frame[i].quantile(quantile)) & (frame[i] <= frame[i].quantile(1-quantile))].std()
                weight_dict[i] = [mean_,std_]
                frame[i] = (frame[i]-mean_)/std_
    return frame,weight_dict

def fillna_(frame,group_col,quantile=0.2,n_epoch=3):
    
    def return_index(aa,bb):
        j=0
        cc = []
        for i in range(len(aa)):
            if aa[i] == False :
                cc.append(aa[i])
            else:
                cc.append(bb[j])
                j += 1
        return cc
    
    frame_col = frame.columns
    
    for l,comp in enumerate(set(group_col)):

        index_y = list(group_col == comp)
        full_col = []
        loss_col = {}

        for col in frame.columns:
            if frame.loc[index_y,col].isnull().sum() == 0:
                full_col.append(col)
            else:
                loss_col[col] = frame.loc[index_y,col].isnull().sum()

        loss_col = sorted(loss_col.items(),key=lambda x:x[1])
        loss_col = [i[0] for i in loss_col]
        
        index_dict = {}
        if len(full_col) == 0:
            index_dict[loss_col[0]] = frame.loc[index_y,loss_col[0]].isnull()
            index_dict[loss_col[0]].fillna(index_dict[loss_col[0]].median(),inplace=True)
            full_col.append(loss_col[0])
            loss_col = loss_col[1:]
            
        for epoch in range(n_epoch):

            if epoch == 0:
                for _,col in enumerate(loss_col):
                    if np.random.rand()>0.75:
                        print(comp,f'{l}/{len(set(group_col))}',col,f'{_}/{len(loss_col)}')
                    index_l = list(frame.loc[index_y,col].isnull())
                    index_f = list(frame.loc[index_y,col].notnull())
                    index_l_ = return_index(index_y,index_l)
                    index_f_ = return_index(index_y,index_f)
                    index_dict[col] = (index_l_,index_f_)
                    rfr = RandomForestRegressor(n_estimators=10,n_jobs=-1,max_features=0.9)
                    rfr.fit(frame.loc[index_f_,full_col],frame.loc[index_f_,col])
                    pre = rfr.predict(frame.loc[index_l_,full_col])
                    frame.loc[index_l_,col] = pre
                    full_col.append(col)
            
            else:
                for col in index_dict:
                    index_l_ = index_dict[col][0]
                    index_f_ = index_dict[col][1]
                    rfr = RandomForestRegressor(n_estimators=20,n_jobs=-1,max_features=0.6)
                    rfr.fit(frame.loc[index_f_,full_col],frame.loc[index_f_,col])
                    pre = rfr.predict(frame.loc[index_l_,full_col])
                    frame.loc[index_l_,col] = pre
                    
    gc.collect()
    return frame[frame_col]

# 独热编码

def one_hot_str(frame,col,replace=True):
    if replace:
        a_ = frame.pop(col)
    else:
        a_ = frame[col]
    a_.fillna('miss',inplace=True)
    a_ = pd.get_dummies(a_,prefix=a_.name)
    frame = pd.concat([frame,a_],axis=1)
    del a_
    return frame
def one_hot_int(frame,col,number):
    a_ = frame.pop(col)
    a_ = pd.qcut(a_,number)
    col_name_ = [a_.name+'_'+str(i+1) for i in range(number)]
    a_ = pd.get_dummies(a_,)
    a_.columns = col_name_
    frame = pd.concat([frame,a_],axis=1)
    del a_,col_name_
    return frame

# 截取时间节点

def getxyb(date,delta_y = 1,):
    '''得到当前日期的前几年最后一天,默认1年'''
    aaa = date - pd.tseries.offsets.DateOffset(years=1*delta_y)
    return aaa + pd.tseries.offsets.DateOffset(years=1,months=1 - aaa.month, days= - aaa.day)
def getxsb(date,delta_s = 1,):
    '''得到当前日期的前几个季度最后一天,默认1季度'''
    aaa = date - pd.tseries.offsets.DateOffset(months=3*delta_s)
    return aaa + pd.tseries.offsets.DateOffset(months=3 - ((aaa.month - 1) % 3), days=-aaa.day)
def getxhyb(date,delta_hy = 1,):
    '''得到当前日期的前几个半年最后一天,默认1半年'''
    aaa = date - pd.tseries.offsets.DateOffset(months=6*delta_hy)
    return aaa + pd.tseries.offsets.DateOffset(months=6 - ((aaa.month - 1) % 6), days=-aaa.day)

# 原始财务数据读取+分布图

start_year = 2013
end_year = 2018
data_a = DataFrame()
for i in range(start_year, end_year+1):
    if i % 3 ==0:
        print('is concating {} {}/{}'.format(i, i-start_year+1, end_year+1-start_year))
    path_a = path+'a/{}a.xlsx'.format(i)
    data_a_ = pd.read_excel(path_a)[:-2]
    data_a_.drop(['是否经过审计','审计意见'] + [i for i in data_a_.columns if i.find('E') >= 0], axis=1, inplace=True)
    data_a_.loc[:, ['主营业务收入(亿元)', '主营业务利润(亿元)', '净利润(亿元)']] = \
    data_a_.loc[:, ['主营业务收入(亿元)', '主营业务利润(亿元)', '净利润(亿元)']].apply(lambda x:x/data_a_['报告期'].dt.month)
    data_a = pd.concat([data_a, data_a_])
del data_a_
gc.collect()
print('finish concat data_y')

print(np.array(list(data_a.isnull().sum(0))))
# data_a.dropna(thresh=data_a.shape[1]-6,inplace=True)

# for i in range(data_a.shape[1]-2):
#     ax = plt.subplot(5,5,i+1)
#     ax.scatter(range(data_a.shape[0]),data_a.iloc[:,i+2].sort_values(),s=3)
#     plt.title(data_a.columns[i+2])
# plt.suptitle('散点趋势图')

# 公司信息读取

# list_all_col = ['名称','主营业务收入(万元)','净资产(万元)', 
#                 '所属省市', '所属县市', '企业性质', '是否上市','成立日期','注册资本(万元)',
#                 '所属行业一级', '所属行业二级', '所属行业三级','所属行业四级']#'最新评级',

# all_com = pd.read_excel(path+'all_company.xlsx',)[:-2]
# all_com = all_com[list_all_col]

# all_com = all_com[['名称','所属省市','所属行业一级', '所属行业二级', '所属行业三级', '所属行业四级' ,
#        '注册资本(万元)','企业性质', '是否上市']]

indestry = pd.read_excel(path+'comp_feature/产业类发债企业行业分类0827.xlsx', sheet_name='产业类企业')
indestry.insert(1, '是否交通', 0)
transport = pd.read_excel(path+'comp_feature/产业类发债企业行业分类0827.xlsx', sheet_name='交通运输')
transport.insert(1, '是否交通', 1)
all_com = pd.concat([indestry, transport,], ignore_index=True)[['名称', '是否交通',
                                                              '最新评级', '企业性质', '是否上市','一级分类', '二级分类']]
del indestry, transport

list_company_property = ['民营企业', '地方国有企业', '中央国有企业']
all_com['企业性质'] = all_com['企业性质'].map(lambda x:x if x in list_company_property else '其他')

# 违约信息读取+筛选第一次

list_re_col = ['发生日期','名称',]#'发行时主体评级'

re_of_de = pd.read_excel(path+'report of defaulted.xlsx',)[:-2]
re_of_de = re_of_de[list_re_col]
gc.collect()
print('违约记录数:', re_of_de['名称'].shape[0])
re_of_de = re_of_de.groupby(['名称',], as_index=False).apply(lambda x:x.sort_values(['发生日期']).iloc[0])
print('违约公司数:', re_of_de['名称'].shape[0])
all_com['whetherin'] = all_com['名称'].isin(re_of_de['名称'])*1  ##  总表中的违约公司
print('总表里的违约公司数:', sum(all_com['whetherin']))
re_of_de['whetherin'] = re_of_de['名称'].isin(all_com['名称'])*1  ##   在总表中有记录的违约公司
print('缺失数:',re_of_de['名称'].shape[0]-sum(all_com['whetherin']))

loss_re_name = re_of_de.loc[re_of_de['whetherin']==0, '名称']
print('缺失的公司名如下:\n---------------------------\n', pd.unique(loss_re_name), re_of_de.shape)

re_of_de = re_of_de.loc[re_of_de['whetherin']==1,:]

re_of_de = re_of_de.iloc[:,:-1]
all_com = all_com.iloc[:,:-1]

# 时光机器-一个季度前

re_of_de['发生日期'] = re_of_de['发生日期'].map(lambda x:getxsb(x))

# 处理财报数据,如果要画图就在这里改

# data_a = data_a.loc[data_a['名称'].isin(all_com['名称']),:]

# re_of_de = re_of_de.merge(data_a, on=['名称',],)  #留下违约公司所有财报数据re_of_de
# re_of_de = re_of_de.loc[re_of_de['发生日期'] >= re_of_de['报告期'],:]
# re_of_de.insert(0, 'target', (re_of_de['发生日期'] - re_of_de['报告期']).map(lambda x:x.days))

# data_a.insert(0, 'target', -1)
# fin_col = re_of_de.columns
# data_a = data_a.loc[~data_a['名称'].isin(re_of_de['名称']), :]   #总的数据表中剔除re_of_de

# data_2018 = data_a.loc[data_a['报告期'] == '2018-06-30', :]
# data_a = data_a.loc[data_a['报告期'] != '2018-06-30', :]

# data_a.dropna(inplace=True)  #总表剔除nan

# for k in range(2):
#     col_ = np.random.choice(data_a.columns[3:-1],len(data_a.columns[3:-1]),replace=False,)
#     for j in col_:
#         data_a = drop_out(data_a, j, model='gauss', alpha=3)

# a = pd.concat([re_of_de, data_2018, data_a])[fin_col]

# del re_of_de, data_a
# gc.collect()

# weight_dict = {}
# a.iloc[:, 4:-1],weight_dict = diy_ss(a.iloc[:, 4:-1], weight_dict, 0)

# a = pd.concat([a, pd.get_dummies(a['报告期'].dt.year)], axis=1)

# com_flist = ['名称', '是否交通', '一级分类', '企业性质', '是否上市']
# a = a.merge(all_com[com_flist], on='名称')

# com_flist_ = com_flist.copy()
# for o in ['名称', '一级分类']:
#     com_flist_.remove(o)

# for i in com_flist_:
#     a = one_hot_str(a, i)

# drop_length = 1
# drop_length += len(set(a['报告期'].dt.year))
# for i in com_flist_:
#     drop_length += len(set(all_com[i]))

# cols_a = list(a)
# cols_a.insert(len(cols_a), cols_a.pop(cols_a.index('一级分类')))
# a = a[cols_a]

# a.iloc[:, 4:-1] = fillna_(a.iloc[:, 4:-1],a['一级分类'] ,n_epoch=4)

# def rechange(dataframe, col, weight):
#     if col in weight.keys() :
#         if weight[col] == 'log1p':
#             return np.exp(dataframe[col]) - 1
#         else:
#             return dataframe[col] * weight[col][1] + weight[col][0]
#     else :
#         return dataframe[col]

# a = a.iloc[:, :drop_length * -1]
# a.insert(a.shape[1],'净利润/带息债务',rechange(a,'净利润(亿元)',weight_dict)/((rechange(a,'带息债务(亿元)',weight_dict))+0.01))
# a[['净利润/带息债务']],weight_dict = diy_ss(a[['净利润/带息债务']],weight_dict,0)

# a.to_excel(path+'2013-2018_a.xlsx',index=False)
# joblib.dump(weight_dict,path+'weight_dict_a.m')

fin_col = re_of_de.columns

del re_of_de
gc.collect()

a = pd.read_excel(path+'2013-2018_a.xlsx')
weight_dict = joblib.load(path+'weight_dict_a.m')

# 违约范围划定

a.loc[a['发生日期'].notnull(),'target'] = a.loc[a['发生日期'].notnull(),'target'].map(lambda x: 1 if x <= 93 else 2)

a.drop('发生日期',axis=1,inplace=True)
re_of_de_0 = a.loc[a['target']==2,:]
re_of_de_1 = a.loc[a['target']==1,:]
data_2018 = a.loc[(a['报告期'] == '2018-06-30')&(a['target'] == -1),:]
data_a = a.loc[(a['报告期'] != '2018-06-30')&(a['target'] == -1),:]
del a
data_a = data_a.loc[data_a['报告期'].between('2016-01-01','2018-03-31')]
gc.collect()

# 处理公司非财报数据

all_com = one_hot_str(all_com,'企业性质',replace=False)
all_com = one_hot_str(all_com,'是否上市',replace=False)

re_of_de_0 = re_of_de_0.merge(all_com,on='名称',)
re_of_de_1 = re_of_de_1.merge(all_com,on='名称',)
data_2018 = data_2018.merge(all_com,on='名称',)
data_a = data_a.merge(all_com,on='名称')
del all_com

# 看看corr

# a = [('净资产回报率(%)', 0.2831613389507573),
#  ('净利润(亿元)', 0.19612521767114766),
#  ('流动比率', 0.10006272874771727),
#  ('净资产(亿元)', 0.07643978218479412),
#  ('主营业务利润(亿元)', 0.06485263851265907),
#  ('货币资金/短期债务', 0.04648683392338189),
#  ('总资产报酬率(%)', 0.03358045966497278),
#  ('筹资活动现金流(亿元)', 0.0299973498895215),
#  ('投资活动现金流(亿元)', 0.02486080556388141),
#  ('总资产(亿元)', 0.02458365655281366),
#  ('短期债务/总债务', 0.023363955568147503),
#  ('经营活动现金流(亿元)', 0.01991941294740471),
#  ('速动比率', 0.017362472591467188),
#  ('货币资产(亿元)', 0.013813465657292835),
#  ('带息债务(亿元)', 0.012756002097363587),
#  ('主营业务收入(亿元)', 0.00853891335415696),
#  ('净债务(亿元)', 0.008103904697081556),
#  ('主营业务利润率(%)', 0.005525472849658592),
#  ('资产负债率', 0.0040236587366119),
#  ('存货周转率', 0.002683313003464182),
#  ('总债务(亿元)', 0.002371458968943401),]
# pd.concat([re_of_de,data_2018,data_a])[[i[0] for i in a]].corr().to_excel(path+'corr.xlsx',)

# list_choich = ['名称', '报告期','获息倍数','货币资金/短期债务','净利润亿元','筹资活动现金流亿元','主营业务利润率%',
#                 '主营业务收入增长率%','投资活动现金流亿元','净资产回报率%','总资产报酬率%','流动比率']


# for i in [re_of_de,data_2018]:
#     i.fillna(0,inplace=True)

# 把标签特征移入data_object

object_list = ['名称', '报告期','最新评级','企业性质', '是否上市', '一级分类', '二级分类',]
data_a_object = pd.DataFrame();data_2018_object = pd.DataFrame();re_object_0 = pd.DataFrame();re_object_1 = pd.DataFrame()

for i in object_list:
    data_a_object = pd.concat([data_a_object,data_a.pop(i)],axis=1)
for i in object_list:
    data_2018_object = pd.concat([data_2018_object,data_2018.pop(i)],axis=1)
for i in object_list:
    re_object_0 = pd.concat([re_object_0,re_of_de_0.pop(i)],axis=1)
for i in object_list:
    re_object_1 = pd.concat([re_object_1,re_of_de_1.pop(i)],axis=1)

re_target_0 = re_of_de_0.pop('target')
re_target_0 = re_target_0.map(lambda x : 0)
re_target_1 = re_of_de_1.pop('target')
data_a_target = data_a.pop('target')
data_a_target = data_a_target.map(lambda x : 0)
data_2018.drop(['target'],axis=1,inplace=True)

# 数据集 -> x1,x2,y1,y2

def get_hxy(length_0=0.6,length_a=0.022,test_size=0.3,simple=False,random_state=None):
    '''横向数据'''
    global x1,x2,y1,y2,list_
    list_ = re_of_de_1.columns
    index_a = np.random.permutation(len(data_a_target))[:int(len(data_a_target)*length_a)]
    x1,x2,y1,y2 = train_test_split(pd.concat([re_of_de_1,data_a.iloc[index_a,:]],),
                                   np.r_[re_target_1,[0]*(len(index_a))],test_size=test_size,random_state=random_state)
    if simple:
        pass

def get_zxy(length_0=0.6,test_size=0.3,simple=False,random_state=None):
    '''纵向数据'''
    global x1,x2,y1,y2,list_
    list_ = re_of_de_1.columns
    index_0 = np.random.permutation(len(re_target_0))[:int(len(re_target_0)*length_0)]
    x1,x2,y1,y2 = train_test_split(pd.concat([re_of_de_1,re_of_de_0.iloc[index_0,:],],),
                                   np.r_[re_target_1,[0]*(len(index_0))],test_size=test_size,random_state=random_state)
    if simple:
        list_ = ['获息倍数','货币资金/短期债务','净利润(亿元)','筹资活动现金流(亿元)','主营业务利润率(%)',
                 '主营业务收入增长率(%)','投资活动现金流(亿元)','净资产回报率(%)','总资产报酬率(%)','流动比率',]
        x1 = x1[list_]
        x2 = x2[list_]

is concating 2013 1/6
is concating 2016 4/6
finish concat data_y
[    0     1  3424  7725  3628  3864  9156  9156  2274  3119  2803   622
  1357  1015  1822  8058  6564  2001  7803  8873  8349  4728  9206  9185
  8989  7734 16029]
违约记录数: 181
违约公司数: 74
总表里的违约公司数: 65
缺失数: 9
缺失的公司名如下:
---------------------------
 ['东兴金满堂商贸有限公司' '山东迪浩耐磨管道股份有限公司' '惠州侨兴电信工业有限公司' '惠州侨兴电讯工业有限公司'
 '甘肃华协农业生物科技股份有限公司' '甘肃宏良皮业股份有限公司' '百花医药集团股份有限公司' '鄂尔多斯市益通路桥有限公司'
 '陕西通海绒业股份有限公司'] (74, 3)


# hpsklearn

In [3]:
def model_score(args):
    n_estimators ,max_depth ,min_samples_split ,max_features = args
    model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
                                   class_weight='balanced',n_jobs=-1,
                                   min_samples_split=min_samples_split,max_features=max_features,)
    model.fit(x1,y1)
    pre = model.predict(x2)
    recall_ = metrics.recall_score(pre,y2)
    accu_ = metrics.accuracy_score(pre,y2)
    auc_ = metrics.roc_auc_score(pre,y2)
    score_ = -(recall_*0.7+auc_*0.15+accu_*0.15)
    return score_

space_dict = {
              'n_estimators':[i for i in range(30,121,5)],
              'max_depth':[2,3,4,5,],
              'min_samples_split':[i for i in range(2,11,2)],
              'max_features':[i/100 for i in range(50,71,5)],
              }
space = [hp.choice(i,space_dict[i]) for i in space_dict.keys()]

result_list = []
for i in range(200):
    get_hxy(simple=True,random_state=i*10+5)
    index = fmin(model_score,space,tpe.suggest,150)

    result_dict = {}
    for i in index.keys():
        result_dict[i] = space_dict[i][index[i]]
    rfc = RandomForestClassifier(n_estimators = result_dict['n_estimators'],
                                 max_depth = result_dict['max_depth'],
                                 min_samples_split = result_dict['min_samples_split'],
                                 max_features = result_dict['max_features'],
                                 class_weight = 'balanced', n_jobs = -1,)
    result_list.append((rfc,result_dict))

In [96]:
get_zxy(simple=True,)

In [88]:
for n,(i,_) in enumerate(result_list):
    get_zxy(random_state=n*10+5)
    i.fit(x1,y1)
# metrics.recall_score(y2,rfc.predict(x2))
    print(0.6*metrics.recall_score(y2,i.predict(x2))+\
          0.2*metrics.roc_auc_score(y2,i.predict(x2))+\
          0.2*metrics.accuracy_score(y2,i.predict(x2)))

0.43427339290146444
0.42708333333333337
0.535
0.6849643493761141
0.4540891039960288


In [65]:
for j in zip(['data_a','re_of_de_0','re_of_de_1'],['data_a_target','re_target_0','re_target_1']):
    pre = [i[0].predict_proba(eval(j[0])[list_])[:,1] for i in result_list]
    pre = [sum(i)/len(result_list) for i in zip(*pre)]
    pre = [1 if i>=0.5 else 0 for i in pre]
    print(f'---{j[0]}---\n',metrics.confusion_matrix(eval(j[1]),pre),'\n-----------')

---data_a---
 [[10161   446]
 [    0     0]] 
-----------
---re_of_de_0---
 [[412  33]
 [  0   0]] 
-----------
---re_of_de_1---
 [[ 0  0]
 [ 8 43]] 
-----------


In [27]:
get_zxy(simple=True)

In [32]:
for n,(i,_) in enumerate(result_list):
    if n == 0:
        feature_importances = pd.Series(i.feature_importances_)
    else:
        feature_importances += pd.Series(i.feature_importances_)
feature_importances = pd.DataFrame(feature_importances/feature_importances.sum(),columns=['feature_importance'],)
feature_importances['id'] = list_
feature_importances.set_index(['id'],inplace=True)
feature_importances.sort_values(by=['feature_importance'],ascending=False).iloc[:10,:]

Unnamed: 0_level_0,feature_importance
id,Unnamed: 1_level_1
获息倍数,0.154681
主营业务利润率(%),0.137175
净利润(亿元),0.135164
流动比率,0.101272
净资产回报率(%),0.086647
筹资活动现金流(亿元),0.083815
货币资金/短期债务,0.080709
总资产报酬率(%),0.079772
投资活动现金流(亿元),0.074477
主营业务收入增长率(%),0.066288


In [28]:
pre_2018 = [i[0].predict_proba(data_2018[list_])[:,1] for i in result_list]
pre_2018 = [sum(i)/len(result_list) for i in zip(*pre_2018)]
pre_2018_ = [1 if i>=0.5 else 0 for i in pre_2018]

sum(pre_2018_)

60

In [30]:
# data_2018_object.insert(0,'0-1 score',pre_2018)

In [31]:
data_2018_object.loc[[True if i==1 else False for i in pre_2018_],:].sort_values(by=['0-1 score'],ascending=False)

Unnamed: 0,0-1 score,名称,报告期,最新评级,企业性质,是否上市,一级分类,二级分类
1865,0.895059,山东龙力生物科技股份有限公司,2018-06-30,AA-,民营企业,是,必需品,食品
2571,0.868933,山东地矿股份有限公司,2018-06-30,AA-,地方国有企业,是,投资贸易,投资管理
1136,0.827068,华天酒店集团股份有限公司,2018-06-30,AA-,地方国有企业,是,可选消费品,酒店餐饮
511,0.759319,内蒙古矿业(集团)有限责任公司,2018-06-30,AA,地方国有企业,否,原材料采掘加工,有色
1704,0.737303,焦作万方铝业股份有限公司,2018-06-30,AA,其他,是,原材料采掘加工,有色
1445,0.729218,上海外滩投资开发(集团)有限公司,2018-06-30,AA+,地方国有企业,否,房地产开发,房地产
2128,0.726486,大唐电信科技股份有限公司,2018-06-30,AA-,中央国有企业,是,高端装备,集成电路
780,0.714669,暴风集团股份有限公司,2018-06-30,A+,民营企业,是,信息技术,软件开发
1556,0.712252,保定天威保变电气股份有限公司,2018-06-30,A+,中央国有企业,是,传统制造业,电气
2198,0.706287,哈尔滨工业投资集团有限公司,2018-06-30,AA,地方国有企业,否,投资贸易,投资管理


In [47]:
# joblib.dump(result_list,path+'../../m_save/2018-06-30_result list_a.m')

['./data/Debt issuing company 2018 report/../../m_save/2018-06-30_result list_a.m']

In [25]:
result_list = joblib.load(path+'../../m_save/2018-06-30_result list_a.m')

In [84]:
name = '青海省投资'

pre_2018[439]

0.25988692581181766

In [None]:
# pd.concat([data_2018_object,data_2018],axis=1).loc[[True if i==1 else False for i in pre_2018_],:].iloc[:,:-6]\
# .to_excel(path+'2018_re名单_60.xlsx',index=False)

## hxy

In [156]:
def model_score(args):
    n_estimators ,max_depth ,min_samples_split ,max_features = args
    model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
                                   class_weight='balanced',n_jobs=-1,
                                   min_samples_split=min_samples_split,max_features=max_features,)
    model.fit(x1,y1)
    pre = model.predict(x2)
    recall_ = metrics.recall_score(pre,y2)
    accu_ = metrics.accuracy_score(pre,y2)
    auc_ = metrics.roc_auc_score(pre,y2)
    score_ = -(recall_*0.6+auc_*0.2+accu_*0.2)
    return score_

space_dict = {
              'n_estimators':[i for i in range(30,121,5)],
              'max_depth':[2,3,4,5,6],
              'min_samples_split':[i for i in range(2,11,2)],
              'max_features':[i/100 for i in range(50,71,5)],
              }
space = [hp.choice(i,space_dict[i]) for i in space_dict.keys()]

result_list_h = []
for i in range(20):
    get_hxy(random_state=i*10)
    index = fmin(model_score,space,tpe.suggest,400)
    if (i-1)%5 == 0:
        print(i)
    result_dict_h = {}
    for i in index.keys():
        result_dict_h[i] = space_dict[i][index[i]]
    rfc = RandomForestClassifier(n_estimators=result_dict_h['n_estimators'],
                             max_depth=result_dict_h['max_depth'],
                             min_samples_split=result_dict_h['min_samples_split'],
                             max_features=result_dict_h['max_features'],
                             class_weight='balanced',n_jobs=-1,)
    result_list_h.append((rfc,result_dict_h))

1
6
11
16


In [158]:
for n,(i,_) in enumerate(result_list_h):
    get_zxy(random_state=n*10)
    i.fit(x1,y1)
# metrics.recall_score(y2,rfc.predict(x2))
    print(0.6*metrics.recall_score(y2,i.predict(x2))+\
          0.2*metrics.roc_auc_score(y2,i.predict(x2))+\
          0.2*metrics.accuracy_score(y2,i.predict(x2)))

0.37761696450732196
0.36839430894308944
0.42666666666666664
0.5730837789661318
0.3044939811367585
0.48141025641025637
0.38513888888888886
0.5159845636819321
0.43333333333333335
0.5295470342910102
0.41768805993203584
0.35188008130081305
0.5249999999999999
0.6809567901234568
0.37426780838917845
0.37162303058387397
0.4889244186046512
0.39874331550802145
0.37426780838917845
0.41794131455399064


In [159]:
for j in zip(['data_a','re_of_de_0','re_of_de_1'],['data_a_target','re_target_0','re_target_1']):
    pre = [i[0].predict_proba(eval(j[0])[list_])[:,1] for i in result_list_h]
    pre = [sum(i)/len(result_list_h) for i in zip(*pre)]
    pre = [1 if i>=0.5 else 0 for i in pre]
    print(f'---{j[0]}---\n',metrics.confusion_matrix(eval(j[1]),pre),'\n-----------')

---data_a---
 [[10430   177]
 [    0     0]] 
-----------
---re_of_de_0---
 [[433  12]
 [  0   0]] 
-----------
---re_of_de_1---
 [[ 0  0]
 [17 34]] 
-----------


In [150]:
for n,(i,_) in enumerate(result_list):
    if n == 0:
        feature_importances = pd.Series(i.feature_importances_)
    else:
        feature_importances += pd.Series(i.feature_importances_)
feature_importances = pd.DataFrame(feature_importances/feature_importances.sum(),columns=['feature_importance'],)
feature_importances['id'] = list(re_of_de_0)
feature_importances.set_index(['id'],inplace=True)
feature_importances.sort_values(by=['feature_importance'],ascending=False).iloc[:10,:]

Unnamed: 0_level_0,feature_importance
id,Unnamed: 1_level_1
流动比率,0.094564
净资产回报率(%),0.080026
经营活动现金流(亿元),0.061908
短期债务/总债务,0.061157
获息倍数,0.060783
净利润(亿元),0.05704
净利润/带息债务,0.048542
主营业务利润率(%),0.04648
主营业务利润(亿元),0.046236
货币资金/短期债务,0.044832


In [160]:
pre_2018 = [i[0].predict_proba(data_2018[list_])[:,1] for i in result_list]
pre_2018 = [sum(i)/len(result_list) for i in zip(*pre_2018)]
pre_2018_ = [1 if i>=0.5 else 0 for i in pre_2018]

sum(pre_2018_)

0

In [152]:
data_2018_object.loc[[True if i==1 else False for i in pre_2018_],:]

Unnamed: 0,名称,报告期,最新评级,企业性质,是否上市,一级分类,二级分类


# XGBboost

## 寻找参数

In [None]:
para_test = {'learning_rate': [i/1000 for i in range(1,10,2)],
              'max_depth':[i for i in range(3,8,2)], 
              'min_child_weight':[1,3,5],
              'gamma':[i/10 for i in range(1,50,8)],
              'subsample':[i/100 for i in range(50,80,5)],
              'colsample_bytree':[i/100 for i in range(30,60,5)],
              'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05],
              'scale_pos_weight':[i for i in range(6,11,2)]}
xgb_ = xgb.XGBClassifier(objective= 'binary:logistic',eval_metric='auc',n_jobs=-1,early_stopping_rounds=30)
flod = KFold(n_splits=4)

grid = GridSearchCV(xgb_,param_grid=para_test,scoring='roc_auc',cv=flod,n_jobs=-1,iid=False, )

In [51]:
feature_importance_mean = pd.Series()
xgb_clone = []
def modelfit(alg,feature_importance_mean,xgb_clone,useTrainCV=True,cv_folds=4, early_stopping_rounds=50, n_epoch=5,):
    get_zxy()
    for i in range(n_epoch):
        if useTrainCV:
            xgb_param = alg.get_xgb_params()
            xgtrain = xgb.DMatrix(x1, label=y1)
            cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                metrics='auc', early_stopping_rounds=early_stopping_rounds,)
            alg.set_params(n_estimators=cvresult.shape[0])
            del xgtrain
            gc.collect()

            if i == 0: #init
                param_test = [{'learning_rate': [0.001,0.004,0.007,0.01]},
                              {'max_depth':[i for i in range(3,8,2)], 'min_child_weight':[1,3,5]},
                              {'gamma':[i/10 for i in range(1,50,2)]},
                              {'subsample':[i/100 for i in range(50,80,3)],'colsample_bytree':[i/100 for i in range(30,60,3)]},
                              {'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]},
                              {'scale_pos_weight':[1,2,3,5]}]
                for i in range(len(param_test)):
                    gsearch = GridSearchCV(estimator = alg, param_grid = param_test[i], scoring='roc_auc',iid=False, cv=cv_folds)
                    gsearch.fit(x1,y1)
                    alg = gsearch.best_estimator_

            param_test_little_list = [['learning_rate'],
                                      ['max_depth','min_child_weight'],
                                      ['gamma'],
                                      ['subsample','colsample_bytree'],
                                      ['reg_alpha'],
                                      ['scale_pos_weight']]
            for little in range(len(param_test_little_list)):
                param_test_little_dict = {}
                for j in param_test_little_list[little]:
                    if little == 1:
                        param_test_little_dict[j] = [i for i in range(int(xgb1.get_xgb_params()[j])-1,int(xgb1.get_xgb_params()[j])+2)]
                    elif little == 3:
                        param_test_little_dict[j] = [i/1000. for i in np.linspace(int(1000*round(xgb1.get_xgb_params()[j],2)*0.9),
                                                                      int(1000*round(xgb1.get_xgb_params()[j],2)*1.1),
                                                                      3)]
                    else:
                        param_test_little_dict[j] = [i/1000. for i in np.linspace(int(1000*round(xgb1.get_xgb_params()[j],2)*0.9),
                                                                      int(1000*round(xgb1.get_xgb_params()[j],2)*1.1),
                                                                      5)]
                gsearch = GridSearchCV(estimator = alg, param_grid = param_test_little_dict, scoring='roc_auc',iid=False, cv=cv_folds)
                gsearch.fit(x1,y1)
                alg = gsearch.best_estimator_

    #Fit the algorithm on the data
    alg.fit(x1, y1,eval_metric='auc')

    #Print model report:
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(y1, alg.predict_proba(x1)[:,1]))
    print ("AUC Score (Test): %f" % metrics.roc_auc_score(y2, alg.predict_proba(x2)[:,1]))
    print('Recall Score (Test): %f' % metrics.recall_score(y2, alg.predict(x2)))

    feature_importance_mean = feature_importance_mean.add(pd.Series(alg.get_booster().get_fscore()),fill_value=0)
    xgb_clone.append(alg)
    return (feature_importance_mean,xgb_clone)

xgb1 = XGBClassifier(
 learning_rate =0.001,
 n_estimators=1000,
 max_depth=3,
 min_child_weight=2,
 gamma=0.1,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
)
feature_importance_mean,xgb_clone = modelfit(xgb1,feature_importance_mean,xgb_clone,useTrainCV=True)

AUC Score (Train): 0.500000
AUC Score (Test): 0.500000
Recall Score (Test): 0.000000


In [53]:
get_zxy()
xgb_clone

[XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=0.72, gamma=0.09, learning_rate=0.0,
        max_delta_step=0, max_depth=2, min_child_weight=1, missing=None,
        n_estimators=1, n_jobs=1, nthread=None, objective='binary:logistic',
        random_state=0, reg_alpha=0.0, reg_lambda=1, scale_pos_weight=0.9,
        seed=None, silent=True, subsample=0.72)]

## 训练

In [32]:
feature_importance_mean = pd.Series()

xgb_epoch = 200
xgb_list = []

for i in range(xgb_epoch):
    get_zxy(simple=True)
#     get_zxy(test_size=0.3,simple=True)
    print(x1.iloc[0,0])
    
    num_rounds = 1000 # 迭代次数
    
    params={
    'booster':'gbtree',
    'objective': 'binary:logistic',
    'n_estimators':np.random.choice([i for i in range(30,60)]),
    'gamma':np.random.choice([i/10 for i in range(1,10)]),  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
    'max_depth':np.random.choice([3,4,5],p=[0.35,0.55,0.10]), # 构建树的深度，越大越容易过拟合
    'lambda':np.random.choice([i/10 for i in range(1,10)]),  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    'subsample':np.random.choice([i/100 for i in range(70,75)]), # 随机采样训练样本
    'colsample_bytree':np.random.choice([i/100 for i in range(35,45)]), # 生成树时进行的列采样
    'min_child_weight':np.random.choice([i for i in range(2,5)]), 
    'eta': np.random.choice([i/1000 for i in range(1,10)]),
    'eval_metric': 'auc',
    'scale_pos_weight':np.random.choice([3,4,5,6])
    }
    
    xgtrain = xgb.DMatrix(x1, label=y1)
    xgtest = xgb.DMatrix(x2,label=y2,)

    watchlist = [(xgtrain, 'train'),(xgtest, 'val')]
    model = xgb.train(params,xgtrain,num_rounds,watchlist,early_stopping_rounds=30,verbose_eval=10)
    xgb_list.append(model)
    if i == 0:
        feature_importance_mean = model.get_fscore()
    else:
        for g in model.get_fscore():
            feature_importance_mean[g] = feature_importance_mean.get(g,0) + model.get_fscore()[g]
    print ("AUC Score (Test): %f" % metrics.roc_auc_score(y2, model.predict(xgtest)))
    print('Recall Score (Test): %f' % metrics.recall_score(y2, [1 if i>=0.5 else 0 for i in model.predict(xgtest)]))
    
feature_importance_mean = pd.Series(feature_importance_mean).sort_values()/xgb_epoch
feature_importance_mean.plot(kind='barh', title='Feature Importance Score of Mean({})'.format(xgb_epoch))
plt.ylabel('Feature Importances')

0.3908154267629185
[0]	train-auc:0.733077	val-auc:0.635933
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.932465	val-auc:0.751844
[20]	train-auc:0.936358	val-auc:0.764489
[30]	train-auc:0.947363	val-auc:0.781349
[40]	train-auc:0.953488	val-auc:0.779241
[50]	train-auc:0.952658	val-auc:0.781349
[60]	train-auc:0.952554	val-auc:0.778188
Stopping. Best iteration:
[35]	train-auc:0.947051	val-auc:0.786618

AUC Score (Test): 0.783456
Recall Score (Test): 0.615385
-5.632517702343907
[0]	train-auc:0.764137	val-auc:0.630756
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.936458	val-auc:0.839433
[20]	train-auc:0.940675	val-auc:0.831169
[30]	train-auc:0.945139	val-auc:0.824675
[40]	train-auc:0.959226	val-auc:0.837662
Stopping. Best iteration:
[12]	train-auc:0.936706	val-auc:0.861865

A

AUC Score (Test): 0.735000
Recall Score (Test): 0.708333
2.43354941045487
[0]	train-auc:0.807309	val-auc:0.702055
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.932153	val-auc:0.731296
[20]	train-auc:0.940718	val-auc:0.747629
[30]	train-auc:0.946117	val-auc:0.743414
[40]	train-auc:0.955669	val-auc:0.752371
[50]	train-auc:0.958368	val-auc:0.762908
[60]	train-auc:0.95951	val-auc:0.765016
[70]	train-auc:0.96169	val-auc:0.765543
[80]	train-auc:0.962313	val-auc:0.763435
[90]	train-auc:0.964805	val-auc:0.75922
[100]	train-auc:0.965116	val-auc:0.756059
[110]	train-auc:0.967919	val-auc:0.751317
Stopping. Best iteration:
[84]	train-auc:0.96169	val-auc:0.768177

AUC Score (Test): 0.749737
Recall Score (Test): 0.576923
-1.253459598163918
[0]	train-auc:0.773747	val-auc:0.543596
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn'

Stopping. Best iteration:
[9]	train-auc:0.934309	val-auc:0.764302

AUC Score (Test): 0.730549
Recall Score (Test): 0.521739
0.3185086134418092
[0]	train-auc:0.792401	val-auc:0.736954
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.94162	val-auc:0.752609
[20]	train-auc:0.952254	val-auc:0.731025
[30]	train-auc:0.95591	val-auc:0.744307
Stopping. Best iteration:
[9]	train-auc:0.939016	val-auc:0.758302

AUC Score (Test): 0.744307
Recall Score (Test): 0.580645
1.413432757176257
[0]	train-auc:0.745306	val-auc:0.685946
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.88894	val-auc:0.703514
[20]	train-auc:0.903355	val-auc:0.688649
[30]	train-auc:0.907254	val-auc:0.71027
Stopping. Best iteration:
[7]	train-auc:0.877603	val-auc:0.735676

AUC Score (Test): 0.701081
Recall Score (Test): 

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.936679	val-auc:0.78858
[20]	train-auc:0.924593	val-auc:0.774691
[30]	train-auc:0.932948	val-auc:0.758745
Stopping. Best iteration:
[7]	train-auc:0.941146	val-auc:0.809156

AUC Score (Test): 0.775720
Recall Score (Test): 0.740741
0.859718992472219
[0]	train-auc:0.73161	val-auc:0.598378
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.886888	val-auc:0.724865
[20]	train-auc:0.888684	val-auc:0.728108
[30]	train-auc:0.902842	val-auc:0.734865
[40]	train-auc:0.910126	val-auc:0.752703
Stopping. Best iteration:
[12]	train-auc:0.895352	val-auc:0.769189

AUC Score (Test): 0.758919
Recall Score (Test): 0.720000
-1.224941237648749
[0]	train-auc:0.805959	val-auc:0.753161
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0

[40]	train-auc:0.978738	val-auc:0.754577
[50]	train-auc:0.978638	val-auc:0.764874
Stopping. Best iteration:
[26]	train-auc:0.973423	val-auc:0.776888

AUC Score (Test): 0.756293
Recall Score (Test): 0.695652
282.3114335675948
[0]	train-auc:0.815214	val-auc:0.59611
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.923428	val-auc:0.690789
[20]	train-auc:0.930498	val-auc:0.693364
[30]	train-auc:0.941831	val-auc:0.695652
Stopping. Best iteration:
[4]	train-auc:0.914101	val-auc:0.703661

AUC Score (Test): 0.700229
Recall Score (Test): 0.608696
-1.215364176281715
[0]	train-auc:0.868875	val-auc:0.642518
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.97311	val-auc:0.693625
[20]	train-auc:0.97093	val-auc:0.708641
[30]	train-auc:0.975187	val-auc:0.719705
Stopping. Best iteration:
[2]	t

[40]	train-auc:0.92726	val-auc:0.798919
Stopping. Best iteration:
[10]	train-auc:0.913307	val-auc:0.814865

AUC Score (Test): 0.798919
Recall Score (Test): 0.720000
-0.04387674750747533
[0]	train-auc:0.785602	val-auc:0.708333
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.944299	val-auc:0.81893
[20]	train-auc:0.958066	val-auc:0.79784
[30]	train-auc:0.960168	val-auc:0.78858
Stopping. Best iteration:
[5]	train-auc:0.911981	val-auc:0.820216

AUC Score (Test): 0.815844
Recall Score (Test): 0.666667
-0.02712478232314355
[0]	train-auc:0.800525	val-auc:0.622785
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.920472	val-auc:0.756962
[20]	train-auc:0.938302	val-auc:0.750633
[30]	train-auc:0.943257	val-auc:0.73481
[40]	train-auc:0.946318	val-auc:0.736076
Stopping. Best iteration:
[1

AUC Score (Test): 0.757437
Recall Score (Test): 0.652174
0.859718992472219
[0]	train-auc:0.802943	val-auc:0.623457
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.925223	val-auc:0.659208
[20]	train-auc:0.932738	val-auc:0.671811
[30]	train-auc:0.925066	val-auc:0.668724
[40]	train-auc:0.924961	val-auc:0.670267
[50]	train-auc:0.92454	val-auc:0.668724
[60]	train-auc:0.9299	val-auc:0.674897
Stopping. Best iteration:
[38]	train-auc:0.925066	val-auc:0.680041

AUC Score (Test): 0.667695
Recall Score (Test): 0.407407
-1.974452701412157
[0]	train-auc:0.866369	val-auc:0.61039
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.944742	val-auc:0.62987
[20]	train-auc:0.949306	val-auc:0.609209
[30]	train-auc:0.953274	val-auc:0.614522
Stopping. Best iteration:
[8]	train-auc:0.936756	val-auc:0.

Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.926852	val-auc:0.779321
[20]	train-auc:0.947977	val-auc:0.801955
[30]	train-auc:0.947451	val-auc:0.77572
[40]	train-auc:0.95176	val-auc:0.783951
Stopping. Best iteration:
[14]	train-auc:0.941093	val-auc:0.829218

AUC Score (Test): 0.796811
Recall Score (Test): 0.592593
-0.8530852271253972
[0]	train-auc:0.733856	val-auc:0.697313
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.904641	val-auc:0.697313
[20]	train-auc:0.900332	val-auc:0.694415
[30]	train-auc:0.913933	val-auc:0.712329
Stopping. Best iteration:
[4]	train-auc:0.887407	val-auc:0.738145

AUC Score (Test): 0.723920
Recall Score (Test): 0.500000
-0.9453974575243121
[0]	train-auc:0.788154	val-auc:0.674131
Multiple eval metrics have been passed: 'val-auc' will be used for e

[30]	train-auc:0.954555	val-auc:0.774145
[40]	train-auc:0.957642	val-auc:0.78169
Stopping. Best iteration:
[16]	train-auc:0.944125	val-auc:0.792757

AUC Score (Test): 0.786720
Recall Score (Test): 0.571429
-2.303903612438139
[0]	train-auc:0.763988	val-auc:0.678276
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.895337	val-auc:0.713695
[20]	train-auc:0.884623	val-auc:0.720189
[30]	train-auc:0.897619	val-auc:0.72137
[40]	train-auc:0.905357	val-auc:0.744982
[50]	train-auc:0.905754	val-auc:0.74085
[60]	train-auc:0.912599	val-auc:0.749705
[70]	train-auc:0.915476	val-auc:0.745573
Stopping. Best iteration:
[46]	train-auc:0.906151	val-auc:0.755018

AUC Score (Test): 0.750295
Recall Score (Test): 0.818182
-0.2305230323716788
[0]	train-auc:0.755002	val-auc:0.658378
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved i

AUC Score (Test): 0.725000
Recall Score (Test): 0.473684
-2.335986768017705
[0]	train-auc:0.77789	val-auc:0.6275
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.872008	val-auc:0.728889
[20]	train-auc:0.890416	val-auc:0.735
[30]	train-auc:0.900051	val-auc:0.731111
[40]	train-auc:0.900456	val-auc:0.716111
[50]	train-auc:0.904108	val-auc:0.711111
Stopping. Best iteration:
[24]	train-auc:0.89285	val-auc:0.748333

AUC Score (Test): 0.711667
Recall Score (Test): 0.458333
-1.655536557889912
[0]	train-auc:0.835808	val-auc:0.603267
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.927429	val-auc:0.638567
[20]	train-auc:0.933555	val-auc:0.651212
[30]	train-auc:0.936462	val-auc:0.67176
[40]	train-auc:0.943937	val-auc:0.674394
[50]	train-auc:0.944145	val-auc:0.682824
[60]	train-auc:0.945

[40]	train-auc:0.947061	val-auc:0.687568
[50]	train-auc:0.943983	val-auc:0.69027
[60]	train-auc:0.9486	val-auc:0.691892
[70]	train-auc:0.952806	val-auc:0.696216
[80]	train-auc:0.955166	val-auc:0.695135
[90]	train-auc:0.956294	val-auc:0.697838
[100]	train-auc:0.959885	val-auc:0.686486
[110]	train-auc:0.961321	val-auc:0.685405
Stopping. Best iteration:
[86]	train-auc:0.954961	val-auc:0.701081

AUC Score (Test): 0.687568
Recall Score (Test): 0.560000
-0.8991083275836459
[0]	train-auc:0.785193	val-auc:0.643056
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.939351	val-auc:0.735
[20]	train-auc:0.947566	val-auc:0.77
[30]	train-auc:0.960649	val-auc:0.758333
[40]	train-auc:0.959838	val-auc:0.765556
Stopping. Best iteration:
[14]	train-auc:0.940568	val-auc:0.791111

AUC Score (Test): 0.788889
Recall Score (Test): 0.833333
-2.054474369723378
[0]	train-auc:0.759834	val-auc:0.63744
Multiple 

[20]	train-auc:0.93557	val-auc:0.834054
[30]	train-auc:0.935467	val-auc:0.831892
[40]	train-auc:0.93834	val-auc:0.809189
Stopping. Best iteration:
[16]	train-auc:0.929619	val-auc:0.862162

AUC Score (Test): 0.820541
Recall Score (Test): 0.840000
-1.974452701412157
[0]	train-auc:0.838602	val-auc:0.65166
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.97004	val-auc:0.739437
[20]	train-auc:0.969774	val-auc:0.720825
[30]	train-auc:0.974457	val-auc:0.697686
Stopping. Best iteration:
[4]	train-auc:0.956577	val-auc:0.751509

AUC Score (Test): 0.705231
Recall Score (Test): 0.571429
-0.655691351171522
[0]	train-auc:0.806744	val-auc:0.6375
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.94645	val-auc:0.762222
[20]	train-auc:0.976876	val-auc:0.781667
[30]	train-auc:0.98357	val-auc:0.7

Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.956731	val-auc:0.79372
[20]	train-auc:0.96427	val-auc:0.77971
[30]	train-auc:0.96722	val-auc:0.776812
Stopping. Best iteration:
[9]	train-auc:0.951486	val-auc:0.795652

AUC Score (Test): 0.785507
Recall Score (Test): 0.733333
-0.2902732541226766
[0]	train-auc:0.782642	val-auc:0.705419
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[10]	train-auc:0.87655	val-auc:0.803941
[20]	train-auc:0.90221	val-auc:0.815764
[30]	train-auc:0.925067	val-auc:0.828079
[40]	train-auc:0.93186	val-auc:0.82266
[50]	train-auc:0.929811	val-auc:0.825123
Stopping. Best iteration:
[25]	train-auc:0.911806	val-auc:0.834975

AUC Score (Test): 0.819212
Recall Score (Test): 0.551724
-4.023464980889163
[0]	train-auc:0.853874	val-auc:0.585513
Multiple eval metrics have been pa

Text(0,0.5,'Feature Importances')

In [20]:
# joblib.dump(xgb_list, "m_save/xgb_z_250_三个月.m")
xgb_list = joblib.load('m_save/xgb_z.m')

## 原数据回测

In [25]:
get_zxy(simple=True)

In [30]:
xga = xgb.DMatrix(data_a[list_]);xg0 = xgb.DMatrix(re_of_de_0[list_]);xg1 = xgb.DMatrix(re_of_de_1[list_])
for j in zip(['xga','xg0','xg1'],['data_a_target','re_target_0','re_target_1']):
    e = [i.predict(eval(j[0])) for i in xgb_list]
    print(f'---{j[0]}---\n',metrics.confusion_matrix(eval(j[1]),
                                                   np.where(np.array([sum(i)/len(xgb_list) for i in list(zip(*e))])>=0.5,1,0)),'\n-----------')
del xga,xg0,xg1

---xga---
 [[22563   511]
 [    0     0]] 
-----------
---xg0---
 [[325  84]
 [  0   0]] 
-----------
---xg1---
 [[ 0  0]
 [19 63]] 
-----------


## 预测

In [39]:
data_2018[list_].shape

(2810, 10)

In [24]:
xgb_list[0].feature_names

['获息倍数',
 '货币资金/短期债务',
 '净利润(亿元)',
 '筹资活动现金流(亿元)',
 '主营业务利润率(%)',
 '主营业务收入增长率(%)',
 '投资活动现金流(亿元)',
 '净资产回报率(%)',
 '总资产报酬率(%)',
 '流动比率']

In [34]:
xg_2018 = xgb.DMatrix(data_2018[list_])
e_xgb = [i.predict(xg_2018) for i in xgb_list]
e_xgb = np.array([sum(i)/len(xgb_list) for i in list(zip(*e_xgb))])
f = np.where(e_xgb>=0.5,1,0)
del xg_2018
f.sum()

334

## 数据提取

In [35]:
e_xgb[data_2018_object['名称'].map(lambda x:x.find('新疆生产建设兵团第六师国有资产经营有限责任公司')>=0)]

array([0.48229144])

In [162]:
dict(zip(data_2018_object.loc[dlgindex|zjgindex|xmgindex,'名称'],e_xgb[dlgindex|zjgindex|xmgindex]))

{'湛江港(集团)股份有限公司': 0.466981696665287,
 '大连港股份有限公司': 0.5033089598417282,
 '厦门港务控股集团有限公司': 0.4768005536794662}

In [134]:
dlgindex = data_2018_object['名称'].map(lambda x:(x.find('大连港股份')>=0))
zjgindex = data_2018_object['名称'].map(lambda x:(x.find('湛江港')>=0))
xmgindex = data_2018_object['名称'].map(lambda x:(x.find('厦门港务控股')>=0))

In [148]:
a = pd.concat([data_2018_object.loc[dlgindex|zjgindex|xmgindex,'名称'],data_2018.loc[dlgindex|zjgindex|xmgindex,list_]],axis=1).set_index(['名称'])

In [158]:
a.stack().unstack(0).plot()
plt.xticks([i for i in range(a.shape[1])],a.columns,fontsize=12)
plt.legend(fontsize=15,loc='best')

<matplotlib.legend.Legend at 0x1e9990f1c18>

In [159]:
plt.figure()
for i in range(len(a.columns)):
    ax = plt.subplot(5,2,i+1)
    plt.barh(a.index,a.iloc[:,i],0.5)
    plt.title(a.columns[i])
    plt.xticks(rotation=20)

# 随即森林+栅栏搜索

## 栅栏搜索

In [37]:
rfc = RandomForestClassifier(oob_score=True,class_weight={1:10,0:1})

para_list = {
    'n_estimators':range(30,180,5),
    'max_depth':[2,3,4,],
    'min_samples_split':range(2,11,2),
    'max_features':[i/100 for i in range(50,71,5)],
}

flod = KFold(n_splits=4)
grid = GridSearchCV(rfc,param_grid=para_list,scoring='roc_auc',cv=flod,n_jobs=-1)

In [None]:
grid.fit(x1,y1)

## 保存模型

# joblib.dump(grid, "m_save/违约公司纵向比较全特征_auc.m")

## 读取模型

grid_recall = joblib.load('m_save/违约公司纵向比较全特征_recall.m')
grid_auc = joblib.load('m_save/违约公司纵向比较全特征_auc.m')

## 模型参数

print(grid_auc.best_params_)
print('\n--------\n',grid_auc.best_params_)
grid_auc = grid_auc.best_estimator_
grid_recall = grid_recall.best_estimator_

## 特征重要性

In [262]:
dict_auc = dict(zip(x2.columns,grid_auc.feature_importances_))
dict_recall = dict(zip(x2.columns,grid_recall.feature_importances_))

## 更新训练集、测试集

In [59]:
get_zxy()

## 模型效果在原数据上的混淆矩阵

In [342]:
for j in zip(['data_a','re_of_de_0','re_of_de_1'],['data_a_target','re_target_0','re_target_1']):
    for i in ['grid_auc','grid_recall']:
        print(f'{i}---{j[0]}',metrics.confusion_matrix(eval(j[1]),eval(i).predict(eval(j[0]))),'\n-----------')

grid_auc---data_a [[15558    53]
 [    0     0]] 
-----------
grid_recall---data_a [[15561    50]
 [    0     0]] 
-----------
grid_auc---re_of_de_0 [[437  13]
 [  0   0]] 
-----------
grid_recall---re_of_de_0 [[447   3]
 [  0   0]] 
-----------
grid_auc---re_of_de_1 [[ 0  0]
 [13 13]] 
-----------
grid_recall---re_of_de_1 [[ 0  0]
 [17  9]] 
-----------


## 训练模型 

In [43]:
rfc_list = []
rfc_recall = []
rfc_epoch = 100
for i in range(rfc_epoch):
    get_zxy(test_size=0.25,simple=True)
    rfc = RandomForestClassifier(n_estimators=np.random.choice(range(20,30)),
                                 max_depth=np.random.choice([2,3],p=[0.5,0.5,]),
                                 max_features=np.random.choice([i/100 for i in range(35,55)]),
                                 min_samples_split=np.random.choice([3,4,5],p=[0.15,0.45,0.4]),
                                 n_jobs=-1,class_weight='balanced')
    rfc.fit(x1,y1)
    rfc_recall.append(metrics.recall_score(y2,rfc.predict(x2)))
    rfc_list.append(rfc)
    if i == 0:
        feature_importance = rfc.feature_importances_
    else :
        feature_importance += rfc.feature_importances_
    if i%30 == 0:
        print(i)
feature_importance = [round(i/rfc_epoch,6) for i in feature_importance]
print('avg-recall:',sum(rfc_recall)/rfc_epoch)
pd.Series(dict(zip(x2.columns,feature_importance))).sort_values().plot(kind='barh')

0
30
60
90
avg-recall: 0.5185545087003685


<matplotlib.axes._subplots.AxesSubplot at 0x20227221e48>

In [25]:
print('\n',np.array(sorted(dict(zip(x2.columns,feature_importance)).items(),key=lambda x:x[1],reverse=True)))


 [['主营业务利润率(%)' '0.140116']
 ['净资产回报率(%)' '0.131204']
 ['净利润(亿元)' '0.122906']
 ['获息倍数' '0.111095']
 ['流动比率' '0.09749']
 ['主营业务收入增长率(%)' '0.093511']
 ['筹资活动现金流(亿元)' '0.085014']
 ['总资产报酬率(%)' '0.080298']
 ['货币资金/短期债务' '0.075792']
 ['投资活动现金流(亿元)' '0.062573']]


In [None]:
# joblib.dump(rfc_list, "m_save/rfc_z_250_三个月.m")

In [47]:
rfc_list = joblib.load("m_save/clf_grid12.m")

## 原数据回测

In [44]:
get_hxy()

In [58]:
for j in zip(['data_a','re_of_de_0','re_of_de_1'],['data_a_target','re_target_0','re_target_1']):
    e = [i.predict_proba(eval(j[0])[list_])[:,1] for i in rfc_list]
    print(f'---{j[0]}---\n',metrics.confusion_matrix(eval(j[1]),
                                                   np.where(np.array([sum(i)/rfc_epoch for i in list(zip(*e))])>=0.5,1,0)),'\n-----------')
# [ 1 if ((sum(i)/epoch)>=0.5) else 0 for i in list(zip(*e)) ]

---data_a---
 [[22237   837]
 [    0     0]] 
-----------
---re_of_de_0---
 [[370  39]
 [  0   0]] 
-----------
---re_of_de_1---
 [[ 0  0]
 [34 48]] 
-----------


## 预测

In [40]:
e_rfc = [i.predict_proba(data_2018[list_])[:,1] for i in rfc_list]
e_rfc = np.array([sum(i)/len(rfc_list) for i in list(zip(*e_rfc))])
f = np.where(e_rfc>=0.5,1,0)

f.sum()

113

## 数据提取

In [41]:
e_rfc[data_2018_object['名称'].map(lambda x:x.find('新疆生产建设兵团第六师国有资产经营有限责任公司')>=0)]

array([0.38275803])

In [42]:
data_2018.loc[data_2018_object['名称'].map(lambda x: True if x.find('新疆生产建设兵团第六师国有资产经营有限责任公司')!=-1 else False),:]

Unnamed: 0,总资产(亿元),货币资产(亿元),净资产(亿元),总债务(亿元),带息债务(亿元),净债务(亿元),经营活动现金流(亿元),投资活动现金流(亿元),筹资活动现金流(亿元),主营业务收入(亿元),...,净利润/带息债务,企业性质_中央国有企业,企业性质_其他,企业性质_地方国有企业,企业性质_民营企业,是否上市_否,是否上市_是,注册资本(万元)_1,注册资本(万元)_2,注册资本(万元)_3
1723,0.802212,-1.047194,2.233281,0.394321,0.756973,1.508769,0.313063,0.277939,-0.823264,-0.576479,...,-1.885398,0,0,1,0,1,0,0,0,1


## LR

In [38]:
para_list = {
    'lr__penalty':['l2','l1'],
    'lr__tol':[i/10000 for i in range(1,101,20)],
    'lr__C':[0.1,0.5,1,5,10],
    'lr__class_weight':[dict([(0,i),(1,1-i)]) for i in np.linspace(0.1,0.5,5)]+['balanced']
}

pp_lr = Pipeline([('pn',PolynomialFeatures()),
                      ('lr',LogisticRegression(fit_intercept=False,))])
nflod = KFold(n_splits=4,shuffle=True)
grid = GridSearchCV(pp_lr,para_list,n_jobs=-1,cv=nflod,scoring='recall')

In [None]:
get_zxy()
grid.fit(x1,y1)

In [97]:
grid.best_params_

{'lr__C': 1,
 'lr__class_weight': 'balanced',
 'lr__penalty': 'l1',
 'lr__tol': 0.0081}

In [53]:
lr_epoch = 10
lr_list = []
for i in range(lr_epoch):
    get_zxy(length_0=.4,simple=True)
    lr = LogisticRegression(fit_intercept=False,C=np.random.choice([2,4,6],p=[0.3,0.4,0.3]),
                    class_weight=np.random.choice(['balanced',]+[dict([(0,i),(1,1-i)]) for i in np.linspace(0.2,0.4,3)],p=[0.25,0.25,0.25,0.25]),
                    penalty=np.random.choice(['l1','l2'],p=[0.25,0.75]),
                    tol=np.random.choice([i/50 for i in range(74,87,2)],)           )
    lr.fit(x1,y1)
    lr_list.append(lr)
    if i == 0:
        feature_weight = np.array(lr.coef_).reshape(-1)
    else:
        feature_weight += np.array(lr.coef_).reshape(-1)

feature_weight = pd.Series(feature_weight/lr_epoch,index=x2.columns).sort_values()
feature_weight.plot(kind='barh',title=f'lr of epoch({lr_epoch})')

<matplotlib.axes._subplots.AxesSubplot at 0x1b23f61d9e8>

## 回测原数据

In [54]:
for j in zip(['data_a','re_of_de_0','re_of_de_1'],['data_a_target','re_target_0','re_target_1']):
    e = [i.predict_proba(eval(j[0])[list_])[:,1] for i in lr_list]
    print(f'---{j[0]}---\n',metrics.confusion_matrix(eval(j[1]),
                                                   np.where(np.array([sum(i)/lr_epoch for i in list(zip(*e))])>=0.5,1,0)),'\n-----------')

---data_a---
 [[14195  8879]
 [    0     0]] 
-----------
---re_of_de_0---
 [[187 222]
 [  0   0]] 
-----------
---re_of_de_1---
 [[ 0  0]
 [19 63]] 
-----------


## 预测 

In [55]:
e_lr = [i.predict_proba(data_2018[list_])[:,1] for i in lr_list]
e_lr = np.array([sum(i)/lr_epoch for i in list(zip(*e_lr))])
f = np.where(e_lr>=0.5,1,0)

f.sum()

1037

## 数据提取

In [None]:
e_lr[data_2018_object['名称'].map(lambda x:x.find('新光')>=0)]

## SVM 

In [27]:
para_list = {'C':[1,5,10],
             'kernel':['linear','poly','rbf'],
             'degree':[2,3]}
svc = SVC(class_weight='balanced',)
nflod = KFold(n_splits=4,shuffle=True)
grid = GridSearchCV(svc,para_list,n_jobs=-1,cv=nflod,scoring='roc_auc')

In [28]:
get_zxy(simple=True)
grid.fit(x1,y1)

GridSearchCV(cv=KFold(n_splits=4, random_state=None, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [1, 5, 10], 'kernel': ['linear', 'poly', 'rbf'], 'degree': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [29]:
grid.best_params_

{'C': 1, 'degree': 2, 'kernel': 'rbf'}

In [50]:
svc_epoch = 10
svc_list = []
for i in range(svc_epoch):
    get_zxy(length_0=.4,simple=True)
    svc = SVC(C=np.random.choice([1,5],p=[0.6,0.4],),class_weight='balanced',
              kernel=str(np.random.choice(['linear','rbf','poly'],p=[0.5,0.25,0.25])),
              degree=np.random.choice([2,3]),probability=True)
    svc.fit(x1,y1,)
    svc_list.append(svc)
#     if i == 0:
#         feature_weight = np.array(lr.coef_).reshape(-1)
#     else:
#         feature_weight += np.array(lr.coef_).reshape(-1)

# feature_weight = pd.Series(feature_weight/lr_epoch,index=x2.columns).sort_values()
# feature_weight.plot(kind='barh',title=f'lr of epoch({lr_epoch})')

## 原数据回测

In [52]:
for j in zip(['data_a','re_of_de_0','re_of_de_1'],['data_a_target','re_target_0','re_target_1']):
    e = [i.predict_proba(eval(j[0])[list_])[:,1] for i in svc_list]
    print(f'---{j[0]}---\n',metrics.confusion_matrix(eval(j[1]),
                                                   np.where(np.array([sum(i)/svc_epoch for i in list(zip(*e))])>=0.5,1,0)),'\n-----------')

---data_a---
 [[22995    79]
 [    0     0]] 
-----------
---re_of_de_0---
 [[409]] 
-----------
---re_of_de_1---
 [[ 0  0]
 [52 30]] 
-----------


## 预测

In [19]:
e_svc = [i.predict_proba(data_2018[list_])[:,1] for i in svc_list]
e_svc = np.array([sum(i)/svc_epoch for i in list(zip(*e_svc))])
f = np.where(e_svc>=0.5,1,0)

f.sum()

NameError: name 'svc_list' is not defined

## 提取数据 

In [None]:
e_svc[data_2018_object['名称'].map(lambda x:x.find('新光')>=0)]

# -------------------------------------------------

# 提取原数据

In [30]:
index_ = pd.concat([data_a_object,data_2018_object])['名称'].map(lambda x : x.find('新疆生产建设兵团第六师国有资产经营有限责任公司')>=0)

In [31]:
outdata = pd.concat([pd.concat([data_a_object,data_2018_object]).loc[index_,['名称','报告期']],\
                     pd.concat([data_a,data_2018]).loc[index_,:]],axis=1)

In [26]:
# outdata.to_excel(path+'新疆生产建设兵团第六师国有资产经营有限责任公司.xlsx',index=False)

In [32]:
outdata = outdata.iloc[:,:-9]

get_zxy(simple=True)

outdata = outdata[list(outdata.columns[:2])+list_+['经营活动现金流(亿元)']]

outdata.set_index('报告期',inplace=True)

outdata.sort_values('报告期',inplace=True)

In [33]:
outdata

Unnamed: 0_level_0,名称,获息倍数,货币资金/短期债务,净利润(亿元),筹资活动现金流(亿元),主营业务利润率(%),主营业务收入增长率(%),投资活动现金流(亿元),净资产回报率(%),总资产报酬率(%),流动比率,经营活动现金流(亿元)
报告期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-03-31,新疆生产建设兵团第六师国有资产经营有限责任公司,-1.45096,0.73514,-1.724131,0.842136,-1.322245,6.391666,2.110962,-4.098005,-1.208708,0.962192,-2.916603
2016-06-30,新疆生产建设兵团第六师国有资产经营有限责任公司,-1.261334,0.745595,-1.140379,-0.911901,-7.32025,-3.102262,2.080562,0.410571,-1.330516,2.003252,-2.543357
2016-12-31,新疆生产建设兵团第六师国有资产经营有限责任公司,-0.833186,-1.630456,-0.712209,-1.891611,-1.396979,-2.21495,1.097175,-1.209295,-0.126586,1.033949,-2.559672
2017-03-31,新疆生产建设兵团第六师国有资产经营有限责任公司,-1.221802,-1.735538,-1.250023,0.266761,-1.16161,1.01363,0.907222,-1.58741,-1.781123,0.86262,-2.346397
2017-06-30,新疆生产建设兵团第六师国有资产经营有限责任公司,-1.053352,-1.820156,-1.170228,-0.783732,-0.991694,1.219125,0.511116,-1.501856,-1.848131,1.292461,-0.702139
2017-09-30,新疆生产建设兵团第六师国有资产经营有限责任公司,-0.653031,-0.939512,-0.974024,1.149206,-0.530799,0.37504,0.438609,-1.41263,-1.804707,2.054206,-2.212696
2018-03-31,新疆生产建设兵团第六师国有资产经营有限责任公司,-1.34524,-2.43051,-1.427194,-0.823264,-2.182058,0.752173,0.277939,-1.78718,-2.001986,1.39063,0.313063


In [34]:
a = [('获息倍数', 0.10293085778684695),('货币资金/短期债务', 0.08466747934544402),('净利润(亿元)', 0.08119844386846073),
     ('筹资活动现金流(亿元)', 0.07392047950839306),('主营业务利润率(%)', 0.07373536447871205),('主营业务收入增长率(%)', 0.0681957815907612),
 ('货币资金/总债务', 0.059822816991053754),('投资活动现金流(亿元)', 0.056517651957992715),('净资产回报率(%)', 0.05488686308572023),
 ('总资产报酬率(%)', 0.036039995560997476),('短期债务/总债务', 0.0350832996152875),('EBITDA/营业总收入', 0.034048366402690815),
 ('经营活动现金流(亿元)', 0.0243060183484769),('EBITDA/带息债务', 0.023188518342373905),('流动比率', 0.020910667354810814),
 ('成立日期', 0.0202111066498566),('经营性现金流/EBITDA', 0.019427371835625647),('货币资产(亿元)', 0.018929246296673925),
 ('净债务(亿元)', 0.015673135947545064),('EBITDA(亿元)', 0.013656585077388917),('带息债务/总投入资本', 0.013195889611687595),('主营业务利润(亿元)', 0.012345481772175503),
 ('总资产(亿元)', 0.01071488982344452),('净资产(亿元)', 0.009601482353160072),('总债务(亿元)', 0.009112083719364693),
 ('主营业务收入(亿元)', 0.006851438201812423),('速动比率', 0.005393658415696681),('存货周转率', 0.00521645305170003),
 ('资产负债率', 0.0034618581489861853),('带息债务(亿元)', 0.002858337636445684),('是否上市_否', 0.002687756282129772),
 ('企业性质_民营企业', 0.0012106209382845353),('注册资金低', 0.0),('注册资金中等', 0.0),
 ('注册资金高', 0.0),('企业性质_中外合资企业', 0.0),('企业性质_中央国有企业', 0.0),('企业性质_地方国有企业', 0.0),
 ('企业性质_外商独资企业', 0.0),('企业性质_外资企业', 0.0),('企业性质_集体企业', 0.0),('是否上市_是', 0.0)]
a = dict(a)
a = [a[i] for i in outdata.columns[1:]]
a = [j/sum(a) for j in a]
outdata['加权平均'] = outdata.iloc[:,1:].values.dot(np.array(a).reshape(-1,1))

In [53]:
z = pd.read_excel(path+'2017a.xlsx')

In [54]:
z = z[list(z.columns[:2])+list_+['经营活动现金流(亿元)']]

In [55]:
z = z.loc[z['名称']=='新疆生产建设兵团第六师国有资产经营有限责任公司',:]

In [56]:
z = z.iloc[0,:]

In [57]:
z.fillna(1.3,inplace=True)

In [44]:
z

名称              新疆生产建设兵团第六师国有资产经营有限责任公司
报告期                 2017-12-31 00:00:00
获息倍数                                1.3
货币资金/短期债务                     0.0820865
净利润(亿元)                        -3.50252
筹资活动现金流(亿元)                     11.2072
主营业务利润率(%)                      -8.3078
主营业务收入增长率(%)                    14.6426
投资活动现金流(亿元)                  -0.0295982
净资产回报率(%)                        0.2535
总资产报酬率(%)                       -1.1393
流动比率                             2.1238
经营活动现金流(亿元)                    -15.1898
Name: 3937, dtype: object

In [58]:
z = pd.DataFrame(z).stack().unstack(0).set_index('报告期',)

In [59]:
weight = joblib.load(path+'weight_dict_a.m')

In [60]:
for i in range(z.shape[1]-1):
    z.iloc[:,i+1] =(z.iloc[:,i+1]-weight[z.columns[i+1]][0])/weight[z.columns[i+1]][1]

In [61]:
z

Unnamed: 0_level_0,名称,获息倍数,货币资金/短期债务,净利润(亿元),筹资活动现金流(亿元),主营业务利润率(%),主营业务收入增长率(%),投资活动现金流(亿元),净资产回报率(%),总资产报酬率(%),流动比率,经营活动现金流(亿元)
报告期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-12-31,新疆生产建设兵团第六师国有资产经营有限责任公司,-1.1993,-2.42139,-22.5031,1.45793,-3.34059,0.493806,1.10189,-1.43402,-3.18212,1.50703,-6.27212


In [52]:
outdata

Unnamed: 0_level_0,名称,获息倍数,货币资金/短期债务,净利润(亿元),筹资活动现金流(亿元),主营业务利润率(%),主营业务收入增长率(%),投资活动现金流(亿元),净资产回报率(%),总资产报酬率(%),流动比率,经营活动现金流(亿元),加权平均
报告期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016-03-31,新疆生产建设兵团第六师国有资产经营有限责任公司,-1.45096,0.73514,-1.724131,0.842136,-1.322245,6.391666,2.110962,-4.098005,-1.208708,0.962192,-2.916603,-0.039001
2016-06-30,新疆生产建设兵团第六师国有资产经营有限责任公司,-1.261334,0.745595,-1.140379,-0.911901,-7.32025,-3.102262,2.080562,0.410571,-1.330516,2.003252,-2.543357,-1.337332
2016-12-31,新疆生产建设兵团第六师国有资产经营有限责任公司,-0.833186,-1.630456,-0.712209,-1.891611,-1.396979,-2.21495,1.097175,-1.209295,-0.126586,1.033949,-2.559672,-1.070478
2017-03-31,新疆生产建设兵团第六师国有资产经营有限责任公司,-1.221802,-1.735538,-1.250023,0.266761,-1.16161,1.01363,0.907222,-1.58741,-1.781123,0.86262,-2.346397,-0.753055
2017-06-30,新疆生产建设兵团第六师国有资产经营有限责任公司,-1.053352,-1.820156,-1.170228,-0.783732,-0.991694,1.219125,0.511116,-1.501856,-1.848131,1.292461,-0.702139,-0.761336
2017-09-30,新疆生产建设兵团第六师国有资产经营有限责任公司,-0.653031,-0.939512,-0.974024,1.149206,-0.530799,0.37504,0.438609,-1.41263,-1.804707,2.054206,-2.212696,-0.417947
2018-03-31,新疆生产建设兵团第六师国有资产经营有限责任公司,-1.34524,-2.43051,-1.427194,-0.823264,-2.182058,0.752173,0.277939,-1.78718,-2.001986,1.39063,0.313063,-1.105021


In [62]:
z['加权平均'] = z.iloc[:,1:].values.dot(np.array(a).reshape(-1,1))

In [63]:
z = pd.concat([outdata,z])[outdata.columns]

In [64]:
outdata = z.sort_index()

In [65]:
for i in range(outdata.shape[1]-1):
    ax = plt.subplot(4,3,i+1)
    outdata.iloc[:,i+1].plot(label=outdata.columns[i+1],)
    ax.legend(loc='best',prop={'size':11})
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    ax.set_xlabel('')
plt.suptitle('新疆生产建设兵团第六师国有资产经营有限责任公司')
plt.show()

# 画图 

## -------------

In [None]:
data_object['result'] = a.predict(data_2018)

In [None]:
data_object[data_object['result']==1].shape

In [None]:
a = dict(a)
a = [j for j in a.values()][:10]+[a['经营活动现金流(亿元)']]
a = [j/sum(a) for j in a]

In [None]:
def pct_ch(dataframe):
    for i in range(dataframe.shape[1]):
        name = dataframe.columns[i]+'_rate'
        dataframe[name] = pd.Series()
        for j in range(dataframe.shape[0]-1):
            dataframe.iloc[j+1,-1] = (dataframe.iloc[j+1,i]-dataframe.iloc[j,i])/abs(dataframe.iloc[j,i])
    return dataframe.iloc[1:,:]

b_median = b_.iloc[:,1:].groupby('报告期').median().iloc[::-1,:]
b_median['加权平均'] = b_median.values.dot(np.array(a).reshape(-1,1))
b_median = pct_ch(b_median)
b_median.index = [i for i in range(-b_median.shape[0]+1,0+1)]

b_mean = b_.iloc[:,1:].groupby('报告期').mean().iloc[::-1,:]
b_mean['加权平均'] = b_mean.values.dot(np.array(a).reshape(-1,1))
b_mean = pct_ch(b_mean)
b_mean.index = [i for i in range(-b_mean.shape[0]+1,0+1)]

lenght = int(prepared_frame.shape[1]/2)

prepared_frame = b_median
plt.figure()
for i in range(lenght):
    ax = plt.subplot(4,3,i+1)
    prepared_frame.iloc[:,i].plot(label=prepared_frame.columns[i+lenght],)
    ax.legend(loc='best',prop={'size':11})
    for X,Y,Z in zip(prepared_frame.index,prepared_frame.iloc[:,i],prepared_frame.iloc[:,i+lenght]):
        plt.text(X,Y,'%.2f'%Z,ha='center',va='bottom')
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    if i == 1:
        plt.title('趋势变化图(中位数)',fontdict={'size':20})
plt.show()

prepared_frame = b_mean
plt.figure()
for i in range(lenght):
    ax = plt.subplot(4,3,i+1)
    prepared_frame.iloc[:,i].plot(label=prepared_frame.columns[i],)
    ax.legend(loc='best',prop={'size':11})
    for X,Y,Z in zip(prepared_frame.index,prepared_frame.iloc[:,i],prepared_frame.iloc[:,i+lenght]):
        plt.text(X,Y,'%.2f'%Z,ha='center',va='bottom')
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    if i == 1:
        plt.title('趋势变化图(平均数)',fontdict={'size':20})
plt.show()

In [None]:
b_median.to_excel(path+'median_.xlsx',index=False)
b_mean.to_excel(path+'mean_.xlsx',index=False)

In [None]:
re_of_de = pd.read_excel(path+'report of defaulted.xlsx',)[:-2]

In [None]:
pd.DataFrame(re_of_de['发生日期'].groupby(re_of_de['名称'],).apply(lambda x:x.sort_values('发生日期').iloc[0]))

In [68]:
pd.concat([pd.concat([re_object_0,re_of_de_0],axis=1),pd.concat([re_object_1,re_of_de_1],axis=1)]).drop_duplicates(['名称'])['企业性质_中央国有企业'].sum()

4