In [1]:
%matplotlib qt
import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy import stats
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import gc
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

path = '../data/Debt issuing company 2018 report/'

In [2]:
def drop_out(frame,col,model='Confidence interval',t_alpha=0.95,alpha=2,IQR_rate=1.5,quantile=0.95):
    '''modle: 'gauss','box','quantile' '''
    
    if model == 'Confidence interval':
        u_ = frame[col].mean()
        v_ = frame[col].std()
        interval_ = stats.t.interval(t_alpha,frame[col].count()-1,u_,v_)
        cond_ = (frame[col]<interval_[1])&(frame[col]>interval_[0])
    
    elif model == 'gauss':
        u_ = frame[col].mean()
        v_ = frame[col].std()
        cond_ = np.abs((frame[col]-u_)/v_) < alpha
    
    elif model == 'box':
        q1 = frame[col].quantile(0.25)
        q3 = frame[col].quantile(0.75)
        IQR = (q3-q1)*IQR_rate
        q1 -= IQR ; q3 += IQR
        cond_ = (frame[col]<q3)&(frame[col]>q1)
    
    elif model == 'quantile':
        top_ = frame[col].quantile(quantile)
        bottom_ = frame[col].quantile(1-quantile)
        cond_ = (frame[col]<top_)&(frame[col]>bottom_)
    
    else:
        print('please try again')
        return frame
    
    index_ = np.where(frame[col]!=frame[col],True,
                                                  np.where(cond_,True,False))
    frame = frame.loc[index_,:]
    return frame

In [3]:
def fenbutu(df):
    for i in range(df.shape[1]-2):
        ax = plt.subplot(5,5,i+1)
        ax.scatter(range(df.shape[0]),df.iloc[:,i+2].sort_values(),s=3)
        plt.title(df.columns[i+2])
    plt.suptitle('散点趋势图')

In [4]:
start_year = 2014
end_year = 2017
data_a = DataFrame()
for i in range(start_year,end_year+1):
    if i % 3 ==0:
        print('is concating {} {}/{}'.format(i,i-start_year+1,end_year+1-start_year))
    try:
        path_a = path+f'y/{i}y.xlsx'
        data_a_ = pd.read_excel(path_a)[:-2]
        data_a_.drop(['是否经过审计','审计意见']+[i for i in data_a_.columns if i.find('E')!=-1],axis=1,inplace=True)
    #     data_a_.loc[:,['主营业务收入(亿元)','主营业务利润(亿元)','净利润(亿元)']] = \
    #     data_a_.loc[:,['主营业务收入(亿元)','主营业务利润(亿元)','净利润(亿元)']].apply(lambda x:x/data_a_['报告期'].dt.month)
        data_a = pd.concat([data_a,data_a_])
    except:
        print(f'no {i}')
        pass
del data_a_
gc.collect()
print('finish concat data_y')

print(np.array(list(data_a.isnull().sum(0))))

is concating 2016 3/4
finish concat data_y
[   0    0  144 1739  183  334 2191 2191  478  633  545  268  540  441
  610  762 2195  598 1317 2034 2682  556 2194 2198 2166 1742 5213]


In [5]:
data_a.dropna(how='any',inplace=True)

In [6]:
ci = pd.read_excel(path+'comp_feature/产业类发债企业行业分类0827.xlsx',sheet_name='城投类企业')
ci = ci[['名称', '最新评级', '企业性质', '是否上市','一级分类', '二级分类']]
ci['trade'] = 1

In [7]:
tp = pd.read_excel(path+'comp_feature/产业类发债企业行业分类0827.xlsx',sheet_name='交通运输')
tp = tp[['名称', '最新评级', '企业性质', '是否上市','一级分类', '二级分类']]
tp['trade'] = 2

In [8]:
indestry = pd.read_excel(path+'comp_feature/产业类发债企业行业分类0827.xlsx',sheet_name='产业类企业')
indestry = indestry[['名称', '最新评级', '企业性质', '是否上市','一级分类', '二级分类']]
indestry['trade'] = 3

In [9]:
fi = pd.read_excel(path+'comp_feature/产业类发债企业行业分类0827.xlsx',sheet_name='金融类企业')
fi = tp[['名称', '最新评级', '企业性质', '是否上市','一级分类', '二级分类']]
fi['trade'] = 4

In [10]:
all_com = pd.concat([ci,tp,indestry,fi])[ci.columns]

In [11]:
ci_ = data_a.merge(ci[['名称',]],on='名称',)
tp_ = data_a.merge(tp[['名称',]],on='名称',)
indestry_ = data_a.merge(indestry[['名称',]],on='名称',)
fi_ = data_a.merge(fi[['名称',]],on='名称',)

In [12]:
for k in range(2):
    col_ = np.random.choice(ci_.columns[2:],len(ci_.columns[2:]),replace=False)
    for j in col_:
        ci_ = drop_out(ci_,j,model='gauss',alpha=3)
        tp_ = drop_out(tp_,j,model='gauss',alpha=3)
        indestry_ = drop_out(indestry_,j,model='gauss',alpha=3)

In [13]:
for col_ in ci_.columns[2:]:
    ci_.loc[:,col_] = ci_.loc[:,col_] - ci_.loc[:,col_].mean()
    tp_.loc[:,col_] = tp_.loc[:,col_] - tp_.loc[:,col_].mean()
    indestry_.loc[:,col_] = indestry_.loc[:,col_] - indestry_.loc[:,col_].mean()
    fi_.loc[:,col_] = fi_.loc[:,col_] - fi_.loc[:,col_].mean()

In [14]:
ci_.describe()

Unnamed: 0,总资产(亿元),货币资产(亿元),净资产(亿元),总债务(亿元),带息债务(亿元),净债务(亿元),经营活动现金流(亿元),投资活动现金流(亿元),筹资活动现金流(亿元),主营业务收入(亿元),...,净资产回报率(%),流动比率,速动比率,存货周转率,资产负债率,短期债务/总债务,带息债务/总投入资本,货币资金/短期债务,货币资金/总债务,获息倍数
count,2158.0,2158.0,2158.0,2158.0,2158.0,2158.0,2158.0,2158.0,2158.0,2158.0,...,2158.0,2158.0,2158.0,2158.0,2158.0,2158.0,2158.0,2158.0,2158.0,2158.0
mean,5.687634e-13,-2.854271e-15,-2.371659e-14,1.065189e-13,-4.688002e-14,1.468005e-14,-1.41911e-15,-5.729121e-15,-6.760528e-15,1.179861e-14,...,-1.310866e-16,3.015609e-15,1.023895e-15,5.209508e-16,7.27335e-14,-8.164327e-14,-6.11452e-14,9.707506e-16,1.978517e-16,1.196695e-14
std,77.54308,10.57353,38.31201,48.33849,33.73368,27.42204,8.801808,5.379827,11.68438,5.569113,...,1.350341,2.674025,1.002471,2.061806,13.28424,16.7091,14.59014,0.369244,0.1035915,92.43156
min,-139.8709,-12.44473,-68.54198,-71.20154,-45.26775,-41.84513,-31.36531,-21.20696,-35.98823,-8.5532,...,-4.13322,-4.12024,-1.683356,-0.3704134,-38.30098,-40.21521,-35.11235,-0.4883326,-0.1760734,-30.77522
25%,-60.45668,-7.637564,-26.77955,-37.99999,-25.81871,-21.51672,-4.263384,-1.927187,-7.872297,-3.782148,...,-0.9593447,-1.997365,-0.756781,-0.3055384,-9.065604,-12.70696,-10.65247,-0.2712178,-0.0797544,-22.28495
50%,-12.77807,-3.301008,-8.220352,-9.773641,-8.750459,-7.380951,1.495605,1.808913,-1.941821,-1.288612,...,-0.1555197,-0.7105896,-0.205006,-0.2568134,0.5968713,-1.650607,-0.6160953,-0.08868553,-0.0162006,-19.75502
75%,45.83195,4.75647,20.74482,25.68239,18.07023,14.06954,4.861539,3.465567,6.373827,2.293,...,0.8117053,1.335385,0.553519,-0.1597634,9.770071,10.36067,10.02263,0.1561588,0.06280017,-11.11857
max,268.2446,42.37981,154.0855,166.17,129.1188,89.75859,30.03588,17.66028,40.4966,25.29795,...,4.46928,9.51626,3.311444,59.54419,37.66922,49.83179,44.39345,1.528149,0.339983,1515.419


In [15]:
tp_.describe()

Unnamed: 0,总资产(亿元),货币资产(亿元),净资产(亿元),总债务(亿元),带息债务(亿元),净债务(亿元),经营活动现金流(亿元),投资活动现金流(亿元),筹资活动现金流(亿元),主营业务收入(亿元),...,净资产回报率(%),流动比率,速动比率,存货周转率,资产负债率,短期债务/总债务,带息债务/总投入资本,货币资金/短期债务,货币资金/总债务,获息倍数
count,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,...,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0
mean,-9.819422e-14,-1.731077e-15,5.494864e-14,7.267041e-14,1.681618e-14,1.258079e-14,-3.291485e-16,-1.100645e-15,-7.2909e-15,-8.637971e-15,...,3.26362e-15,5.694791e-16,4.419123e-16,1.242057e-14,3.393886e-14,-3.280339e-14,-1.487263e-14,4.837089e-16,-1.195939e-16,2.250052e-15
std,111.5325,13.17085,48.84508,68.67858,49.07353,42.08347,8.223404,10.44166,12.25584,32.94822,...,4.090749,1.025959,0.5784873,90.69759,13.05759,18.93972,15.68246,0.3016366,0.1159368,13.6242
min,-154.7063,-14.03428,-63.57559,-86.8937,-56.60534,-67.39601,-31.64306,-35.70763,-38.19683,-29.2245,...,-12.9758,-1.431279,-0.9882863,-41.8075,-37.52779,-40.37736,-41.92271,-0.4273803,-0.1677664,-20.61794
25%,-89.68791,-9.431194,-34.75279,-53.89504,-37.13337,-31.75402,-4.007343,-4.449518,-7.423486,-21.02806,...,-2.8001,-0.676154,-0.4195113,-40.88455,-8.227088,-14.32726,-11.23623,-0.2125995,-0.08605363,-4.669811
50%,-30.31366,-4.348586,-12.7115,-22.49677,-16.30584,-12.85685,-0.9598116,3.210736,-2.4192,-11.88348,...,-0.7429002,-0.293579,-0.1268363,-36.22865,-0.3818384,-1.589813,0.2749431,-0.06234477,-0.02546908,-3.601486
75%,64.86096,5.484409,19.0339,37.8881,20.29042,17.94716,3.983999,7.633885,5.742544,7.082763,...,2.050575,0.478471,0.2904637,-8.648003,9.057012,12.55774,11.34527,0.137565,0.05260398,-1.094011
max,428.9894,60.98161,213.1756,249.3261,181.7618,143.5618,27.03601,20.08071,50.72891,148.7937,...,13.592,3.979221,1.895714,685.4565,34.48761,53.56984,38.15599,1.314159,0.3680209,102.2267


In [16]:
indestry_.describe()

Unnamed: 0,总资产(亿元),货币资产(亿元),净资产(亿元),总债务(亿元),带息债务(亿元),净债务(亿元),经营活动现金流(亿元),投资活动现金流(亿元),筹资活动现金流(亿元),主营业务收入(亿元),...,净资产回报率(%),流动比率,速动比率,存货周转率,资产负债率,短期债务/总债务,带息债务/总投入资本,货币资金/短期债务,货币资金/总债务,获息倍数
count,5188.0,5188.0,5188.0,5188.0,5188.0,5188.0,5188.0,5188.0,5188.0,5188.0,...,5188.0,5188.0,5188.0,5188.0,5188.0,5188.0,5188.0,5188.0,5188.0,5188.0
mean,-6.40817e-14,-2.439923e-14,1.681581e-14,-9.93267e-14,-5.782404e-15,-2.68686e-14,-7.702911e-15,6.54732e-15,3.3822e-15,8.683194e-15,...,-2.326709e-14,1.27757e-16,-9.266339e-16,-1.483813e-14,-8.685111e-14,-4.03522e-14,-4.371043e-14,1.201226e-16,1.507778e-16,4.899704e-16
std,77.16758,12.11971,27.66899,54.15128,33.44742,26.89224,6.909852,6.492092,8.884632,61.66075,...,6.92216,0.62603,0.4420487,16.49681,14.30365,18.30764,18.30509,0.2225223,0.130898,13.64325
min,-110.3249,-13.3103,-37.60291,-67.45643,-38.70409,-75.62728,-28.19134,-23.85921,-33.50267,-56.27463,...,-26.87534,-1.342241,-0.907665,-7.187776,-43.42687,-55.86433,-47.85039,-0.371745,-0.2184217,-114.2862
25%,-59.47485,-8.821081,-19.20009,-42.18649,-25.87048,-19.8207,-3.162305,-2.673735,-4.586026,-41.59026,...,-4.37162,-0.4250414,-0.319615,-5.824651,-9.918545,-12.59578,-13.66682,-0.1600099,-0.09407446,-5.021624
50%,-17.25084,-3.665027,-6.666051,-14.20362,-9.478631,-7.939092,-1.091418,1.883429,-1.509505,-22.42134,...,-0.9893449,-0.1074414,-0.06716502,-3.763876,1.11738,2.17397,0.1711555,-0.04635296,-0.02854027,-3.626924
75%,43.23477,5.341243,12.34252,28.77907,16.39829,11.90706,2.915339,4.273567,4.00589,17.79998,...,3.95073,0.3075586,0.242385,-0.3692762,10.60048,14.65869,13.67398,0.1097266,0.0645913,-0.3135494
max,317.1852,56.37031,140.5833,195.7358,150.1767,98.79271,29.02928,20.55484,36.66524,314.7427,...,26.99266,2.374359,1.508835,275.4582,39.06043,28.52927,49.14971,0.9074983,0.4622475,163.23


In [17]:
pca = PCA(n_components=3,)
ci__ = pca.fit_transform(ci_.iloc[:,2:],)
pca = PCA(n_components=3,)
tp__ = pca.fit_transform(tp_.iloc[:,2:])
pca = PCA(n_components=3,)
indestry__ = pca.fit_transform(indestry_.iloc[:,2:])
pca = PCA(n_components=3,)
fi__ = pca.fit_transform(indestry_.iloc[:,2:])

In [18]:
pca = PCA(n_components=3,)
ci__ = pca.fit_transform(ci_.iloc[:,2:],)
pca = PCA(n_components=3,)
tp__ = pca.fit_transform(tp_.iloc[:,2:])
pca = PCA(n_components=3,)
indestry__ = pca.fit_transform(indestry_.iloc[:,2:])
pca = PCA(n_components=3,)
fi__ = pca.fit_transform(indestry_.iloc[:,2:])

In [19]:
ina = np.random.permutation(ci__.shape[0])[:100]
inb = np.random.permutation(tp__.shape[0])[:100]
inc = np.random.permutation(indestry__.shape[0])[:100]
ind = np.random.permutation(fi__.shape[0])[:100]

In [20]:
ci___ = ci__[ina]
tp___ = tp__[inb]
indestry___ = indestry__[inc]
fi___ = fi__[ind]

In [21]:
ax = plt.subplot(111, projection='3d')
# ax.scatter(ci___[:,0], ci___[:,1], ci___[:,2], c='g')
# ax.scatter(tp___[:,0], tp___[:,1], tp___[:,2], c='r')
ax.scatter(indestry___[:,0], indestry___[:,1], indestry___[:,2], c='b')
ax.scatter(fi___[:,0], fi___[:,1], fi___[:,2], c='orange')

<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x202d0d1ec88>

In [284]:
ci_.iloc[ina,:].to_excel(r'D:\pycharm_folder\xijing\company\2018-06-07\data\Debt issuing company 2018 report\result\a\城投.xlsx',index=False)
ci_.iloc[ina,:].describe().to_excel(r'D:\pycharm_folder\xijing\company\2018-06-07\data\Debt issuing company 2018 report\result\a\城投_.xlsx',)
tp_.iloc[inb,:].to_excel(r'D:\pycharm_folder\xijing\company\2018-06-07\data\Debt issuing company 2018 report\result\a\交通.xlsx',index=False)
tp_.iloc[inb,:].describe().to_excel(r'D:\pycharm_folder\xijing\company\2018-06-07\data\Debt issuing company 2018 report\result\a\交通_.xlsx',)
indestry_.iloc[inc,:].to_excel(r'D:\pycharm_folder\xijing\company\2018-06-07\data\Debt issuing company 2018 report\result\a\基础设施.xlsx',index=False)
indestry_.iloc[inc,:].describe().to_excel(r'D:\pycharm_folder\xijing\company\2018-06-07\data\Debt issuing company 2018 report\result\a\基础设施_.xlsx',)

In [167]:
ax = plt.subplot(111, projection='3d')
ax.scatter(ci___[:,0], ci___[:,1], ci___[:,2], c='g')
ax.scatter(tp___[:,0], tp___[:,1], tp___[:,2], c='r')
ax.scatter(indestry___[:,0], indestry___[:,1], indestry___[:,2], c='b')
# ax.scatter(fi___[:,0], fi___[:,1], fi___[:,2], c='orange')

<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x28e09b32ba8>

In [67]:
aa = pd.DataFrame(pca.components_,columns=ci_.columns[2:])

In [68]:
aa

Unnamed: 0,总资产(亿元),货币资产(亿元),净资产(亿元),总债务(亿元),带息债务(亿元),净债务(亿元),经营活动现金流(亿元),投资活动现金流(亿元),筹资活动现金流(亿元),主营业务收入(亿元),...,净资产回报率(%),流动比率,速动比率,存货周转率,资产负债率,短期债务/总债务,带息债务/总投入资本,货币资金/短期债务,货币资金/总债务,获息倍数
0,0.67665,0.085482,0.148017,0.484744,0.277161,0.191679,0.017298,-0.023662,0.014575,0.39024,...,-0.001825,-0.000746,-0.000732,-0.004028,0.059481,-0.026317,0.079289,-0.00013,-0.000153,-0.01064
1,-0.271254,0.008509,-0.100696,-0.166625,-0.136052,-0.144561,0.016172,0.00917,-0.026028,0.89921,...,0.033214,-0.001684,3.8e-05,0.030206,-0.001261,0.180446,-0.018148,4.2e-05,0.000489,0.011749
2,0.036956,0.012372,0.053219,-0.000342,-0.008823,-0.021195,-0.009151,-0.032935,0.061781,-0.038465,...,0.049875,0.001529,0.001077,0.006994,-0.01985,-0.018814,-0.033661,0.000402,0.000232,0.046716


In [71]:
from sklearn.externals import joblib

In [89]:
# aaa = pd.DataFrame(ci___.T)
# aaa.to_excel(r'D:\pycharm_folder\xijing\company\2018-06-07\data\Debt issuing company 2018 report\result\ci.xlsx')
# bbb = pd.DataFrame(tp___.T)
# bbb.to_excel(r'D:\pycharm_folder\xijing\company\2018-06-07\data\Debt issuing company 2018 report\result\tp.xlsx')
# ccc = pd.DataFrame(indestry___.T)
# ccc.to_excel(r'D:\pycharm_folder\xijing\company\2018-06-07\data\Debt issuing company 2018 report\result\indesrty.xlsx')

In [114]:
pca = PCA(n_components=2,)
ci__ = pca.fit_transform(ci_.iloc[:,2:],)
pca = PCA(n_components=2,)
tp__ = pca.fit_transform(tp_.iloc[:,2:])
pca = PCA(n_components=2,)
indestry__ = pca.fit_transform(indestry_.iloc[:,2:])
pca = PCA(n_components=2,)
fi__ = pca.fit_transform(indestry_.iloc[:,2:])

In [115]:
ci___ = ci__[np.random.permutation(ci__.shape[0])[:500]]
tp___ = tp__[np.random.permutation(tp__.shape[0])[:500]]
indestry___ = indestry__[np.random.permutation(indestry__.shape[0])[:500]]
fi___ = fi__[np.random.permutation(tp__.shape[0])[:100]]

In [117]:
plt.scatter(ci___[:,0],ci___[:,1],c='g')
plt.scatter(tp___[:,0],tp___[:,1],c='r')
plt.scatter(indestry___[:,0],indestry___[:,1],c='b')

<matplotlib.collections.PathCollection at 0x209dfbd87b8>