# 第一问

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier

from mlxtend.classifier import StackingClassifier
from mlxtend.classifier import StackingCVClassifier

from sklearn.svm import SVC
from sklearn import metrics

from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier as ada
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_auc_score,precision_score,recall_score

In [14]:
#  Voting
data = pd.read_csv('./excel/extract/1/第一问所有特征.csv', encoding='utf-8', index_col='企业代号')
for i in range(len(data)):
    a = 'E' + str(i + 1)
    # One-hot
    if data.loc[a, '是否违约'] == '否':
        data.loc[a, '违约'] = 0
    else:
        data.loc[a, '违约'] = 1

x = data.iloc[:, :-3].values
y = data.iloc[:, -1].values

In [24]:
# init model

lgb = LGBMClassifier(learning_rate=0.05,n_estimators=100,objective='binary',
                           boosting_type='gbdt',
                           num_leaves=2**5,
                           max_depth=5,reg_alpha=0.5,reg_lambda=0.5,
                           metric='auc',subsample=0.75)
LR = LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)
Ada = ada(algorithm='SAMME', base_estimator=None, learning_rate=0.1,
                   n_estimators=100, random_state=30)
GBDT = GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.7, loss='exponential', max_depth=3,
                           max_features='auto', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, 
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=25,
                           n_iter_no_change=None,
                           random_state=30, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
svc = SVC(C=0.8, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=20, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

rf = RF(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=30, verbose=0,
                       warm_start=False)


sclf = StackingCVClassifier(classifiers=[Ada, GBDT, LR,rf],
                            use_probas=True,
                            meta_classifier=svc,
                            random_state=30)

In [25]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=30)

# (x - u) / s  归一化
tranfer = StandardScaler()
x = tranfer.fit_transform(x)
x_train = tranfer.transform(x_train)
x_test = tranfer.transform(x_test)

metric_xlsx = {}
weight = []
for clf, label in zip([LR, Ada, GBDT, svc, rf, sclf,lgb],
                      ['LR',
                       'Ada',
                       'GBDT',
                       'svc',
                       'rf', 'StackingClassifier','LGBMClassifier']):
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)

    tem_1 = [recall_score(y_test, y_predict),
            precision_score(y_test, y_predict),
            metrics.f1_score(y_test, y_predict),
            metrics.accuracy_score(y_test, y_predict),
            metrics.accuracy_score(y_train, clf.predict(x_train)),
            metrics.roc_auc_score(y_test, clf.predict_proba(x_test)[:,1])]


    metric_xlsx[label] = tem_1
    tem = metrics.accuracy_score(y, clf.predict(x))
    weight.append(tem)
    # print('{}在预测集模型的准确率为：\n'.format(label), metrics.accuracy_score(y_test, y_predict))
    # print('{}在训练集模型的准确率为：\n'.format(label), metrics.accuracy_score(y_train, clf.predict(x_train)))
    # print('{}的综合准确率为：\n'.format(label), metrics.accuracy_score(y, clf.predict(x)))
    # print('{}的ROC面积为：'.format(label), metrics.roc_auc_score(y, clf.predict(x)))
    # print('---'*8)
data2 = pd.DataFrame(data=metric_xlsx,index=['train_Accuracy','test_Accuracy',
                                    'Precision','Recall','AUC','f1']).T
data2

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,train_Accuracy,test_Accuracy,Precision,Recall,AUC,f1
LR,0.75,0.75,0.75,0.935484,0.923913,0.953704
Ada,1.0,1.0,1.0,1.0,0.967391,1.0
GBDT,1.0,0.666667,0.8,0.935484,1.0,1.0
svc,0.0,0.0,0.0,0.870968,1.0,1.0
rf,1.0,1.0,1.0,1.0,1.0,1.0
StackingClassifier,1.0,0.666667,0.8,0.935484,1.0,1.0
LGBMClassifier,1.0,1.0,1.0,1.0,0.967391,1.0


In [21]:
print(len(weight))
print(weight)

# 软投票
w = weight/sum(weight)

vote2 = VotingClassifier(estimators=[('LR', LR), ('Ada', Ada), ('GBDT', GBDT),
                                     ('SVC', svc), ('rf', rf),
                                     ('StackingClassifier', sclf)],
                         voting='soft',
                         weights=weight)
vote2.fit(x_train,y_train)
y_predict = vote2.predict(x_test)

print('{}在预测集模型的准确率为：\n'.format('soft Voting'),
      metrics.accuracy_score(y_test, y_predict))
print('{}在训练集模型的准确率为：\n'.format('soft Voting'),
      metrics.accuracy_score(y_train, vote2.predict(x_train)))
print('soft voting的综合表现:\n', metrics.accuracy_score(y, vote2.predict(x)))
print()
print('soft voting的ROC面积：\n', roc_auc_score(y, vote2.predict(x)))

P = vote2.predict_proba(x)[:, 1]
df = pd.DataFrame(data={'违约概率': P})
df.head()

6
[0.926829268292683, 0.975609756097561, 0.983739837398374, 0.967479674796748, 1.0, 0.983739837398374]
soft Voting在预测集模型的准确率为：
 1.0
soft Voting在训练集模型的准确率为：
 1.0
soft voting的综合表现:
 1.0

soft voting的ROC面积：
 1.0


Unnamed: 0,违约概率
0,0.116077
1,0.101989
2,0.088904
3,0.076958
4,0.09679


## ROC曲线的绘制

In [None]:
fpr,tpr,threshold = metrics.roc_curve(y,P)
# 计算AUC的值
roc_auc = metrics.auc(fpr,tpr)

#绘制面积图
plt.figure(figsize=(6,4),dpi=250)
plt.stackplot(fpr,tpr,color='steelblue',alpha=0.5,edgecolor='black')
# 添加边际线
plt.plot(fpr,tpr,color='black',lw=1)
# 添加对角线
plt.plot([0,1],[0,1],color='red',linestyle='--')
# 添加文本信息
plt.text(0.5,0.3,'ROC curve (area = %0.4f)' % roc_auc,fontsize=10)
# 添加x轴坐标与y轴坐标
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')

fpr,tpr,threshold = metrics.roc_curve(y,P)
# 计算AUC的值
print('AUC的值为：',roc_auc)
plt.savefig('./latex/img/1/AUC.png')
plt.show()

In [None]:
df['信誉评级'] = data['信誉评级'].reset_index()['信誉评级']
df.head()

In [None]:
A_aver = sum(df[df['信誉评级']=='A']['违约概率'])/len(df[df['信誉评级']=='A']['违约概率'])
B_aver = sum(df[df['信誉评级']=='B']['违约概率'])/len(df[df['信誉评级']=='B']['违约概率'])
C_aver = sum(df[df['信誉评级']=='C']['违约概率'])/len(df[df['信誉评级']=='C']['违约概率'])
D_aver = sum(df[df['信誉评级']=='D']['违约概率'])/len(df[df['信誉评级']=='D']['违约概率'])
da = {'A企业违约风险':A_aver,'B企业违约风险':B_aver,'C企业违约风险':C_aver,'D企业违约风险':D_aver}
da1 = pd.DataFrame(data=da,index=[0])

da1.to_csv('./excel/result/1/平均违约风险最终结果.csv', encoding='gbk')
da1.head()


# 第二问预测

In [None]:
new_da = pd.read_csv('./excel/extract/1/第一问所有特征.csv', encoding='utf-8')
new_da.head()

In [None]:
new_x = new_da.iloc[:,1:-1].values
new_x = tranfer.transform(new_x)

In [None]:
wieyue = vote2.predict(new_x)
sigma = vote2.predict_proba(new_x)[:, 1]
new_da['是否违约'] = wieyue
new_da['违约风险'] = sigma
new_da.head()
new_da.to_csv('已经判断是否违约.csv', encoding='gbk')


#  第二问循环预测

In [None]:
x = data.drop(['是否违约','信誉评级'],axis=1).values      # 21个特征
y = pd.read_csv('违约风险.csv',encoding='gbk')['评级'].values  # 评价等级编码

In [1]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=30)

tranfer = StandardScaler()
x = tranfer.fit_transform(x)
x_train = tranfer.transform(x_train)
x_test = tranfer.transform(x_test)

# weight = []
# for clf, label in zip([LR, Ada, GBDT, svc, rf, sclf],
#                       ['LR',
#                        'Ada',
#                        'GBDT',
#                        'svc',
#                        'rf', 'StackingClassifier']):
#     clf.fit(x_train, y_train)
#     y_predict = clf.predict(x_test)
#     print('{}在预测集模型的准确率为：\n'.format(label), metrics.accuracy_score(y_test, y_predict))
#     print('{}在训练集模型的准确率为：\n'.format(label), metrics.accuracy_score(y_train, clf.predict(x_train)))
#     print('{}的综合准确率为：\n'.format(label), metrics.accuracy_score(y, clf.predict(x)))
#     tem = metrics.accuracy_score(y, clf.predict(x))
#     weight.append(tem)

# weight
# del weight[-1]

# # 软投票
# w = weight/sum(weight)

# vote2= VotingClassifier(estimators=[('LR',LR),('Ada',Ada), ('GBDT',GBDT), ('SVC',svc),('rf',rf)],
#                           voting='soft',weights=weight)
# vote2.fit(x_train,y_train)
# y_predict = vote2.predict(x_test)
# print('{}在预测集模型的准确率为：\n'.format('soft Voting'),metrics.accuracy_score(y_test,y_predict))
# print('{}在训练集模型的准确率为：\n'.format('soft Voting'),metrics.accuracy_score(y_train,vote2.predict(x_train)))
# print('soft voting的综合表现:\n',metrics.accuracy_score(y,vote2.predict(x)))

NameError: name 'train_test_split' is not defined

# XGB调参

In [None]:
other_params = {'learning_rate': 0.16, 'max_depth': 6, 'min_child_weight': 2, 'seed': 10,'estimator':60,
                    'subsample': 0.8, 'colsample_bytree': 0.9, 'gamma': 0.5, 'reg_alpha': 0.08, 'reg_lambda': 0.12}
from xgboost import XGBClassifier
estimator = XGBClassifier(objective='multi:softmax',num_class=4,eval_metric='auc',**other_params)

# 加入网格搜索和叫交叉验证
#param_dict = {'learning_rate': [i*0.01 for i in range(16,31)]}


#estimator = GridSearchCV(estimator,param_grid=param_dict,cv=5,scoring='accuracy')
estimator.fit(x_train,y_train)

In [None]:
# print('最好分数:',estimator.best_score_)
# print('最佳预估器:',estimator.best_params_)
# print('最佳参数：',estimator.best_estimator_)

y_predict = estimator.predict(x_test)
print('预测集模型的准确率为：\n', metrics.accuracy_score(y_test, y_predict))
print('训练集模型的准确率为：\n', metrics.accuracy_score(y_train, estimator.predict(x_train)))
print('综合准确率为：\n', metrics.accuracy_score(y,estimator.predict(x)))


In [None]:
new = pd.read_csv('已经判断是否违约.csv',encoding='gbk')
new

In [None]:
new_x = new.iloc[:,1:-1].values
new_x = tranfer.transform(new_x)

In [None]:
predict_y = estimator.predict(new_x)

In [None]:
new['信誉评级'] = predict_y
new

In [None]:
new.to_csv('第二问最终分类结果.csv',encoding='gbk')

In [None]:
new['企业代号'].values

In [8]:
input1 = pd.read_excel('./excel/附件2：302家无信贷记录企业的相关数据.xlsx',sheet_name='销项发票信息')
output1 = pd.read_excel('./excel/附件2：302家无信贷记录企业的相关数据.xlsx',sheet_name='进项发票信息')

In [10]:
in_20 = input1[input1['开票日期']>'2020']
out_20 = output1[output1['开票日期']>'2020']
in_19 = input1[(input1['开票日期'] > '2019') & (input1['开票日期'] < '2020')]
out_19 = output1[(output1['开票日期'] > '2019') & (output1['开票日期'] < '2020')]


In [11]:
C_i_20 = in_20['企业代号'].unique()       # 本身企业的数量
C_o_20 = out_20['企业代号'].unique()
C_i_19 = in_19['企业代号'].unique()
C_o_19 = out_19['企业代号'].unique()
print('C_o_20',len(C_o_20))
print('C_o_19',len(C_o_19))
all1 = new['企业代号'].values

C_o_20 131
C_o_19 300


NameError: name 'new' is not defined

In [12]:
jin_2020=[]          
for i in range(len(all1)):
    if all1[i] in C_i_20:
        tem = len(in_20[in_20['企业代号']==all1[i]]['购方单位代号'].unique())
    else:
        tem = 0
    
    jin_2020.append(tem)
    
    
jin_2019=[]          
for i in range(len(all1)):
    if all1[i] in C_i_19:
        tem = len(in_19[in_19['企业代号']==all1[i]]['购方单位代号'].unique())
    else:
        tem = 0
    
    jin_2019.append(tem)

    
xiao_2020=[]
for i in range(len(all1)):
    if all1[i] in C_o_20:
        tem = len(out_20[out_20['企业代号']==all1[i]]['销方单位代号'].unique())
    else:
        tem = 0
    
    xiao_2020.append(tem)
    
xiao_2019=[]
for i in range(len(all1)):
    if all1[i] in C_o_19:
        tem = len(out_19[out_19['企业代号']==all1[i]]['销方单位代号'].unique())
    else:
        tem = 0
    
    xiao_2019.append(tem)

NameError: name 'all1' is not defined

In [None]:
da2 = {'2020年进项交易企业数':jin_2020,
      '2020年销项交易企业数':xiao_2020,
      '2019年进项交易企业数':jin_2019,
      '2019年销项交易企业数':xiao_2019}
data2 = pd.DataFrame(data=da2)
data2

In [None]:
data2['进项交易企业变化率'] = data2['2020年进项交易企业数']-data2['2019年进项交易企业数']/data2['2019年进项交易企业数']
data2['销项交易企业变化率'] = data2['2020年销项交易企业数']-data2['2019年销项交易企业数']/data2['2019年销项交易企业数']
data2.loc[205,'销项交易企业变化率'] = 1
data2.loc[225,'进项交易企业变化率'] = 0
data2.loc[258,'销项交易企业变化率'] = 0
data2

In [None]:
data2[data2.isnull().values==True]

In [None]:
data2.to_csv('第三问交易企业变化率.csv',encoding='gbk')

#  聚类结果后特征描述性统计分析

In [None]:
data3 = pd.read_excel('第三问所有聚类特征与结果.xlsx',encoding='gbk')
data3

In [None]:
A = data3[data3['聚类类别']==1]
B = data3[data3['聚类类别']==2]
C = data3[data3['聚类类别']==3]

In [None]:
A.describe().loc[['count','mean','min','max']].to_csv('类别1描述.csv',encoding='gbk')
B.describe().loc[['count','mean','min','max']].to_csv('类别2描述.csv',encoding='gbk')
C.describe().loc[['count','mean','min','max']].to_csv('类别3描述.csv',encoding='gbk')

In [None]:
A.describe().loc[['count','mean','min','max']]

In [None]:
min(A['同比增长速度'])

In [None]:
A

# 画图

In [None]:
data4 = pd.read_excel('第三问所有聚类特征与结果.xlsx',encoding='gbk',sheet_name='Sheet1')
data4 = data4.iloc[0:34,:]

In [None]:
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['KaiTi'] # 指定默认字体
plt.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题
plt.figure(figsize=(7,5),dpi=250)
plt.plot(data4.values,label='聚合系数')
plt.legend()
plt.show()

In [None]:
ddd = pd.read_csv('第二问最终分类结果.csv',encoding='gbk')
ddd

In [None]:
for i in range(len(ddd)):
    if ddd.loc[i,'信誉评级']==1:
        ddd.loc[i,'评级']='A'
    elif ddd.loc[i,'信誉评级']==2:
        ddd.loc[i,'评级']='B'
    elif ddd.loc[i,'信誉评级']==3:
        ddd.loc[i,'评级']='C'
    elif ddd.loc[i,'信誉评级']==4:
        ddd.loc[i,'评级']='D'

In [None]:
ddd.to_csv('有评级.csv',encoding='gbk')