In [1]:
import pandas as pd
import numpy as np

import toad
import xgboost as xgb
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score

In [None]:
import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Raise the display limits so long / wide frames are not truncated in the output.
# The fully-qualified option names are required: set_option treats the key as a
# regex pattern, and a short pattern such as 'max_row' is ambiguous (OptionError)
# once more than one registered option contains it.
pd.set_option('display.max_rows', 350)
pd.set_option('display.max_columns', 200)

# Show several results from one cell: with 'all', every top-level expression
# in a cell is displayed, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
# Model result reports
# Equal-frequency report
def report_train(pred, y, k=10):
    """Bin training scores into (up to) k quantile bins and tabulate labels per bin.

    pred: predicted scores
    y: true labels; the table expects the label values 0 and 1
    k: number of quantile bins requested (duplicate edges are dropped)

    Returns (table, cut): the per-bin table sorted from the highest score bin
    to the lowest, with columns 0, 1, 'total', '1_%' and 'KS', and the bin
    edges produced by pd.qcut.
    """
    binned, cut = pd.qcut(pred, q=k, retbins=True, duplicates='drop')
    table = pd.crosstab(binned, y).sort_index(ascending=False)

    negatives, positives = table[0], table[1]
    table['total'] = negatives + positives
    table['1_%'] = positives / table['total']
    # gap between the cumulative shares of 1s and 0s, accumulated from the top bin down
    cum_pos_share = positives.cumsum() / positives.sum()
    cum_neg_share = negatives.cumsum() / negatives.sum()
    table['KS'] = (cum_pos_share - cum_neg_share).abs()
    return table, cut

def report_test(pred, y, cut):
    """Tabulate labels per score bin using bin edges taken from another sample.

    pred: predicted scores
    y: true labels; the table expects the label values 0 and 1
    cut: bin edges (e.g. the edges returned by report_train); not modified

    When `cut` is a 1-D sequence of numeric edges, the first and last edges
    are replaced by -inf / +inf so that every score lands in a bin. Without
    this, scores above the last edge, below the first edge, or exactly equal
    to the first edge (pd.cut bins are left-open) are silently dropped and
    the counts and KS describe only part of the sample. Any other kind of
    `cut` (an integer bin count, an IntervalIndex) is passed to pd.cut as is.

    Returns the per-bin table sorted from the highest score bin to the
    lowest, with columns 0, 1, 'total', '1_%' and 'KS'.
    """
    edges = cut
    if not isinstance(cut, pd.IntervalIndex) and np.ndim(cut) == 1 and len(cut) >= 2:
        edges = np.array(cut, dtype=float)  # copy, so the caller's edges stay intact
        edges[0], edges[-1] = -np.inf, np.inf

    df = pd.crosstab(pd.cut(pred, edges), y).sort_index(ascending=False)
    df['total'] = df[0] + df[1]
    df['1_%'] = df[1] / df['total']
    df['KS'] = abs(df[1].cumsum() / df[1].sum() - df[0].cumsum() / df[0].sum())
    return df

# Equal-width report
def report_dis(pred, y, cut):
    """Tabulate labels per score bin for caller-supplied bins.

    pred: predicted scores
    y: true labels; the table expects the label values 0 and 1
    cut: bins forwarded unchanged to pd.cut

    Returns the per-bin table sorted from the highest score bin to the
    lowest, with columns 0, 1, 'total' and '1_%'.
    """
    binned = pd.cut(pred, cut)
    table = pd.crosstab(binned, y).sort_index(ascending=False)
    negatives, positives = table[0], table[1]
    table['total'] = negatives + positives
    table['1_%'] = positives / table['total']
    return table



def binary_metric(target, pred, threshold=0.5):
    '''
    ---Evaluation summary for a binary classification model---
    target: true labels of the samples
    pred: predicted probabilities of the samples
    threshold: probability cut-off; pred >= threshold is classified as 1

    Prints the confusion matrix followed by accuracy, recall, precision,
    F1, AUC, KS, lift and average precision. Nothing is returned.
    '''
    hard_pred = (pred >= threshold) * 1

    # threshold-dependent scores are computed on the hard 0/1 predictions
    accuracy = accuracy_score(target, hard_pred)
    precision = precision_score(target, hard_pred)
    recall = recall_score(target, hard_pred)
    f1_measure = f1_score(target, hard_pred)
    cm = confusion_matrix(target, hard_pred)

    # ranking scores are computed on the raw probabilities
    fpr, tpr, _ = roc_curve(target, pred, pos_label=1)
    auc_value = roc_auc_score(target, pred)
    ks_value = max(abs(tpr - fpr))

    tp, fp, fn, tn = cm[1, 1], cm[0, 1], cm[1, 0], cm[0, 0]
    # precision of the predicted positives relative to the overall positive rate
    lift = (tp / (tp + fp)) / ((tp + fn) / (tp + fp + fn + tn))
    avg_precision = average_precision_score(target, pred)

    lines = [
        "------------------------- ",
        "confusion matrix:",
        "------------------------- ",
        "| TP: %5d | FP: %5d |" % (tp, fp),
        "----------------------- ",
        "| FN: %5d | TN: %5d |" % (fn, tn),
        " ------------------------- ",
    ]
    scores = (
        ("Accuracy:       %.2f%%", accuracy),
        ("Recall:         %.2f%%", recall),
        ("Precision:      %.2f%%", precision),
        ("F1-measure:     %.2f%%", f1_measure),
        ("AUC:            %.2f%%", auc_value),
        ("KS:             %.2f%%", ks_value),
        ("lift:           %.2f%%", lift),
        ("MAP:            %.2f%%", avg_precision),
    )
    lines.extend(template % (value * 100) for template, value in scores)
    lines.append("------------------------- ")

    for line in lines:
        print(line)

In [None]:
# Load the raw data; the CSV file is decoded as GBK.
df_220412 = pd.read_csv(r'数据文件.csv', encoding='gbk')

In [None]:
# Drop fields that carry no meaning for modelling
df_220412 = df_220412.drop(columns=['xx字段', 'xx_字段'])

In [None]:
# Drop columns whose missing rate is >= 0.9
missing_rate = df_220412.isnull().mean()
high_missing_cols = list(missing_rate[missing_rate >= 0.9].index)
df_220412 = df_220412.drop(columns=high_missing_cols)

In [None]:
# Find columns dominated by a single value
def qu_tyz(data, var_not, a=0.9):
    """Return the columns in which one value accounts for more than `a` of all rows.

    data: DataFrame to inspect
    var_not: column names that must never be reported (e.g. the label)
    a: share threshold; a column is reported when
       (count of its most frequent non-null value) / (number of rows) > a

    Columns are returned in the order they appear in `data`, so the result is
    reproducible between runs. A column with no non-null values has no
    dominant value and is not reported; an empty frame yields an empty list.
    """
    n_rows = data.shape[0]
    if n_rows == 0:
        return []

    excluded = set(var_not)
    var_drop = []
    # dict.fromkeys de-duplicates column labels while keeping their order
    for col in dict.fromkeys(data.columns):
        if col in excluded:
            continue
        counts = data[col].value_counts()  # NaN is not counted
        if counts.empty:
            continue
        if counts.max() / n_rows > a:
            var_drop.append(col)
    return var_drop

# List the near-constant columns (the label is exempt), display them, then drop them
var_drop = qu_tyz(df_220412, var_not=['lable'], a=0.9)
var_drop
df_220412 = df_220412.drop(columns=var_drop)

In [None]:
# Encode categorical variables with the bad rate of each category
def char_encoding(df, vars, target):
    '''
    ---Replace each category by the bad rate observed for that category---
    df: source data; it is NOT modified, an encoded copy is returned
    vars: list of categorical columns to convert
    target: name of the good/bad label column

    For every column in `vars`, the rate of a category is
    sum(target) / count(target) over the rows of that category. Rows whose
    category is missing stay missing after encoding.

    Returns a new DataFrame with the columns in `vars` replaced by rates.
    '''
    # Work on a copy so the caller's frame keeps its original categories
    encoded = df.copy()
    for var in vars:
        char_info = encoded.groupby([var])[target].agg(['sum', 'count'])
        char_info['rate'] = char_info['sum'] / char_info['count']
        encoded[var] = encoded[var].map(char_info['rate'])
    return encoded

# Columns to encode. The single entry looks like a placeholder for the real
# list of categorical column names -- TODO confirm before running.
# Note: the module-level name `vars` shadows Python's built-in vars().
vars = ['类别字段名列表']

# NOTE(review): the bad rates are computed on the full dataset, i.e. before the
# train/test split further down, so labels of rows that later land in the test
# set take part in the encoding (target leakage). Consider computing the rates
# on the training rows only.
df_encode = char_encoding(df_220412, vars=vars, target='lable')

In [None]:
# Stratified 70/30 train/test split on the label
train, test = train_test_split(
    df_encode,
    test_size=0.3,
    stratify=df_encode['lable'],
    random_state=1234,
)
# Features are every column except the label; the labels are kept as Series
x_train, y_train = train.drop(columns='lable'), train['lable']
x_test, y_test = test.drop(columns='lable'), test['lable']

In [2]:
# XGBoost training parameters.
# Two keys were misspelled ("boostrt", "colsampel_bytree"); misspelled keys are
# not recognised as parameters, so the feature sub-sampling was never applied.
# "silent" has been superseded by "verbosity" (0 = silent).
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "gamma": 0.4,              # minimum loss reduction needed to split a node
    "min_child_weight": 0.1,   # minimum sum of instance weights in a leaf
    "alpha": 1,                # L1 regularisation weight
    "lambda": 4,               # L2 regularisation weight
    "subsample": 0.7,          # share of rows sampled per tree
    "colsample_bytree": 0.7,   # share of features sampled per tree
    "scale_pos_weight": 8,     # weight of the positive class relative to the negative class
    "verbosity": 0,
    "eta": 0.1,                # learning rate
    "seed": 1000,
    "nthread": -1,
    "eval_metric": "auc",
    "max_depth": 3,            # maximum tree depth
}

In [None]:
# Wrap the splits in DMatrix containers. "val" is built from the test split;
# xgb_test holds the same features without labels.
xgb_train = xgb.DMatrix(x_train, label=y_train)
xgb_val = xgb.DMatrix(x_test, label=y_test)
xgb_test = xgb.DMatrix(x_test)

plst = list(params.items())
num_rounds = 200
watchlist = [(xgb_train, "train"), (xgb_val, "val")]

# NOTE(review): "val" is the last entry of the watchlist and is built from
# x_test / y_test, so early stopping is tuned on the same rows that are later
# reported as test performance. A separate validation split would give an
# unbiased test estimate.
model = xgb.train(
    params=plst,
    dtrain=xgb_train,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=10,
)


In [None]:
# Feature importance (gain), most important feature first
importance_ = model.get_score(importance_type='gain')
importance_df = pd.DataFrame(
    {'importance': list(importance_.values())},
    index=list(importance_.keys()),
)
importance_df.sort_values(by='importance', ascending=False)

In [None]:
# Score both splits and show the binned reports
pred_train = model.predict(xgb_train)
pred_test = model.predict(xgb_val)

# y_train / y_test are the label Series themselves (train['lable'] / test['lable']),
# so they are passed directly; y_train['lable'] would be a row-label lookup on
# the Series and raise KeyError.
train_report, cut = report_train(pred_train, y_train, k=10)
test_report = report_test(pred_test, y_test, cut)
train_report
test_report

In [None]:
# Threshold-based metrics for both splits. y_train / y_test are already the
# label Series, so they are passed directly (y_train['lable'] raises KeyError).
# NOTE(review): the 0.115 cut-off is not derived anywhere in this notebook --
# TODO document how it was chosen.
binary_metric(y_train, pred_train, threshold=0.115)
binary_metric(y_test, pred_test, threshold=0.115)