# 陈天奇的  XGBoost


In [None]:
from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import time
import datetime

from xgboost import XGBClassifier 


## 回归

In [None]:
#xgb实现法
import xgboost as xgb


data = load_boston()
X = data.data
y = data.target

Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.1,random_state=420)

#使用类DMatrix读取数据
dtrain = xgb.DMatrix( Xtrain,Ytrain ) #特征矩阵和标签都进行一个传入
dtest = xgb.DMatrix( Xtest,Ytest )



In [None]:
X.shape

In [None]:

import pandas as pd

pd.DataFrame(Xtrain)


In [None]:
#写明参数
param = {
          'objective':'reg:squarederror'
         ,"eta":0.1}
num_round = 250 #n_estimators

#类train，可以直接导入的参数是训练数据，树的数量，其他参数都需要通过params来导入
bst = xgb.train(param, dtrain, num_round)

#接口predict
preds = bst.predict(dtest)

MSE(Ytest,preds)


## 多分类

In [None]:
def loadData( fileName, n=1000):
        '''
        加载文件

        :param fileName:要加载的文件路径
        :param n: 返回的数据集的规模
        :return: 数据集和标签集
        '''
        # 存放数据及标记
        dataArr = []
        labelArr = []
        # 读取文件
        fr = open(fileName)

        cnt = 0  # 计数器

        # 遍历文件中的每一行
        for line in fr.readlines():


            if cnt == n:
                break

            # 获取当前行，并按“，”切割成字段放入列表中
            # strip：去掉每行字符串首尾指定的字符（默认空格或换行符）
            # split：按照指定的字符将字符串切割成每个字段，返回列表形式
            curLine = line.strip().split(',')
            # 将每行中除标记外的数据放入数据集中（curLine[0]为标记信息）
            # 在放入的同时将原先字符串形式的数据转换为整型
            # 此外将数据进行了二值化处理，大于128的转换成1，小于的转换成0，方便后续计算
            dataArr.append([int(int(num) > 128) for num in curLine[1:]])

            # 将标记信息放入标记集中
            labelArr.append(int(curLine[0]))
            cnt += 1

        fr.close()

        # 返回数据集和标记
        return dataArr, labelArr

    
    


### sklearn 的 xgboost 接口

In [None]:
n_train=60000
n_test=10000

# 获取训练集
trainDataList, trainLabelList = loadData('../Mnist/mnist_train.csv', n=n_train)

print('train data, row num:{} , column num:{} '.format(len(trainDataList), len(trainDataList[0])))

trainDataArr = np.array(trainDataList)
trainLabelArr = np.array(trainLabelList)

# 开始时间
print('start training model....')
start = time.time()


"""

XGBClassifier 测试1:
max_depth=3, n_estimators=20, learning_rate=0.5, 
n_train=60000
n_test=10000
训练时间 : 38 s
准确率: 0.9155

"""


clf = XGBClassifier(
    max_depth=3, #
    learning_rate=0.5, # 学习率 eta 
    n_estimators=20, # 使用多少个弱分类器
    
    eval_metric='mlogloss',
    
    num_class=10,
   
    gamma=0, # 损失函数中 树的总叶子个数T 的系数, 可以控制模型的复杂度
    min_child_weight=1,
    max_delta_step=0,
    subsample=1, # 随机抽样的时候抽取的样本比例, 范围 (0,1]
    colsample_bytree=1,
    reg_alpha=0, # L1 正则化的强度
    reg_lambda=1, # L2 正则化的强度
    use_label_encoder=False
)

clf.fit(trainDataArr, trainLabelArr)


# 结束时间
end = time.time()
print('training cost time :', end - start)

# 获取测试集
testDataList, testLabelList = loadData('../Mnist/mnist_test.csv', n=n_test)

print('test data, row num:{} , column num:{} '.format(len(testDataList), len(testDataList[0])))

testDataArr = np.array(testDataList)
testLabelArr = np.array(testLabelList)

print('test dataset accuracy: {} '.format(clf.score(testDataArr, testLabelArr)))

#### 学习曲线

In [None]:
n_train=6000
n_test=1000

# 获取训练集
trainDataList, trainLabelList = loadData('../Mnist/mnist_train.csv', n=n_train)

trainDataArr = np.array(trainDataList)
trainLabelArr = np.array(trainLabelList)

Xtrain,Ytrain = trainDataArr, trainLabelArr 


# 获取测试集
testDataList, testLabelList = loadData('../Mnist/mnist_test.csv', n=n_test)

print('test data, row num:{} , column num:{} '.format(len(testDataList), len(testDataList[0])))

testDataArr = np.array(testDataList)
testLabelArr = np.array(testLabelList)

Xtest,Ytest = testDataArr, testLabelArr 



In [None]:
def plot_learning_curve(estimator,title, X, y, 
                        ax=None, #选择子图
                        ylim=None, #设置纵坐标的取值范围
                        cv=None, #交叉验证
                        n_jobs=None #设定索要使用的线程
                       ):
    
    from sklearn.model_selection import learning_curve
    import matplotlib.pyplot as plt
    import numpy as np
    
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y
                                                            ,shuffle=True
                                                            ,cv=cv
                                                            ,random_state=420
                                                            ,n_jobs=n_jobs)      
    if ax == None:
        ax = plt.gca()
    else:
        ax = plt.figure()
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.grid() #绘制网格，不是必须
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-'
            , color="r",label="Training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-'
            , color="g",label="Test score")
    ax.legend(loc="best")
    return ax


In [None]:
clf = XGBClassifier(
    max_depth=3, #
    learning_rate=0.5, # 学习率 eta 
    n_estimators=20, # 使用多少个弱分类器
    
    eval_metric='mlogloss',
    
    num_class=10,
   
    gamma=0, # 损失函数中 树的总叶子个数T 的系数, 可以控制模型的复杂度
    min_child_weight=1,
    max_delta_step=0,
    subsample=1, # 随机抽样的时候抽取的样本比例, 范围 (0,1]
    colsample_bytree=1,
    reg_alpha=0, # L1 正则化的强度
    reg_lambda=1, # L2 正则化的强度
    use_label_encoder=False
)

plot_learning_curve(clf
                    ,"XGBoost",Xtrain,Ytrain,ax=None,cv=5)
plt.show()

#### 参数调优 - 交叉验证

In [None]:
%%time 

# 超参数 n_estimators 调优
#=====【TIME WARNING： 6min 】=====#

axisx = range(10,100,10)
rs = []
for i in axisx:
    
    clf = XGBClassifier(
        max_depth=3, #
        learning_rate=0.5, # 学习率 eta 
        n_estimators=i, # 使用多少个弱分类器

        eval_metric='mlogloss',

        num_class=10,

        gamma=0, # 损失函数中 树的总叶子个数T 的系数, 可以控制模型的复杂度
        min_child_weight=1,
        max_delta_step=0,
        subsample=1, # 有放回的随机抽样 的时候抽取的样本比例, 范围 (0,1]
        colsample_bytree=1, # 构造 每棵树 随机抽样出的特征占总特征的比例
        reg_alpha=0, # L1 正则化的强度
        reg_lambda=1, # L2 正则化的强度
        
        use_label_encoder=False
    )
    
    rs.append( CVS( clf , Xtrain, Ytrain, cv=5 , n_jobs=-1).mean() ) #  n_jobs=-1 开启所有的 CPU 核
    
print( axisx[rs.index(max(rs))], max(rs) ) # n_estimators=90   accuracy= 0.932

plt.figure(figsize=(20,5))
plt.plot( axisx,rs,c="red",label="XGBoost" )
plt.legend()
plt.show()

In [None]:
%%time 

# 超参数 learning_rate 调优
#=====【TIME WARNING：12min  】=====#


axisx = np.linspace(0.1,1,10)
rs = []
for i in axisx:
    
    clf = XGBClassifier(
        max_depth=3, #
        learning_rate=i, # 学习率 eta 
        n_estimators=90, # 使用多少个弱分类器

        eval_metric='mlogloss',

        num_class=10,

        gamma=0, # 损失函数中 树的总叶子个数T 的系数, 可以控制模型的复杂度
        min_child_weight=1,
        max_delta_step=0,
        subsample=1, # 有放回的随机抽样 的时候抽取的样本比例, 范围 (0,1]
        colsample_bytree=1, # 构造 每棵树 随机抽样出的特征占总特征的比例
        reg_alpha=0, # L1 正则化的强度
        reg_lambda=1, # L2 正则化的强度
        
        use_label_encoder=False
    )
    
    rs.append( CVS( clf,Xtrain,Ytrain,cv=5 ).mean() ) # 
    
    
print( "best param:{} , score:{}".format( axisx[rs.index(max(rs))], max(rs) )) # learning_rate=0.4 accuracy=0.9335000000000001

plt.figure(figsize=(20,5))
plt.plot( axisx,rs,c="red",label="XGBoost" )
plt.legend()
plt.show()

In [None]:
%%time 

# 超参数 subsample 调优

#=====【TIME WARNING：9 min 】=====#



#首先我们先来定义一个评分函数，这个评分函数能够帮助我们直接打印Xtrain上的交叉验证结果
def clfassess(clf,Xtrain,Ytrain,scoring = ["accuracy"],show=True):
    
    score = []
    for i in range(len(scoring)):
        
        c=CVS (clf,Xtrain,Ytrain,cv=5,scoring=scoring[i]).mean()
        
        if show:
            print("{}:{:.2f}".format(scoring[i] #模型评估指标的名字
                                ,c))
            
        score.append((c).mean())
        
    return score

axisx = np.linspace(0.5,1,5)
rs = []
te = []
for i in axisx:
    
    clf = XGBClassifier(
        
        max_depth=3, #
        learning_rate=0.4, # 学习率 eta 
        n_estimators=90, # 使用多少个弱分类器

        eval_metric='mlogloss',

        num_class=10,

        gamma=0, # 损失函数中 树的总叶子个数T 的系数, 可以控制模型的复杂度
        min_child_weight=1,
        max_delta_step=0,
        
        subsample=i, # 有放回的随机抽样 的时候抽取的样本比例, 范围 (0,1]
        
        colsample_bytree=1, # 构造 每棵树 随机抽样出的特征占总特征的比例
        reg_alpha=0, # L1 正则化的强度
        reg_lambda=1, # L2 正则化的强度
        
        use_label_encoder=False
    )
    
    score = clfassess( clf, Xtrain, Ytrain, scoring = ["accuracy"], show=True)
    
    test = clf.fit( Xtrain,Ytrain ).score( Xtest, Ytest )
    
    rs.append(score[0])
    te.append(test)
    
     
print("best param:{} , score:{}".format(axisx[rs.index(max(rs))],max(rs))) # subsample=0.625 accuracy=0.9338

plt.figure(figsize=(20,5))

plt.plot(axisx,te,c="gray",label="test")
plt.plot(axisx,rs,c="green",label="train")
plt.legend()
plt.show()


In [None]:
# 查看测试集 上的混淆矩阵

from sklearn.metrics import confusion_matrix 

from sklearn.utils.class_weight import compute_sample_weight


y_pred= clf.predict( testDataArr )
y_true=testLabelArr


confusion_matrix(y_true, y_pred) # 

sw = compute_sample_weight(class_weight='balanced',y=y_true)

confusion_matrix(y_true, y_pred, sample_weight=sw)


#### 参数调优 - 网格搜索

In [None]:
#来查看一下sklearn中所有的 模型评估指标
import sklearn
sorted(sklearn.metrics.SCORERS.keys())


ref:
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [None]:
from sklearn.model_selection import GridSearchCV


def print_best_score(gsearch,param_test):
     # 输出best score
    print("Best score: %0.3f" % gsearch.best_score_)
    print("Best parameters set:")
    # 输出最佳的分类器到底使用了怎样的参数
    best_parameters = gsearch.best_estimator_.get_params()
    for param_name in sorted(param_test.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

        


In [None]:

param = {
    'gamma':  [0,2,5],
    'max_depth': range(1,5,1)
}
#网格搜索 是 两个 参数集合的全组合(笛卡尔积), 因此 集合中的元素个数 不宜过多

estimator = XGBClassifier(
        
        max_depth=3, #
        learning_rate=0.4, # 学习率 eta 
        n_estimators=90, # 使用多少个弱分类器

        eval_metric='mlogloss',

        num_class=10,

        gamma=0, # 损失函数中 树的总叶子个数T 的系数, 可以控制模型的复杂度
        
        min_child_weight=1,
        max_delta_step=0,
        
        subsample=0.625, # 有放回的随机抽样, 抽取的样本比例, 范围 (0,1]
        
        colsample_bytree=1, # 构造 每棵树 随机抽样出的特征占总特征的比例
        reg_alpha=0, # L1 正则化的强度
        reg_lambda=1, # L2 正则化的强度
        
        use_label_encoder=False
    )

gsearch = GridSearchCV( estimator , param_grid = param, scoring='accuracy', cv=5 , n_jobs=-1 )

gsearch.fit( Xtrain,Ytrain )


print_best_score(gsearch,param)

# Best score: 0.937
# Best parameters set:
# 	gamma: 0
# 	max_depth: 4


## 二分类

### xgboost 原生接口

In [None]:

def loadData_2classification( fileName, n=1000):
    '''
    加载文件

    将 数据集 的标签 转换为 二分类的标签

    :param fileName:要加载的文件路径
    :param n: 返回的数据集的规模
    :return: 数据集和标签集
    '''
    # 存放数据及标记
    dataArr = []
    labelArr = []
    # 读取文件
    fr = open(fileName)

    cnt = 0  # 计数器

    # 遍历文件中的每一行
    for line in fr.readlines():

        if cnt == n:
            break

        # 获取当前行，并按“，”切割成字段放入列表中
        # strip：去掉每行字符串首尾指定的字符（默认空格或换行符）
        # split：按照指定的字符将字符串切割成每个字段，返回列表形式
        curLine = line.strip().split(',')
        # 将每行中除标记外的数据放入数据集中（curLine[0]为标记信息）
        # 在放入的同时将原先字符串形式的数据转换为整型
        # 此外将数据进行了二值化处理，大于128的转换成1，小于的转换成0，方便后续计算
        dataArr.append([int(int(num) > 128) for num in curLine[1:]])

        # 将标记信息放入标记集中
        # 转换成二分类任务
        # 标签0设置为1，反之为0

        # 显然这会导致 正负 样本的 分布不均衡, 1 的样本很少(10%), 而0 的很多
        if int(curLine[0]) == 0:
            labelArr.append(1)
        else:
            labelArr.append(0)

        # if int(curLine[0]) <= 5:
        #     labelArr.append(1)
        # else:
        #     labelArr.append(0)

        cnt += 1

    fr.close()

    # 返回数据集和标记
    return dataArr, labelArr
    
    



In [None]:
n_train=6000

# 获取训练集
trainDataList, trainLabelList =loadData_2classification('../Mnist/mnist_train.csv', n=n_train)

print('train data, row num:{} , column num:{} '.format(len(trainDataList), len(trainDataList[0])))

trainDataArr = np.array(trainDataList)
trainLabelArr = np.array(trainLabelList)


n_test=1000

# 获取测试集
testDataList, testLabelList = loadData_2classification('../Mnist/mnist_test.csv', n=n_test)

print('test data, row num:{} , column num:{} '.format(len(testDataList), len(testDataList[0])))

testDataArr = np.array(testDataList)
testLabelArr = np.array(testLabelList)


In [None]:

import xgboost as xgb


#使用类DMatrix读取数据
dtrain = xgb.DMatrix( trainDataArr,trainLabelArr ) #特征矩阵和标签都进行一个传入
dtest = xgb.DMatrix( testDataArr,testLabelArr )


In [None]:
# 使用 pandas 查看样本

import pandas as pd

pd.DataFrame(trainDataArr)

In [None]:
# estimator = XGBClassifier(
        
#         max_depth=3, #
#         learning_rate=0.4, # 学习率 eta 
#         n_estimators=90, # 使用多少个弱分类器
#         eval_metric='mlogloss',
#         num_class=10,
#         gamma=0, # 损失函数中 树的总叶子个数T 的系数, 可以控制模型的复杂度
#         min_child_weight=1,
#         max_delta_step=0,
#         subsample=0.625, # 有放回的随机抽样, 抽取的样本比例, 范围 (0,1]
#         colsample_bytree=1, # 构造 每棵树 随机抽样出的特征占总特征的比例
#         reg_alpha=0, # L1 正则化的强度
#         reg_lambda=1, # L2 正则化的强度
#         use_label_encoder=False
#     )


# param= {'silent':True,'objective':'binary:logistic',"eta":0.4}

param= {'eval_metric':'logloss',"eta":0.4,}


num_round = 90 #n_estimators

#类train，可以直接导入的参数是训练数据，树的数量，其他参数都需要通过params来导入
bst = xgb.train( param, dtrain, num_round )


In [None]:
from sklearn.metrics import confusion_matrix 
from sklearn.utils.class_weight import compute_sample_weight

from sklearn.metrics import accuracy_score

from sklearn.metrics import precision_score

from sklearn.metrics import recall_score

from sklearn.metrics import f1_score


y_pred =( bst.predict(dtest) > 0.5 ).astype(int) #  predict() 返回的是概率  

y_true= testLabelArr

# 1.正确率
print('test dataset accuracy: {} '.format(accuracy_score(y_true, y_pred)))

print('====================')

#### 样本不均衡问题



In [None]:

print( '0 负样本所占的比例: {} '.format( len(trainLabelArr[trainLabelArr==0])/len(trainLabelArr) ))  



In [None]:
from sklearn.metrics import confusion_matrix as cm, accuracy_score as accuracy ,recall_score as recall, roc_auc_score as auc


#写明参数
scale_pos_weight = [ 0.5 , 1 , 5 , 9 ,10]
names = [
    
         "negative vs positive: 0.5 ",
         "negative vs positive: 1",
         "negative vs positive: 5",
         "negative vs positive: 9",
         "negative vs positive: 10"
        
        ]


[*zip(names,scale_pos_weight)]


for name,i in zip(names,scale_pos_weight):
    
    param= { 'eval_metric':'logloss',"eta":0.4,"scale_pos_weight":i } # scale_pos_weight = 负样本 / 正样本
    
    num_round = 40
    
    clf = xgb.train(param, dtrain, num_round)
    
    preds = clf.predict(dtest)
    
    ypred = preds.copy()
    ypred[preds > 0.5] = 1
    ypred[ypred != 1] = 0
    
    print(name)
    
    print("\tAccuracy:{}".format(accuracy(testLabelArr,ypred)))
    print("\tRecall:{}".format(recall(testLabelArr,ypred)))
    print("\tAUC:{}".format(auc(testLabelArr,preds)))


#### 交叉验证

In [None]:
#设定参数
param1 = { 'eval_metric':'logloss',"eta":0.4,"scale_pos_weight":9 , "gamma":0 }
param2 = { 'eval_metric':'logloss',"eta":0.4,"scale_pos_weight":9 , "gamma":5 }

num_round = 40
n_fold=5 # sklearn - KFold


cvresult1 = xgb.cv(param1, dtrain, num_round ,n_fold ,  metrics='auc')


cvresult2 = xgb.cv(param2, dtrain, num_round ,n_fold ,  metrics='auc')

plt.figure(figsize=(20,5))
plt.grid()

plt.plot(range(1,41),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,41),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.plot(range(1,41),cvresult2.iloc[:,0],c="green",label="train,gamma=5")
plt.plot(range(1,41),cvresult2.iloc[:,2],c="blue",label="test,gamma=5")

plt.legend()
plt.show()


In [None]:
#看看类xgb.cv生成了什么结果？

cvresult1 


### Higgs 数据集


下载数据集


原始 Higgs 数据集
ref:
https://archive.ics.uci.edu/ml/datasets/HIGGS

总记录数
11000000


论文中使用的大小
Higgs 10M( million = 百万) dataset


kaggle 竞赛数据集 
ref:https://www.kaggle.com/c/higgs-boson/data

总记录数
550000


In [None]:
import pandas as pd

Higgs_dataset_path= '../dataset/higgs/kaggle'


# 取前10 行 看看长啥样子
data = pd.read_csv(Higgs_dataset_path+'/training.csv',skiprows=0,nrows =10 )

data.shape
data # 


In [None]:
# load in training data, directly use numpy
dtrain = np.loadtxt(Higgs_dataset_path+'/training.csv' , delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } )

# converters 对最后一列进行转换

print ('finish loading from csv ')

In [None]:
dtrain.shape

In [None]:
label  = dtrain[:,32]
data   = dtrain[:,1:31]

test_size = 550000

# rescale weight to make it same as test set
weight = dtrain[:,31] * float(test_size) / len(label)

sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0  )
sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0  )

# print weight statistics
print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))

# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )

# setup parameters for xgboost
param = {}
# use logistic regression loss, use raw prediction before logistic transformation
# since we only need the rank
param['objective'] = 'binary:logitraw'
# scale weight of positive examples
param['scale_pos_weight'] = sum_wneg/sum_wpos
param['eta'] = 0.1
param['max_depth'] = 6
param['eval_metric'] = 'auc'
param['nthread'] = 16

# you can directly throw param in, though we want to watch multiple metrics here
plst = list(param.items())+[('eval_metric', 'ams@0.15')] # 

watchlist = [ (xgmat,'train') ]
# boost 120 trees
num_round = 120
print ('loading data end, start to boost trees')
bst = xgb.train( plst, xgmat, num_round, watchlist );
# save out model
bst.save_model('higgs.model')

print ('finish training')

**watchlist 使用** 

作用: 在训练的时候 查看模型的训练效果

划分20%为验证集 (dval)，准备一个watchlist 给train和validation set ,这样我们能发现每一个round 的验证集预测结果，如果在某一个round后 validation set 的预测误差上升了，你就可以停止掉正在运行的程序了( early stop )。

训练效果的 评价指标 通过参数 'eval_metric' 控制

eg.

In [None]:
param['eval_metric'] = 'auc'

watchlist = [(dtrain,'train'),(dval,'val')]

model = xgb.train(params,dtrain,num_boost_round=100,evals = watchlist)

In [None]:
bst = xgb.Booster({'nthread': 4})  # init model
bst.load_model('higgs.model')

In [None]:
# 取前10 行 看看长啥样子
data = pd.read_csv(Higgs_dataset_path+'/test.csv',skiprows=0,nrows =10 )

data.shape # 发现测试数据集 没有标签列, 模型预测完测试集后提交到 kaggle 平台验证
data # 

In [None]:
### load data in do training
train = np.loadtxt(Higgs_dataset_path+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } )

label  = train[:,32]
data   = train[:,1:31]
weight = train[:,31]
dtrain = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
param = {'max_depth':6, 'eta':0.1, 'objective':'binary:logitraw', 'nthread':4}
num_round = 120

print ('running cross validation, with preprocessing function')
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc.
# as a example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label==1)
    param['scale_pos_weight'] = ratio
    wtrain = dtrain.get_weight()
    wtest = dtest.get_weight()
    sum_weight = sum(wtrain) + sum(wtest)
    wtrain *= sum_weight / sum(wtrain)
    wtest *= sum_weight / sum(wtest)
    dtrain.set_weight(wtrain)
    dtest.set_weight(wtest)
    return (dtrain, dtest, param)

# do cross validation, for each fold
# the dtrain, dtest, param will be passed into fpreproc
# then the return value of fpreproc will be used to generate
# results of that fold

xgb.cv(param, dtrain, num_round, nfold=5,metrics={'ams@0.15', 'auc'}, seed = 0, fpreproc = fpreproc)



### 近似算法

In [None]:
#设定参数
param1 = { 'objective':'binary:logistic',"eta":0.1,"max_depth":6 , "nthread":16}
param2 = { 'objective':'binary:logistic',"eta":0.1,"max_depth":6 , "nthread":16, "tree_method": 'approx', "sketch_eps":0.3}

num_round = 90
n_fold=5 # sklearn - KFold

In [None]:
%%time
cvresult1 = xgb.cv(param1, dtrain, num_round ,n_fold ,  metrics='auc')

# 2min 18s

In [None]:
%%time
cvresult2 = xgb.cv(param2, dtrain, num_round ,n_fold ,  metrics='auc')

# 1min 25s

In [None]:
plt.figure(figsize=(20,5))

plt.grid()

plt.plot(range(1,91),cvresult1.iloc[:,2],c="orange",label="test,exact greedy")
plt.plot(range(1,91),cvresult2.iloc[:,2],c="blue",label="test,global eps=0.3")

plt.legend()
plt.show()

### 划分数据集的技巧

In [None]:
from sklearn.model_selection import KFold, cross_val_score , train_test_split 


data = np.loadtxt(Higgs_dataset_path+'/training.csv' , delimiter=',', skiprows=1,max_rows=10000, converters={32: lambda x:int(x=='s'.encode('utf-8')) } )

# max_rows 设置读取的行数
# converters 对最后一列进行转换

X  = data[:,1:31]
y  = data[:,32]



In [None]:
np.shape(X)

**划分数据集**

划分数据集为 训练集 验证集 和测试集

In [None]:
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val  = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

dtrain = xgb.DMatrix( X_train,label=y_train)
dval = xgb.DMatrix( X_val,label=y_val)
dtest = xgb.DMatrix( X_test,label=y_test)


In [None]:
param1 = { 'objective':'binary:logistic',"eta":0.1,"max_depth":3 , "nthread":16}

watchlist = [(dtrain,'train'),(dval,'val')]

# watchlist = [(dtrain,'train')]

param1['eval_metric'] = 'auc'

num_round = 120

bst = xgb.train( param1 ,dtrain  ,num_round, evals =watchlist )



In [None]:
preds=bst.predict(dtest)

ypred = preds.copy()
ypred[preds > 0.5] = 1
ypred[ypred != 1] = 0

print("\tAccuracy:{}".format(accuracy(y_test,ypred)))
print("\tAUC:{}".format(auc(y_test,preds)))

**划分数据集**

分层抽样

ref:https://blog.csdn.net/haoji007/article/details/106165488

In [None]:

len(y[y ==1]) / len(y) # 原数据集的 正样本比例

len(y_train[y_train ==1]) / len(y_train)# 训练数据集的 正样本比例


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
print(split )       
 
for train_index, test_index in split.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
 


In [None]:
len(y[y ==1]) / len(y) # 原数据集的 正样本比例

len(y_train[y_train ==1]) / len(y_train)# 训练数据集的 正样本比例


### 稀疏感知

详见 Kaggle_Allstate_Claim_Prediction_Challenge.ipynb


## 排序模型


MQ2008 数据集 描述

每一行是一个查询文档对。第一列是 这个文档对的相关性的标签，第二列是查询id，下面的列是特性，行结尾是关于 文档对的注释，包括文档id。相关性标签越大，查询文档对越相关。查询文档对由46维的特征向量表示。以下是MQ2007数据集中的几个示例行:

=================================

2 qid:10032 1:0.056537 2:0.000000 3:0.666667 4:1.000000 5:0.067138 … 45:0.000000 46:0.076923 #docid = GX029-35-5894638 inc = 0.0119881192468859 prob = 0.139842

0 qid:10032 1:0.279152 2:0.000000 3:0.000000 4:0.000000 5:0.279152 … 45:0.250000 46:1.000000 #docid = GX030-77-6315042 inc = 1 prob = 0.341364

0 qid:10032 1:0.130742 2:0.000000 3:0.333333 4:0.000000 5:0.134276 … 45:0.750000 46:1.000000 #docid = GX140-98-13566007 inc = 1 prob = 0.0701303

1 qid:10032 1:0.593640 2:1.000000 3:0.000000 4:0.000000 5:0.600707 … 45:0.500000 46:0.000000 #docid = GX256-43-0740276 inc = 0.0136292023050293 prob = 0.400738

=================================


ref:
https://www.microsoft.com/en-us/research/project/letor-learning-rank-information-retrieval





In [10]:
# 数据转换

import sys

def save_data(group_data,output_feature,output_group):
    if len(group_data) == 0:
        return

    output_group.write(str(len(group_data))+"\n")
    for data in group_data:

        # only include nonzero features
        feats = [ p for p in data[2:] if float(p.split(':')[1]) != 0.0 ]
        output_feature.write(data[0] + " " + " ".join(feats) + "\n")


# 传入参数:
# ../../../dataset/MQ2008/Fold1/train.txt
# ../../../dataset/MQ2008/mq2008.train
# ../../../dataset/MQ2008/mq2008.train.group



fi = open('../dataset/MQ2008/Fold1/train.txt')

output_feature = open('../dataset/MQ2008/mq2008.train',"w")
output_group = open('../dataset/MQ2008/mq2008.train.group',"w")

group_data = []
group = ""
for line in fi:
    if not line:
        break
    if "#" in line:
        line = line[:line.index("#")]
    splits = line.strip().split(" ")

    if splits[1] != group:
        save_data(group_data,output_feature,output_group)
        group_data = []

    group = splits[1]
    group_data.append(splits)

save_data(group_data,output_feature,output_group)

fi.close()
output_feature.close()
output_group.close()




对原始 数据集进行处理 生成 特征文件 mq2008.train  和 分组文件 train.group

(1) 特征文件 中第一列为 文档对 query 相关度的打分, 第二列开始为 特征id: 特征值 ,特征值为0 的特征被排除, 以下是几个实例行：

0 1:0.007477 3:1.000000 5:0.007470 11:0.471076 13:1.000000 15:0.477541 16:0.005120
0 1:0.603738 3:1.000000 5:0.603175 13:0.122130 16:0.998377 17:0.375000 18:1.000000
0 1:0.214953 5:0.213819 11:0.401330 15:0.402388 16:0.140868 17:1.000000 18:0.285714 19:0.333333 20:0.141484 
0 3:1.000000 11:0.458053 13:0.495975 15:0.461687 18:0.571429 19:0.833333 21:0.273864 22:0.148498 29:0.387106 

(2) 分组文件 中每一行 代表一个 组, 每一行的数字代表这一组拥有的 样本的个数; (只有同一个 group 中的样本才有排序的意义。对于IR任务来说，不同 query对应不同group。)

以下是几个实例行：
8  当前group 拥有8个样本
8
8
8
8
16


In [1]:
import xgboost as xgb
from xgboost import DMatrix
from sklearn.datasets import load_svmlight_file


dir='../dataset/MQ2008/'

#  This script demonstrate how to do ranking with xgboost.train
x_train, y_train = load_svmlight_file(dir+"mq2008.train") # load_svmlight_file 载入 libsvm 格式的数据, 并将其转换为  CSR matrix
x_valid, y_valid = load_svmlight_file(dir+"mq2008.vali")
x_test, y_test = load_svmlight_file(dir+"mq2008.test")

# libsvm 使用的文件格式如下：
#
#  [label] [index1]:[value1] [index2]:[value2] …
#
# label  目标值，就是说class（属于哪一类），就是你要分类的种类，通常是一些整数。
# index 是有顺序的索引，通常是连续的整数。就是指特征编号，必须按照升序排列
# value 就是特征值，用来train的数据，通常是一堆实数组成。

# from scipy.sparse import csr_matrix
# X = csr_matrix([[0, 0, 1], [2, 3, 0]])
# X
# <2x3 sparse matrix of type '<type 'numpy.int64'>'
#     with 3 stored elements in Compressed Sparse Row format>
# X.toarray()
# array([[0, 0, 1],
#        [2, 3, 0]])
# print(X) #  仅仅是 打印出来的样子 , 并不代表实际的存储格式
#   (0, 2)    1
#   (1, 0)    2
#   (1, 1)    3

# x_train 采用 稀疏编码 (CSR) 进行存储


group_train = []
with open( dir+"mq2008.train.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_train.append(int(line.split("\n")[0]))

group_valid = []
with open( dir+"mq2008.vali.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_valid.append(int(line.split("\n")[0]))

group_test = []
with open(dir+"mq2008.test.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_test.append( int(line.split("\n")[0]) )

train_dmatrix = DMatrix(x_train, y_train)
valid_dmatrix = DMatrix(x_valid, y_valid)
test_dmatrix = DMatrix(x_test)

# DMatrix有set_group方法，调用设置 groupId。
# (groupId 的概念在 rank 中广泛适用，只有同一个 group 中的样本才有排序的意义。对于IR任务来说，不同 query对应不同group。)
# 注意set_group 方法传入的是每个 group 中元素的个数，


train_dmatrix.set_group(group_train)
valid_dmatrix.set_group(group_valid)

params = {'objective': 'rank:ndcg', 'eta': 0.1, 'gamma': 1.0,
          'min_child_weight': 0.1, 'max_depth': 6}

xgb_model = xgb.train(params, train_dmatrix, num_boost_round=4,
                      evals=[(valid_dmatrix, 'validation')])

pred = xgb_model.predict(test_dmatrix) # 输出对 文档对的打分 , 对这些分值进行排序 即可得到最后的 doc list 


[0]	validation-map:0.70906
[1]	validation-map:0.72783
[2]	validation-map:0.72909
[3]	validation-map:0.73380


In [2]:
pred

array([0.78897315, 0.17356825, 0.78815585, ..., 0.3806271 , 0.42701086,
       0.17356825], dtype=float32)

# 我的 xgboost

In [None]:
idx=np.array(range(3)).reshape(-1, 1)
f=np.array([6,5,4]).reshape(-1, 1)

f
idx

f_idx=np.concatenate([f,idx],axis=1)

f_idx

# np.sort(f_idx,axis=0 )
# np.sort(f_idx,axis=1 )

f_idx = f_idx[f_idx[:,0].argsort()] # 按照第0列 对行排序
f_idx

In [None]:
f_idx[:1+1,:]
f_idx[1+1:,:]

In [None]:
f_idx[:2+1,:]
f_idx[2+1:,:]

In [None]:
left=f_idx[:1+1,:]
right=f_idx[1+1:,:]

index_left=left[:,1]
index_left

In [None]:
block_k=np.array([[10, 0],
                  [11, 1],
                  [12, 2]])

# block_k[:,1]==[1,2]

# block_k[:,1]==[0,1,2]

# block_k[ block_k[:,1]==[0,1,2] , : ]


In [None]:
condition= np.array([ True if sample_id in set(index_left) else False for sample_id in block_k[:,1]  ] )

condition

In [None]:
block_k[ condition , : ]

In [None]:
condition_not= ~condition
block_k[ condition_not , : ]

In [4]:
import numpy as np 


(1-1/np.log2(10))*0.5

np.ln(10)

0.3494850021680094

AttributeError: module 'numpy' has no attribute 'ln'

In [2]:
a=0

~a

-1

In [8]:

class GradeStats:
    """
    梯度 统计信息

    """

    def __init__(self):

        # 一阶梯度的和
        self.sum_grad=0

        # 二阶梯度的和
        self.sum_hess=0

# GradeStats_list = [GradeStats()]*10
        
GradeStats_list = [GradeStats() for i in range(10)]

for i in range(10):
    
    GradeStats_list[i].sum_grad=i

for i in range(10):
    
    print(GradeStats_list[i].sum_grad)

0
1
2
3
4
5
6
7
8
9


In [9]:
id(GradeStats_list)

1799015944768