In [68]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer,LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [69]:
# Load the data: every CSV is read with its 'index' column as the DataFrame index.
train, test, submit = (
    pd.read_csv(csv_path, index_col='index')
    for csv_path in (
        'D:/Anconda/data/data/train.csv',
        'D:/Anconda/data/data/test.csv',
        'D:/Anconda/data/data/submit_sample.csv',
    )
)

In [70]:
# Data cleaning: tree models generally do not need one-hot encoding, and xgboost
# can handle missing values itself, so the non-numeric columns only have to be
# converted to numeric codes.
class RobustLabelEncoder(BaseEstimator, TransformerMixin):
    """Label encoder that tolerates values not seen during ``fit``.

    Each distinct fitted value gets an integer code in order of first
    appearance (0, 1, 2, ...). One extra code, the last one, is reserved for
    ``unseen_tag``; every value that was not present at fit time is mapped to it.

    Attributes:
        classes_: fitted values in code order, followed by ``unseen_tag``.
        set_classes_: set of the fitted values (``None`` until ``fit`` is called).
        unseen_tag: placeholder label used for unknown values.
    """

    def __init__(self):
        self.classes_ = []
        self.set_classes_ = None
        self.unseen_tag = 'unseen'

    def fit(self, list_, y=None):
        """Learn the value -> code mapping from an iterable of hashable values.

        Args:
            list_: iterable of hashable values to encode.
            y: ignored; accepted so the encoder can also be called as
                ``fit(X, y)``.

        Returns:
            self
        """
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # codes are reproducible. Iterating a plain set of strings can yield a
        # different order (and therefore different codes) in every interpreter
        # run because of string hash randomization.
        ordered = list(dict.fromkeys(list_))
        self.set_classes_ = set(ordered)
        self.classes_ = ordered + [self.unseen_tag]
        return self

    def transform(self, list_):
        """Encode values as a list of integer codes.

        Values that were not seen by ``fit`` receive the code of ``unseen_tag``.
        """
        known = self.set_classes_
        fallback = self.unseen_tag
        code_of = {cls: idx for idx, cls in enumerate(self.classes_)}
        return [code_of[obj if obj in known else fallback] for obj in list_]
def clean_nul(data):
    """Replace a missing scalar value with the placeholder category ``'cd'``.

    Intended for element-wise use (e.g. ``Series.apply``). Missing means
    ``None`` or anything ``pd.isna`` reports as missing, such as the float NaN
    that pandas stores for empty CSV cells. A plain ``== None`` comparison
    never matches NaN, so it would leave those cells untouched.

    Args:
        data: a single cell value.

    Returns:
        ``'cd'`` when ``data`` is missing, otherwise ``data`` unchanged.
    """
    # Restrict pd.isna to scalars: on array-likes it returns an array, whose
    # truth value is ambiguous.
    if data is None or (pd.api.types.is_scalar(data) and pd.isna(data)):
        return 'cd'
    return data
# Fill missing 'c' values with the placeholder category before encoding.
train['c'] = train['c'].apply(clean_nul)
test['c'] = test['c'].apply(clean_nul)

# Fit the encoders on the categories of BOTH frames. A left merge of train with
# test joins on their shared columns and only returns train's rows, so it never
# exposes test-only categories to the encoders; stacking the two frames does.
df_train = pd.concat([train[['c', 'd']], test[['c', 'd']]], ignore_index=True)
rle_c = RobustLabelEncoder().fit(df_train['c'])
rle_d = RobustLabelEncoder().fit(df_train['d'])

# Replace the categorical columns with their integer codes.
train['c'] = rle_c.transform(train['c'])
train['d'] = rle_d.transform(train['d'])
test['c'] = rle_c.transform(test['c'])
test['d'] = rle_d.transform(test['d'])

# Separate the target from the features; 'level_0' is not used as a feature,
# but its test values are kept to label the rows of the submission.
Y_train = np.array(train['label'])
y_mean = np.mean(Y_train)
train = train.drop(['level_0', 'label'], axis=1)
test_index = test['level_0']
test = test.drop(['level_0'], axis=1)

In [71]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0_level_0,a,b,c,d
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1024,1.092965,99.0,9,5
1025,1.086392,97.0,6,4
1026,2.58264,35.0,9,5
1027,1.044559,18.0,3,1
1028,2.267531,56.0,2,3


In [72]:
# Modeling: choose the number of boosting rounds with 5-fold cross-validation
# (with early stopping), then train on the full training set and score the test set.
xgb_params = {
    'eta': 0.007,
    'max_depth': 5,
    'subsample': 0.60,
    'objective': 'reg:logistic',
    'eval_metric': 'error',
    'lambda': 5.0,
    'alpha': 0.65,
    'colsample_bytree': 0.5,
    # 'base_score': y_mean,  (left disabled)
    'silent': 1,
}
dtrain = xgb.DMatrix(train, Y_train)
dtest = xgb.DMatrix(test)

# Both round budgets scale inversely with the learning rate.
early_stopping_rounds = round(1.5 / xgb_params['eta'])
num_boost_rounds = round(20 / xgb_params['eta'])

# Cross-validation; the number of rows in the result becomes the round count
# used for the final model.
cv_result = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=num_boost_rounds,
    nfold=5,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=10,
    show_stdv=False,
)
num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# Train the final model and predict the test set.
model = xgb.train({**xgb_params, 'silent': 1}, dtrain, num_boost_round=num_boost_rounds)
pred = model.predict(dtest)

# Keep the scores as strings rounded to four decimals.
y_pred = np.array([str(round(score, 4)) for score in pred])

    
#

[0]	train-error:0.343414	test-error:0.413658
[10]	train-error:0.190975	test-error:0.284878
[20]	train-error:0.162439	test-error:0.276098
[30]	train-error:0.146585	test-error:0.246829
[40]	train-error:0.149512	test-error:0.241951
[50]	train-error:0.140244	test-error:0.230244
[60]	train-error:0.139512	test-error:0.232195
[70]	train-error:0.135122	test-error:0.22439
[80]	train-error:0.126585	test-error:0.223415
[90]	train-error:0.127317	test-error:0.227317
[100]	train-error:0.12561	test-error:0.216585
[110]	train-error:0.122195	test-error:0.217561
[120]	train-error:0.116585	test-error:0.213658
[130]	train-error:0.117317	test-error:0.209756
[140]	train-error:0.117317	test-error:0.21561
[150]	train-error:0.114878	test-error:0.208781
[160]	train-error:0.110488	test-error:0.203902
[170]	train-error:0.108049	test-error:0.203902
[180]	train-error:0.105854	test-error:0.199024
[190]	train-error:0.10561	test-error:0.193171
[200]	train-error:0.105366	test-error:0.197073
[210]	train-error:0.105122	t

[1720]	train-error:0.0341462	test-error:0.111219
[1730]	train-error:0.0341462	test-error:0.110244
[1740]	train-error:0.0341462	test-error:0.108293
[1750]	train-error:0.0339024	test-error:0.108293
[1760]	train-error:0.0331706	test-error:0.108293
[1770]	train-error:0.0336584	test-error:0.109268
[1780]	train-error:0.0334144	test-error:0.109268
[1790]	train-error:0.0331706	test-error:0.109268
[1800]	train-error:0.0331704	test-error:0.108293
[1810]	train-error:0.0329266	test-error:0.108293
[1820]	train-error:0.0329266	test-error:0.109268
[1830]	train-error:0.032195	test-error:0.109268
[1840]	train-error:0.032195	test-error:0.109268
[1850]	train-error:0.0319512	test-error:0.107317
[1860]	train-error:0.0319512	test-error:0.108293
[1870]	train-error:0.0312196	test-error:0.108293
[1880]	train-error:0.0312196	test-error:0.108293
[1890]	train-error:0.0312196	test-error:0.107317
[1900]	train-error:0.0309756	test-error:0.108293
[1910]	train-error:0.0309758	test-error:0.108293
[1920]	train-error:0.0

TypeError: '>' not supported between instances of 'numpy.str_' and 'float'

In [78]:
# Output: threshold the predicted scores at 0.5 and write the submission file.
# y_pred holds the scores as strings, so convert them to float once and compare
# the whole array at once. Writing 0/1 back into the string array would store
# the labels as strings and overwrite the scores.
scores = np.asarray(y_pred).astype(float)
labels = (scores > 0.5).astype(int)
output = pd.DataFrame({'index': test_index, 'label': labels})
output.to_csv('D:/Anconda/data/data/11749009.csv', index=False)