# 0.前言
这个文档主要是用来入门下XGBOOST，主要就是参考的https://blog.csdn.net/qq_24519677/article/details/81869196

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import preprocessing 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation
from sklearn.preprocessing import LabelEncoder
import sklearn
 
import warnings
warnings.filterwarnings('ignore')

# 1.数据特征处理

In [7]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train.info()  # 打印训练数据的信息

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


对数据的缺失值进行处理，这里采用的方法是对连续值用该列的平均值进行填充，非连续值用该列的众数进行填充，还可以使用机器学习的模型对缺失值进行预测，用预测的值来填充缺失值，该方法这里不做介绍：

In [9]:
def handle_na(train, test):  # 将Cabin特征删除
    fare_mean = train['Fare'].mean()  # 测试集的fare特征有缺失值，用训练数据的均值填充
    test.loc[pd.isnull(test.Fare), 'Fare'] = fare_mean
 
    embarked_mode = train['Embarked'].mode()  # 用众数填充
    train.loc[pd.isnull(train.Embarked), 'Embarked'] = embarked_mode[0]
    
    train.loc[pd.isnull(train.Age), 'Age'] = train['Age'].mean()  # 用均值填充年龄
    test.loc[pd.isnull(test.Age), 'Age'] = train['Age'].mean()
    return train, test
 
new_train, new_test = handle_na(train, test)  # 填充缺失值

由于Embarked，Sex，Pclass特征是离散特征，所以对其进行one-hot/get_dummies编码

In [10]:
# 对Embarked和male特征进行one-hot/get_dummies编码
new_train = pd.get_dummies(new_train, columns=['Embarked', 'Sex', 'Pclass'])
new_test = pd.get_dummies(new_test, columns=['Embarked', 'Sex', 'Pclass'])

然后再去除掉PassengerId，Name，Ticket，Cabin, Survived列，这里不使用这些特征做预测

In [11]:
target = new_train['Survived'].values
# 删除PassengerId，Name，Ticket，Cabin, Survived列
df_train = new_train.drop(['PassengerId','Name','Ticket','Cabin','Survived'], axis=1).values
df_test = new_test.drop(['PassengerId','Name','Ticket','Cabin'], axis=1).values

# 2.XGBoost模型
## 2.1使用XGBoost原生版本模型

In [12]:
X_train,X_test,y_train,y_test = train_test_split(df_train,target,test_size = 0.3,random_state = 1) # 将数据划分为训练集和测试集
 
data_train = xgb.DMatrix(X_train, y_train)  # 使用XGBoost的原生版本需要对数据进行转化
data_test = xgb.DMatrix(X_test, y_test)
 
param = {'max_depth': 5, 'eta': 1, 'objective': 'binary:logistic'}
watchlist = [(data_test, 'test'), (data_train, 'train')]
n_round = 3  # 迭代训练3轮
booster = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist)
 
# 计算错误率
y_predicted = booster.predict(data_test)
y = data_test.get_label()
 
accuracy = sum(y == (y_predicted > 0.5))
accuracy_rate = float(accuracy) / len(y_predicted)
print ('样本总数：{0}'.format(len(y_predicted)))
print ('正确数目：{0}'.format(accuracy) )
print ('正确率：{0:.3f}'.format((accuracy_rate)))

[0]	test-error:0.231343	train-error:0.126806
[1]	test-error:0.227612	train-error:0.117175
[2]	test-error:0.223881	train-error:0.104334
样本总数：268
正确数目：208
正确率：0.776


## 2.2XGBoost的sklearn接口版本

In [13]:
X_train,X_test,y_train,y_test = train_test_split(df_train,target,test_size = 0.3,random_state = 1)
 
model = xgb.XGBClassifier(max_depth=3, n_estimators=200, learn_rate=0.01)
model.fit(X_train, y_train)  
test_score = model.score(X_test, y_test)
print('test_score: {0}'.format(test_score))

test_score: 0.7723880597014925


利用xgboost做一次预测。

In [21]:
try_pred = X_test[[0,1],:]
try_pred

array([[48.        ,  0.        ,  0.        , 25.9292    ,  0.        ,
         0.        ,  1.        ,  1.        ,  0.        ,  1.        ,
         0.        ,  0.        ],
       [29.69911765,  0.        ,  0.        ,  7.8958    ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        ]])

In [22]:
try_pred_y = y_test[0:2]
try_pred_y

array([1, 0], dtype=int64)

In [23]:
pred = model.predict(try_pred)
pred

array([1, 0], dtype=int64)

# 3.使用其他模型于XGBoost进行对比

In [17]:
# 应用模型进行预测
model_lr = LogisticRegression()
model_rf = RandomForestClassifier(n_estimators=200)
model_xgb = xgb.XGBClassifier(max_depth=5, n_estimators=200, learn_rate=0.01)
models = [model_lr, model_rf, model_xgb]
model_name = ['LogisticRegression', '随机森林', 'XGBoost']
 
cv =cross_validation.ShuffleSplit(len(df_train), n_iter=3, test_size=0.3, random_state=1)
for i in range(3):
    print(model_name[i] + ":")
    model = models[i]
    for train, test in cv:    
        model.fit(df_train[train], target[train])
        train_score = model.score(df_train[train], target[train])
        test_score = model.score(df_train[test], target[test])
        print('train score: {0:.5f} \t test score: {0:.5f}'.format(train_score, test_score))

LogisticRegression:
train score: 0.81380 	 test score: 0.81380
train score: 0.81862 	 test score: 0.81862
train score: 0.82022 	 test score: 0.82022
随机森林:
train score: 0.98876 	 test score: 0.98876
train score: 0.99037 	 test score: 0.99037
train score: 0.99037 	 test score: 0.99037
XGBoost:
train score: 0.95185 	 test score: 0.95185
train score: 0.96629 	 test score: 0.96629
train score: 0.95345 	 test score: 0.95345
