<h1>6.5 LR+GBDT<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#6.5.3-LR+GBDT算法实现" data-toc-modified-id="6.5.3-LR+GBDT算法实现-1">6.5.3 LR+GBDT算法实现</a></span></li></ul></div>

# 6.5.3 LR+GBDT算法实现

In [1]:
#coding:utf8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve,auc   
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingClassifier
import re
import warnings
warnings.simplefilter('ignore')


# Load the data
train_set = pd.read_csv('../data/train.csv')
test_set = pd.read_csv('../data/test.csv')

# Column meanings
# PassengerId  passenger id
# Survived     whether the passenger survived
# Pclass       ticket class
# Name         passenger name
# Sex          passenger sex
# Age          passenger age
# SibSp        number of siblings / spouses
# Parch        number of parents / children
# Ticket       ticket number
# Fare         ticket fare
# Cabin        cabin
# Embarked     port of embarkation

# Stack train and test so that imputation and encoding are applied to both
# with exactly the same rules; they are split apart again at the end.
train_test = pd.concat([train_set, test_set], axis=0)

# ---- Missing-value imputation ------------------------------------------
# Every result is assigned back to the column.  The chained form
# `train_test[col].fillna(..., inplace=True)` mutates a temporary under
# pandas Copy-on-Write and silently leaves `train_test` unchanged.

# NOTE(review): 'S' is presumably the most frequent port in
# train_test['Embarked'].value_counts() -- confirm against the data.
train_test['Embarked'] = train_test['Embarked'].fillna('S')

# Fare is related to Pclass and Embarked.
# NOTE(review): this value was presumably read off
# train_test.groupby(['Pclass', 'Embarked']).Fare.mean() -- confirm which
# (Pclass, Embarked) group it belongs to.
FARE_FILL_VALUE = 14.435422
train_test['Fare'] = train_test['Fare'].fillna(FARE_FILL_VALUE)

train_test['Age'] = train_test['Age'].fillna(train_test['Age'].median())

# ---- Feature engineering -----------------------------------------------
train_test['SibSp_Parch'] = train_test['Parch'] + train_test['SibSp']

# Extract the title from the name: keep what follows the comma, then what
# precedes the first period.  Raw strings avoid the invalid `\.` escape.
train_test['Name_new'] = (
    train_test['Name']
    .str.extract(r'.+,(.+)', expand=False)
    .str.extract(r'^(.+?)\.', expand=False)
    .str.strip()
)

# Collapse the titles into six groups; 'Mr', 'Mrs', 'Miss' and 'Master'
# already carry their group name and are left untouched.
TITLE_GROUPS = {
    'Capt': 'Officer', 'Col': 'Officer', 'Major': 'Officer',
    'Dr': 'Officer', 'Rev': 'Officer',
    'Jonkheer': 'Royalty', 'Don': 'Royalty', 'Sir': 'Royalty',
    'the Countess': 'Royalty', 'Dona': 'Royalty', 'Lady': 'Royalty',
    'Mme': 'Mrs', 'Ms': 'Mrs',
    'Mlle': 'Miss',
}
train_test['Name_new'] = train_test['Name_new'].replace(TITLE_GROUPS)

# Encode the categorical variables as integers.  `.astype(int)` raises if a
# value is missing from the mapping (map() yields NaN for it), which
# surfaces unexpected categories early.
train_test['Name_new'] = train_test['Name_new'].map({'Mr':0,'Mrs':1,'Miss':2,'Master':3,'Royalty':4,'Officer':5}).astype(int)
train_test['Sex'] = train_test['Sex'].map({'female':1,'male':0}).astype(int)
train_test['Embarked'] = train_test['Embarked'].map({'S':0,'C':1,'Q':2}).astype(int)

# Bucket age into five stages: (0,18], (18,30], (30,40], (40,50], (50,100].
train_test['Age'] = pd.cut(train_test['Age'], bins=[0,18,30,40,50,100], labels=[1,2,3,4,5])
train_test['Age'] = train_test['Age'].astype('float64')

# Drop the features that are not used by the model.
train_test = train_test.drop(columns=['PassengerId','Ticket','Name','Cabin'])

# ---- Split back into train / test --------------------------------------
# The first len(train_set) rows of the concatenation are the training rows;
# using the length avoids hard-coding the row count.
n_train = len(train_set)
train_data = train_test.iloc[:n_train]
test_data = train_test.iloc[n_train:]
train_data_X = train_data.drop(columns=['Survived'])
train_data_Y = train_data['Survived']
test_data_X = test_data.drop(columns=['Survived'])
test_data_Y = test_data['Survived']

In [2]:
import numpy as np
# Import from the public scikit-learn packages: the private module paths
# (sklearn.linear_model.logistic, sklearn.metrics.ranking,
# sklearn.preprocessing.data) were removed in scikit-learn 0.24.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

class GradientBoostingWithLR(object):
    """GBDT + LR stacked classifier.

    A gradient-boosted tree ensemble is fitted first.  For every sample the
    index of the leaf it lands in (one index per tree) is one-hot encoded,
    and a logistic regression is trained on those indicator features.

    Parameters
    ----------
    random_state : int or None, default None
        Forwarded to ``GradientBoostingClassifier``.  The ensemble is built
        with ``max_features=0.5`` and is therefore stochastic; pass an
        integer to make results reproducible.  ``None`` keeps the previous
        unseeded behaviour.

    Attributes
    ----------
    gbdt_model : fitted GBDT, set by ``gbdt_lr_train``.
    lr_model : fitted LR, set by ``gbdt_lr_train``.
    gbdt_encoder : one-hot encoder fitted on the training leaf indices.
    X_train_leafs / X_test_leafs : leaf-index matrices of the last
        training / prediction call.
    X_trans : one-hot encoded training leaf indices.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state
        self.gbdt_model = None
        self.lr_model = None
        self.gbdt_encoder = None
        self.X_train_leafs = None
        self.X_test_leafs = None
        self.X_trans = None

    def gbdt_train(self, X_train, y_train):
        """Build and fit the GBDT model; return the fitted estimator."""
        gbdt_model = GradientBoostingClassifier(n_estimators=10,
                                                max_depth=6,
                                                verbose=0,
                                                max_features=0.5,
                                                random_state=self.random_state)
        gbdt_model.fit(X_train, y_train)
        return gbdt_model

    def lr_train(self, X_train, y_train):
        """Build and fit the LR model; return the fitted estimator."""
        lr_model = LogisticRegression()
        lr_model.fit(X_train, y_train)
        return lr_model

    def gbdt_lr_train(self, X_train, y_train, X_test=None):
        """Train the GBDT + LR pipeline.

        Parameters
        ----------
        X_train, y_train : training features and labels.
        X_test : unused; kept (now optional) for backward compatibility.

        Returns
        -------
        The fitted logistic-regression model (also stored in ``lr_model``).
        """
        self.gbdt_model = self.gbdt_train(X_train, y_train)

        # Encode the original features as leaf indices.  apply() returns an
        # array indexed (sample, estimator, class); slot 0 of the last axis
        # is taken, as in the binary setting used here.
        self.X_train_leafs = self.gbdt_model.apply(X_train)[:, :, 0]

        # One-hot encode the leaf indices.  The encoder is fitted once, on
        # training data only; handle_unknown='ignore' makes a leaf index
        # that was not seen at fit time encode as all zeros instead of
        # raising at prediction time.
        self.gbdt_encoder = OneHotEncoder(categories='auto',
                                          handle_unknown='ignore')
        self.X_trans = self.gbdt_encoder.fit_transform(self.X_train_leafs)

        # Train the LR on the encoded leaves.
        self.lr_model = self.lr_train(self.X_trans, y_train)
        return self.lr_model

    def gbdt_lr_pred(self, model, X_test, y_test):
        """Predict with the GBDT + LR pipeline and report the AUC.

        Parameters
        ----------
        model : fitted LR model exposing ``predict_proba``.
        X_test, y_test : evaluation features and labels.

        Returns
        -------
        float : AUC of the positive-class probabilities (also printed).

        Raises
        ------
        RuntimeError : if ``gbdt_lr_train`` has not been called yet.
        """
        if self.gbdt_model is None or self.gbdt_encoder is None:
            raise RuntimeError(
                "gbdt_lr_train must be called before gbdt_lr_pred")

        self.X_test_leafs = self.gbdt_model.apply(X_test)[:, :, 0]

        # Reuse the encoder fitted during training.  Re-fitting it here
        # would let the test data define the encoding and could change the
        # column layout the LR model was trained on.
        X_test_trans = self.gbdt_encoder.transform(self.X_test_leafs)

        y_pred = model.predict_proba(X_test_trans)[:, 1]
        auc_score = roc_auc_score(y_test, y_pred)
        print('GBDT+LR AUC score: %.5f' % auc_score)
        return auc_score

    def model_assessment(self, model, X_test, y_test, model_name="GBDT"):
        """Evaluate a single model: print and return its AUC on (X_test, y_test)."""
        y_pred = model.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, y_pred)
        print("%s AUC score: %.5f" % (model_name, auc_score))
        return auc_score

In [3]:
# Fit the GBDT -> one-hot -> LR pipeline on the training split, then score
# the resulting LR model on the held-out split (prints and returns the AUC).
model = GradientBoostingWithLR()
model.gbdt_lr_train(X_train=train_data_X, y_train=train_data_Y, X_test=test_data_X)
model.gbdt_lr_pred(model=model.lr_model, X_test=test_data_X, y_test=test_data_Y)

GBDT+LR AUC score: 0.89751


0.8975069252077563