# 模型融合

In [12]:
import sys
sys.path.append('/home/aistudio/external-libraries')

## 基础工具
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from scipy.sparse import csr_matrix
from IPython.display import display, clear_output
import time

warnings.filterwarnings('ignore')
%matplotlib inline

## 模型预测的
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

## 数据降维处理的
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA

import lightgbm as lgbm
import xgboost as xgb

## 参数搜索和评价的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,KFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer

## 模型
import pickle as cPickle

In [None]:
# # ## feature selection
# feat_importance = pd.read_csv('./used_car/feat_importance/lightGBM_feat_importance_0403_v2.csv')
# feature_name = feat_importance['name'].values
# feature_importance = feat_importance['importance'].values

# drop_col = feature_name[feature_importance<4000]
# train.drop(drop_col, axis=1, inplace=True)
# test.drop(drop_col, axis=1, inplace=True)


## 读取特征

In [None]:
# Load the pre-computed feature tables for the training and test sets.
train = pd.read_csv('../training/train_data_180.csv')
test = pd.read_csv('../training/test_data_180.csv')

# Target column first, then the feature matrices (the SaleID identifier and
# the target itself are removed from the model inputs).
train_y = train['price']
train_X = train.drop(columns=['price', 'SaleID'])
test_X = test.drop(columns=['SaleID'])

feat_name = train_X.columns

# 5-fold cross-validation splitters sharing one seed.
# NOTE(review): neither splitter is referenced by the later cells visible here.
kfold = KFold(n_splits=5, shuffle=True, random_state=33)
S_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=33)

print(train_X.shape, test_X.shape, sep='\n')

In [14]:
# Hold out 20% of the training rows as a validation set (fixed seed so the
# split is repeatable).
tr_X, val_X, tr_y, val_y = train_test_split(
    train_X,
    train_y,
    test_size=0.2,
    random_state=2000,
)

# XGBoost's native training API consumes DMatrix objects.
train_data_xgb = xgb.DMatrix(tr_X, label=tr_y)
val_data_xgb = xgb.DMatrix(val_X, label=val_y)

# LightGBM's native training API consumes Dataset objects.
# NOTE(review): these four containers are not referenced by the later cells
# visible here, which use the scikit-learn style estimators instead.
train_data_lgb = lgbm.Dataset(tr_X, label=tr_y, free_raw_data=True)
val_data_lgb = lgbm.Dataset(val_X, label=val_y, free_raw_data=True)

In [15]:
## 定义了一个统计函数，方便后续信息统计
def Sta_inf(data):
    """Print summary statistics of ``data``, one per line.

    The statistics are, in order: minimum, maximum, mean, peak-to-peak range,
    standard deviation and variance, each computed with the NumPy function of
    the same name. Nothing is returned.
    """
    report = (
        ('_min', np.min),
        ('_max:', np.max),
        ('_mean', np.mean),
        ('_ptp', np.ptp),
        ('_std', np.std),
        ('_var', np.var),
    )
    for label, statistic in report:
        print(label, statistic(data))

## 模型选择

In [None]:
from sklearn.feature_selection import SelectFromModel

# Feature selection with a LightGBM regressor as the importance estimator.
#
# The selector is fitted on the training split only (tr_X / tr_y) so that the
# validation rows do not influence which features are kept.
slt = SelectFromModel(lgbm.LGBMRegressor(n_estimators=3000)).fit(tr_X, tr_y)
selected_cols = tr_X.columns[slt.get_support()]

# Apply the same column subset to EVERY feature frame. The models below are
# fitted on tr_X and then predict val_X and test_X, so all of them must carry
# identical columns; reducing only train_X / test_X leaves tr_X / val_X with
# the full feature set and makes predict(test_X) fail on a feature mismatch.
# Selecting by column label (instead of wrapping slt.transform output in a new
# DataFrame) keeps the original column names and row index.
# NOTE: this cell overwrites its inputs; re-run from the data-loading cell
# before executing it a second time.
train_X = train_X[selected_cols]
tr_X = tr_X[selected_cols]
val_X = val_X[selected_cols]
test_X = test_X[selected_cols]

assert list(tr_X.columns) == list(val_X.columns) == list(test_X.columns)
test_X.shape

In [25]:
def build_model_lr(x_train,y_train):
    """Fit an ordinary least-squares linear regression and return it.

    Args:
        x_train: feature matrix passed to ``LinearRegression.fit``.
        y_train: target values passed to ``LinearRegression.fit``.

    Returns:
        The fitted ``linear_model.LinearRegression`` estimator.
    """
    # scikit-learn's fit() returns the estimator itself, so it can be chained.
    return linear_model.LinearRegression().fit(x_train, y_train)

def build_model_lgb(x_train,y_train):
    """Fit a LightGBM regressor with fixed hyperparameters and return it.

    Args:
        x_train: feature matrix passed to ``LGBMRegressor.fit``.
        y_train: target values passed to ``LGBMRegressor.fit``.

    Returns:
        The fitted ``lgbm.LGBMRegressor`` estimator.
    """
    # This file imports the package as `lgbm` (import lightgbm as lgbm); the
    # previous `lgb.` prefix raised NameError.
    #
    # `feature_fraction` is a LightGBM alias of `colsample_bytree`; both were
    # passed with the same value (0.8), so only one spelling is kept.
    #
    # NOTE(review): LightGBM only performs row bagging when the bagging
    # frequency (`subsample_freq` / `bagging_freq`) is > 0, so `subsample=0.9`
    # probably has no effect as written -- confirm the intent before changing.
    gbm = lgbm.LGBMRegressor(n_estimators=8000, max_depth=10, num_leaves=85, min_data_in_leaf=5, subsample=0.9,
                             colsample_bytree=0.8, learning_rate=0.05, lambda_l1=1, lambda_l2=2, n_jobs=-1)
    gbm.fit(x_train, y_train)
    return gbm

def build_model_xgb(x_train,y_train):
    """Fit an XGBoost regressor with fixed hyperparameters and return it.

    Args:
        x_train: feature matrix passed to ``XGBRegressor.fit``.
        y_train: target values passed to ``XGBRegressor.fit``.

    Returns:
        The fitted ``xgb.XGBRegressor`` estimator.
    """
    # Hyperparameters gathered in one place so they are easy to scan and tune.
    params = dict(
        n_estimators=8000,
        learning_rate=0.05,
        max_depth=7,
        min_child_weight=3,
        subsample=0.9,
        colsample_bytree=0.7,
        reg_alpha=3,
        reg_lambda=5,
        n_jobs=-1,
    )
    xgb_r = xgb.XGBRegressor(**params)
    xgb_r.fit(x_train, y_train)
    return xgb_r

In [None]:
# This cell treats the target as log1p(price): it applies np.expm1 to val_y
# and to every model output before scoring. To keep that consistent:
#   * val_xgb / val_lgb stay on the model's (log) scale, because later cells
#     reuse them;
#   * subA_xgb / subA_lgb are on the price scale;
#   * every MAE and every blend below is computed on the price scale.
val_price = np.expm1(val_y)

## Train XGBoost, then predict the validation and test sets
print('Training XGB...')
model_xgb = build_model_xgb(tr_X,tr_y)

print('Predict XGB...')
val_xgb = model_xgb.predict(val_X)
subA_xgb = np.expm1(model_xgb.predict(test_X))

MAE_xgb = mean_absolute_error(val_price,np.expm1(val_xgb))
print('MAE of val with xgb:',MAE_xgb)
print('Status of Predict xgb:')
Sta_inf(subA_xgb)

## Train LightGBM, then predict the validation and test sets
print('Training lgb...')
model_lgb = build_model_lgb(tr_X,tr_y)

print('Predict lgb...')
val_lgb = model_lgb.predict(val_X)
subA_lgb = np.expm1(model_lgb.predict(test_X))

MAE_lgb = mean_absolute_error(val_price,np.expm1(val_lgb))
print('MAE of val with lgb:',MAE_lgb)
print('Status of Predict lgb:')
Sta_inf(subA_lgb)

## Simple weighted blend: each model's weight is the OTHER model's share of
## the total validation error, so the more accurate model counts for more.
w_lgb = 1-MAE_lgb/(MAE_xgb+MAE_lgb)
w_xgb = 1-MAE_xgb/(MAE_xgb+MAE_lgb)

# Blend on the price scale (previously the raw log-scale outputs were blended
# and scored against an undefined `y_val`).
val_Weighted = w_lgb*np.expm1(val_lgb)+w_xgb*np.expm1(val_xgb)
val_Weighted[val_Weighted<0]=10 # a negative price cannot occur in reality, so negative predictions are post-corrected
print('MAE of val with Weighted ensemble:',mean_absolute_error(val_price,val_Weighted))

sub_Weighted = w_lgb*subA_lgb+w_xgb*subA_xgb
sub_Weighted[sub_Weighted<0]=10 # same post-correction for the submitted predictions
sub = pd.DataFrame({'SaleID': test.SaleID, 'price': sub_Weighted})
sub.to_csv('./used_car/submission/sub_Weighted_01.csv',index=False)

## 1. 第一种融合方式 - 加权融合

In [12]:
def Weighted_method(test_pre1,test_pre2,test_pre3,w=[1/3,1/3,1/3]):
    """Blend three prediction vectors with a weighted sum.

    Args:
        test_pre1, test_pre2, test_pre3: predictions of the three models; each
            is wrapped in a ``pd.Series`` before being combined.
        w: the three weights, applied to the predictions in order. Defaults
            to equal weights.

    Returns:
        A ``pd.Series`` holding ``w[0]*pre1 + w[1]*pre2 + w[2]*pre3``.
    """
    predictions = (test_pre1, test_pre2, test_pre3)
    terms = [w[i] * pd.Series(pred) for i, pred in enumerate(predictions)]
    # Accumulate left to right, in the same order as the written formula.
    Weighted_result = terms[0] + terms[1]
    Weighted_result = Weighted_result + terms[2]
    return Weighted_result

# Blend weights for (LightGBM, XGBoost, GBDT), in that order.
w = [0.3,0.4,0.3]

# Third base model. This cell blends three models, but no visible cell defines
# `val_gbdt` / `subA_gbdt`, so the GBDT is fitted here with the estimator that
# is already imported at the top of the notebook.
# NOTE(review): library-default hyperparameters and a fixed seed are used as a
# starting point -- tune them like the other two models.
# scikit-learn's GradientBoostingRegressor rejects NaN inputs, so missing
# values are replaced by a -1 sentinel for this model only.
model_gbdt = GradientBoostingRegressor(random_state=33)
model_gbdt.fit(tr_X.fillna(-1), tr_y)
val_gbdt = model_gbdt.predict(val_X.fillna(-1))                 # same scale as val_lgb / val_xgb
subA_gbdt = np.expm1(model_gbdt.predict(test_X.fillna(-1)))     # price scale, like subA_lgb / subA_xgb

# Validation accuracy. val_* hold log1p-scale model outputs (the training cell
# scores them through np.expm1), so they are converted to prices before being
# blended and are compared with the back-transformed `val_y` (the previous
# `y_val` is not defined anywhere in this notebook).
val_pre = Weighted_method(np.expm1(val_lgb),np.expm1(val_xgb),np.expm1(val_gbdt),w)
MAE_Weighted = mean_absolute_error(np.expm1(val_y),val_pre)
print('MAE of Weighted of val:',MAE_Weighted)

# Test-set predictions (subA_* are already on the price scale).
subA = Weighted_method(subA_lgb,subA_xgb,subA_gbdt,w)
print('Sta inf:')
Sta_inf(subA)

# Write the submission file.
sub = pd.DataFrame()
sub['SaleID'] = test.SaleID
sub['price'] = subA
sub.to_csv('../submission/sub_Weighted_128_0409_01.csv',index=False)

## 2. 第二种融合方式 - 二层Stacking融合

In [None]:
## Stacking

## Level 1: the base models' predictions become the meta-model's features.
##
## All three feature tables are kept on the SAME scale -- the models' raw
## output scale, which the training cell treats as log1p(price). Previously
## the train/test tables held expm1-transformed prices while the validation
## table held raw outputs, so the level-2 model was evaluated on inputs from a
## different scale than it was trained on.
##
## NOTE(review): the training features are in-sample predictions (the base
## models were fitted on tr_X), which tends to make the level-2 model
## over-trust them; out-of-fold predictions are the usual remedy.
train_lgb_pred = model_lgb.predict(tr_X)
train_xgb_pred = model_xgb.predict(tr_X)
# train_gbdt_pred = model_gbdt.predict(tr_X)

Strak_X_train = pd.DataFrame()
Strak_X_train['Method_1'] = train_lgb_pred
Strak_X_train['Method_2'] = train_xgb_pred
# Strak_X_train['Method_3'] = train_gbdt_pred

Strak_X_val = pd.DataFrame()
Strak_X_val['Method_1'] = val_lgb
Strak_X_val['Method_2'] = val_xgb
# Strak_X_val['Method_3'] = val_gbdt

# subA_lgb / subA_xgb were stored after np.expm1, so map them back with
# np.log1p instead of running the (large) models over the test set again.
Strak_X_test = pd.DataFrame()
Strak_X_test['Method_1'] = np.log1p(subA_lgb)
Strak_X_test['Method_2'] = np.log1p(subA_xgb)
# Strak_X_test['Method_3'] = np.log1p(subA_gbdt)

In [None]:
## Level 2: a linear regression fitted on the level-1 predictions.
model_lr_Stacking = build_model_lr(Strak_X_train,tr_y)

## Training set.
## The meta-model is fitted against tr_y, so its output is on tr_y's scale;
## both sides are passed through np.expm1 so the MAE is computed on prices,
## exactly as for the validation set below. (Both print calls were also
## missing their closing parenthesis.)
train_pre_Stacking = model_lr_Stacking.predict(Strak_X_train)
print('MAE of Stacking-LR:',mean_absolute_error(np.expm1(tr_y),np.expm1(train_pre_Stacking)))

## Validation set
val_pre_Stacking = model_lr_Stacking.predict(Strak_X_val)
print('MAE of Stacking-LR:',mean_absolute_error(np.expm1(val_y),np.expm1(val_pre_Stacking)))

## Test set
print('Predict Stacking-LR...')
subA_Stacking = np.expm1(model_lr_Stacking.predict(Strak_X_test))

subA_Stacking[subA_Stacking<10]=10  ## floor implausibly small predicted prices at 10

sub = pd.DataFrame()
sub['SaleID'] = test.SaleID
sub['price'] = subA_Stacking
sub.to_csv('../submission/sub_Stacking.csv',index=False)