In [1]:
import pandas as pd 

# Locations of the tab-separated training and test files (relative to the notebook).
train_data_file = 'data/zhengqi_train.txt'
test_data_file = 'data/zhengqi_test.txt'

# Read both files into DataFrames; columns are tab-delimited and the text is UTF-8.
train_data = pd.read_csv(
    train_data_file,
    sep='\t',
    encoding='utf-8',
)
test_data = pd.read_csv(
    test_data_file,
    sep='\t',
    encoding='utf_8',
)

## 定义特征构造方法

In [2]:
# Small offset added to the divisor of the 'div' feature so that a zero
# denominator does not produce a division by zero.
eps = 1e-5

# Pairwise feature-crossing operations, keyed by the tag that ends up in the
# generated column name. Insertion order matters: it fixes the column order
# of the generated features.
func_dict = dict(
    add=lambda x, y: x + y,            # element-wise sum
    multi=lambda x, y: x * y,          # element-wise product
    div=lambda x, y: x / (y + eps),    # element-wise ratio with offset divisor
)

In [3]:
# Feature construction
def auto_features(train_data, test_data, func_dict, col_list):
    """Append pairwise cross features to copies of the train and test frames.

    For every ordered pair ``(col_i, col_j)`` taken from ``col_list``
    (including ``col_i == col_j``) and every ``(func_name, func)`` entry of
    ``func_dict``, a column named ``'<col_i>-<func_name>-<col_j>'`` holding
    ``func(data[col_i], data[col_j])`` is added to each frame.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Input frames. They are not modified; new frames are returned.
    func_dict : dict
        Maps a name (used inside the new column name) to a two-argument
        callable applied to a pair of columns.
    col_list : iterable of str
        Names of the columns to cross; each must exist in both frames.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The train and test frames with the original columns first, followed
        by the generated columns in ``col_i`` / ``col_j`` / ``func_dict``
        iteration order.
    """
    # Materialise once so the nested loops can iterate it repeatedly even if
    # a one-shot iterable was passed.
    col_list = list(col_list)

    def _with_cross_features(data):
        # Collect every generated column first instead of inserting them one
        # by one: repeated single-column inserts fragment the DataFrame and
        # make pandas emit PerformanceWarning.
        generated = {}
        for col_i in col_list:
            for col_j in col_list:
                for func_name, func in func_dict.items():
                    name = '-'.join([col_i, func_name, col_j])
                    generated[name] = func(data[col_i], data[col_j])

        result = data.copy()
        # A generated name that already exists in the frame overwrites that
        # column in place rather than creating a duplicate label.
        for name in [key for key in generated if key in result.columns]:
            result[name] = generated.pop(name)
        if not generated:
            return result

        # Attach all remaining new columns in a single concatenation.
        new_block = pd.DataFrame(generated, index=result.index)
        return pd.concat([result, new_block], axis=1)

    return _with_cross_features(train_data), _with_cross_features(test_data)

## 构造特征并降维

In [4]:
# Build the cross features, crossing every column that is present in the test frame.
train_data2, test_data2 = auto_features(
    train_data,
    test_data,
    func_dict,
    col_list=test_data.columns,
)

In [5]:
# PCA dimensionality reduction
from sklearn.decomposition import PCA

# Use exactly the columns the test frame has, in the same order, for both fit
# and transform. Slicing the train frame with iloc[:, 0:-1] does NOT remove
# 'target' here: after feature construction 'target' sits in the middle of
# train_data2, so that slice kept the label and dropped the last generated
# feature, leaving the train and test matrices misaligned.
feature_cols = test_data2.columns

# random_state fixes the result when a randomized SVD solver is selected.
pca = PCA(n_components = 500, random_state = 0)
train_data2_pca = pca.fit_transform(train_data2[feature_cols])
test_data2_pca = pca.transform(test_data2[feature_cols])
train_data2_pca = pd.DataFrame(train_data2_pca)
test_data2_pca = pd.DataFrame(test_data2_pca)
# Assign positionally so the label column does not depend on index alignment.
train_data2_pca['target'] = train_data2['target'].values

# Training inputs
# NOTE(review): X_train2 is the full cross-feature matrix, not the PCA output
# computed above — confirm this is intended.
X_train2 = train_data2[feature_cols].values
y_train = train_data2['target']

## LGB模型训练

In [6]:
# ls_validation i
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np

# 5-fold cross-validation; shuffling is seeded so the folds are reproducible
Folds = 5
kf = KFold(n_splits = Folds, random_state = 0, shuffle = True)
# Per-fold MSE on the fitted part ('train_mse') and the held-out part ('test_mse')
MSE_DICT = {
    'train_mse':[],
    'test_mse':[]
}

# Offline training and evaluation, one fresh model per fold
for i, (train_index, test_index) in enumerate(kf.split(X_train2)):
    # LightGBM gradient-boosted tree regressor
    # NOTE(review): early stopping is configured both here and in fit();
    # which of the two is accepted depends on the installed LightGBM
    # version — confirm before upgrading.
    lgb_reg = lgb.LGBMRegressor(
        boosting_type = 'gbdt',
        objective = 'regression',
        metric = 'mse',
        train_metric = True,
        n_estimators = 3000,
        early_stopping_rounds = 100,
        n_jobs = -1,
        learning_rate = 0.01,
        max_depth = 4,
        feature_fraction = 0.8,
        feature_fraction_seed = 0,
        bagging_fraction = 0.8,
        bagging_freq = 2,
        bagging_seed = 0,
        lambda_l1 = 1,
        lambda_l2 = 1,
        verbosity = 1
    )

    # Split into the fold's fitting part and held-out part.
    # KFold yields positional indices, so the Series must be indexed with
    # .iloc; plain [] is label-based and only coincides with positions when
    # the index is the default 0..n-1 range.
    X_train_KFold, X_test_KFold = X_train2[train_index], X_train2[test_index]
    y_train_KFold, y_test_KFold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fit, tracking the metric on both parts; the last eval set ('Test')
    # drives early stopping
    lgb_reg.fit(
            X=X_train_KFold,y=y_train_KFold,
            eval_set=[(X_train_KFold, y_train_KFold),(X_test_KFold, y_test_KFold)],
            eval_names=['Train','Test'],
            early_stopping_rounds=100,
            eval_metric='MSE',
            verbose=600
        )


    # Predict on both parts using the best iteration found by early stopping
    y_train_KFold_predict = lgb_reg.predict(X_train_KFold,num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict = lgb_reg.predict(X_test_KFold,num_iteration=lgb_reg.best_iteration_)

    print('第{}折 训练和预测 训练MSE 预测MSE'.format(i))
    # scikit-learn metric signature is (y_true, y_pred)
    train_mse = mean_squared_error(y_train_KFold, y_train_KFold_predict)
    print('------\t', '训练MSE\t', train_mse, '\t------')
    test_mse = mean_squared_error(y_test_KFold, y_test_KFold_predict)
    print('------\t', '预测MSE\t', test_mse, '\t------\n')

    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)
# Average MSE across folds
print('------\t', '训练平均MSE\t', np.mean(MSE_DICT['train_mse']), '\t------')
print('------\t', '预测平均MSE\t', np.mean(MSE_DICT['test_mse']), '\t------')

Training until validation scores don't improve for 100 rounds.
[600]	Train's l2: 0.0429543	Test's l2: 0.117409
[1200]	Train's l2: 0.0206657	Test's l2: 0.11418
Early stopping, best iteration is:
[1698]	Train's l2: 0.0119091	Test's l2: 0.112934
第0折 训练和预测 训练MSE 预测MSE
------	 训练MSE	 0.011909097708741524 	------
------	 预测MSE	 0.11293385783352594 	------

Training until validation scores don't improve for 100 rounds.
[600]	Train's l2: 0.0446672	Test's l2: 0.104044
[1200]	Train's l2: 0.0207863	Test's l2: 0.100736
[1800]	Train's l2: 0.010513	Test's l2: 0.0996182
Early stopping, best iteration is:
[1808]	Train's l2: 0.0104171	Test's l2: 0.0995806
第1折 训练和预测 训练MSE 预测MSE
------	 训练MSE	 0.010417092420258707 	------
------	 预测MSE	 0.09958063481516348 	------

Training until validation scores don't improve for 100 rounds.
[600]	Train's l2: 0.044053	Test's l2: 0.10463
[1200]	Train's l2: 0.0209939	Test's l2: 0.101379
[1800]	Train's l2: 0.0109506	Test's l2: 0.0996974
[2400]	Train's l2: 0.00607394	Test'