## 特征优化

### 导入数据

In [1]:
import warnings
import pandas as pd

warnings.filterwarnings("ignore")

train_data_file = "./zhengqi_train.txt"
test_data_file =  "./zhengqi_test.txt"

train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')

### 定义特征构造方法，构造特征

In [2]:
epsilon=1e-5

#组交叉特征，可以自行定义，如增加： x*x/y, log(x)/y 等等
func_dict = {
        'add': lambda x,y: x+y,
        'mins': lambda x,y: x-y,
        'div': lambda x,y: x/(y+epsilon),
        'multi': lambda x,y: x*y
        }

### 定义特征构造的函数

In [3]:
def auto_features_make(train_data,test_data,func_dict,col_list):
    train_data, test_data = train_data.copy(), test_data.copy()
    for col_i in col_list:
        for col_j in col_list:
            for func_name, func in func_dict.items():
                for data in [train_data,test_data]:
                    func_features = func(data[col_i],data[col_j])
                    col_func_features = '-'.join([col_i,func_name,col_j])
                    data[col_func_features] = func_features
    return train_data,test_data

### 对训练集和测试集数据进行特征构造

In [4]:
train_data2, test_data2 = auto_features_make(train_data,test_data,func_dict,col_list=test_data.columns)

In [5]:
from sklearn.decomposition import PCA   #主成分分析法

#PCA方法降维
pca = PCA(n_components=500)
train_data2_pca = pca.fit_transform(train_data2.iloc[:,0:-1])
test_data2_pca = pca.transform(test_data2)
train_data2_pca = pd.DataFrame(train_data2_pca)
test_data2_pca = pd.DataFrame(test_data2_pca)
train_data2_pca['target'] = train_data2['target']

In [6]:
X_train2 = train_data2[test_data2.columns].values
y_train = train_data2['target']

### 使用lightgbm模型对新构造的特征进行模型训练和评估

In [7]:
# ls_validation i
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np

# 5折交叉验证
Folds=5
# kf = KFold(len(X_train2), n_splits=Folds, random_state=2019, shuffle=True)
kf = KFold(len(X_train2), random_state=2019, shuffle=True)
# 记录训练和预测MSE
MSE_DICT = {
    'train_mse':[],
    'test_mse':[]
}

# 线下训练预测
for i, (train_index, test_index) in enumerate(kf.split(X_train2)):
    # lgb树模型
    lgb_reg = lgb.LGBMRegressor(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=5000,
        boosting_type='gbdt',
        random_state=2019,
        objective='regression',
    )
   
    # 切分训练集和预测集
    X_train_KFold, X_test_KFold = X_train2[train_index], X_train2[test_index]
    y_train_KFold, y_test_KFold = y_train[train_index], y_train[test_index]
    
    # 训练模型
    lgb_reg.fit(
            X=X_train_KFold,y=y_train_KFold,
            eval_set=[(X_train_KFold, y_train_KFold),(X_test_KFold, y_test_KFold)],
            eval_names=['Train','Test'],
            early_stopping_rounds=100,
            eval_metric='MSE',
            verbose=50
        )

    # 训练集预测 测试集预测
    y_train_KFold_predict = lgb_reg.predict(X_train_KFold,num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict = lgb_reg.predict(X_test_KFold,num_iteration=lgb_reg.best_iteration_) 
    
    print('第{}折 训练和预测 训练MSE 预测MSE'.format(i))
    train_mse = mean_squared_error(y_train_KFold_predict, y_train_KFold)
    print('------\n', '训练MSE\n', train_mse, '\n------')
    test_mse = mean_squared_error(y_test_KFold_predict, y_test_KFold)
    print('------\n', '预测MSE\n', test_mse, '\n------\n')
    
    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)
print('------\n', '训练MSE\n', MSE_DICT['train_mse'], '\n', np.mean(MSE_DICT['train_mse']), '\n------')
print('------\n', '预测MSE\n', MSE_DICT['test_mse'], '\n', np.mean(MSE_DICT['test_mse']), '\n------')

[50]	Train's l2: 0.418976	Test's l2: 0.105755
[100]	Train's l2: 0.203665	Test's l2: 0.0242962
[150]	Train's l2: 0.114456	Test's l2: 0.00489616
[200]	Train's l2: 0.0741974	Test's l2: 2.93437e-07
[250]	Train's l2: 0.0535211	Test's l2: 0.000800416
第0折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.07475946684347008 
------
------
 预测MSE
 3.306387977686079e-08 
------

[50]	Train's l2: 0.419015	Test's l2: 0.140632
[100]	Train's l2: 0.203691	Test's l2: 0.0985215
[150]	Train's l2: 0.114536	Test's l2: 0.079759
[200]	Train's l2: 0.074307	Test's l2: 0.0605385
[250]	Train's l2: 0.0536499	Test's l2: 0.0513692
[300]	Train's l2: 0.0416162	Test's l2: 0.049854
[350]	Train's l2: 0.0335032	Test's l2: 0.0429859
[400]	Train's l2: 0.0276629	Test's l2: 0.0404468
[450]	Train's l2: 0.0231634	Test's l2: 0.0412943
[500]	Train's l2: 0.0195293	Test's l2: 0.0397455
[550]	Train's l2: 0.0165658	Test's l2: 0.0393974
[600]	Train's l2: 0.0141633	Test's l2: 0.0407649
第1折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.01672784298833254 
------

[500]	Train's l2: 0.0194155	Test's l2: 0.00633378
[550]	Train's l2: 0.016443	Test's l2: 0.00380764
[600]	Train's l2: 0.0140302	Test's l2: 0.00372934
[650]	Train's l2: 0.0120808	Test's l2: 0.0035822
第15折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.01606915797637025 
------
------
 预测MSE
 0.003129310364319631 
------

[50]	Train's l2: 0.418345	Test's l2: 1.97427
[100]	Train's l2: 0.203341	Test's l2: 1.17486
[150]	Train's l2: 0.114403	Test's l2: 0.715629
[200]	Train's l2: 0.0743004	Test's l2: 0.48508
[250]	Train's l2: 0.0537036	Test's l2: 0.329718
[300]	Train's l2: 0.0415312	Test's l2: 0.228379
[350]	Train's l2: 0.0333408	Test's l2: 0.190106
[400]	Train's l2: 0.0275096	Test's l2: 0.159225
[450]	Train's l2: 0.0230432	Test's l2: 0.149278
[500]	Train's l2: 0.019421	Test's l2: 0.139129
[550]	Train's l2: 0.0164762	Test's l2: 0.124128
[600]	Train's l2: 0.0140842	Test's l2: 0.114802
[650]	Train's l2: 0.0121376	Test's l2: 0.106123
[700]	Train's l2: 0.0104972	Test's l2: 0.097351
[750]	Train's l2: 0.0091160

[50]	Train's l2: 0.418369	Test's l2: 2.26038
[100]	Train's l2: 0.203413	Test's l2: 1.4536
[150]	Train's l2: 0.114356	Test's l2: 1.08486
[200]	Train's l2: 0.0741732	Test's l2: 0.925499
[250]	Train's l2: 0.0534706	Test's l2: 0.894237
[300]	Train's l2: 0.0414622	Test's l2: 0.949825
第29折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.05739820761799305 
------
------
 预测MSE
 0.8789052267146239 
------

[50]	Train's l2: 0.418938	Test's l2: 0.0845047
[100]	Train's l2: 0.203639	Test's l2: 0.105317
第30折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.9512141800301067 
------
------
 预测MSE
 0.02781226325866893 
------

[50]	Train's l2: 0.418949	Test's l2: 0.00393978
[100]	Train's l2: 0.203635	Test's l2: 0.00167705
[150]	Train's l2: 0.114463	Test's l2: 0.000576895
[200]	Train's l2: 0.0741904	Test's l2: 0.000422131
[250]	Train's l2: 0.0534759	Test's l2: 5.33377e-05
[300]	Train's l2: 0.0413848	Test's l2: 3.9383e-06
[350]	Train's l2: 0.0333057	Test's l2: 8.21557e-06
第31折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.045814183253876435

第47折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.01793796086592235 
------
------
 预测MSE
 0.06212818757186978 
------

[50]	Train's l2: 0.418987	Test's l2: 0.0140196
[100]	Train's l2: 0.20377	Test's l2: 0.0175767
第48折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.951219553298611 
------
------
 预测MSE
 0.005244422362982837 
------

[50]	Train's l2: 0.419043	Test's l2: 0.0383013
[100]	Train's l2: 0.203773	Test's l2: 0.0100435
[150]	Train's l2: 0.114617	Test's l2: 0.00396733
[200]	Train's l2: 0.0743197	Test's l2: 0.00269928
[250]	Train's l2: 0.0536444	Test's l2: 0.0011435
[300]	Train's l2: 0.0415285	Test's l2: 0.000652502
[350]	Train's l2: 0.0334391	Test's l2: 0.000418474
[400]	Train's l2: 0.0275712	Test's l2: 2.88312e-05
[450]	Train's l2: 0.0230815	Test's l2: 3.80808e-07
[500]	Train's l2: 0.019448	Test's l2: 3.51029e-05
第49折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.023241571829709148 
------
------
 预测MSE
 2.904966126806416e-09 
------

[50]	Train's l2: 0.418975	Test's l2: 0.166034
[100]	Train's l2: 0.203831	Test

[100]	Train's l2: 0.203772	Test's l2: 0.00628724
第66折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.9512273766820094 
------
------
 预测MSE
 0.00441110047553642 
------

[50]	Train's l2: 0.418975	Test's l2: 0.00742269
[100]	Train's l2: 0.203659	Test's l2: 0.00190738
[150]	Train's l2: 0.114592	Test's l2: 0.00794872
第67折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.2674154108159777 
------
------
 预测MSE
 2.3305240159585706e-06 
------

[50]	Train's l2: 0.419023	Test's l2: 0.0415342
[100]	Train's l2: 0.203769	Test's l2: 0.0330994
[150]	Train's l2: 0.114508	Test's l2: 0.0231635
[200]	Train's l2: 0.0742715	Test's l2: 0.0189494
[250]	Train's l2: 0.0535659	Test's l2: 0.0171538
[300]	Train's l2: 0.0415403	Test's l2: 0.0171363
第68折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.06326062302181207 
------
------
 预测MSE
 0.016851925159968112 
------

[50]	Train's l2: 0.418084	Test's l2: 3.20952
[100]	Train's l2: 0.203183	Test's l2: 2.08006
[150]	Train's l2: 0.114078	Test's l2: 1.56473
[200]	Train's l2: 0.0740295	Test's l2: 1.20414

[900]	Train's l2: 0.0061124	Test's l2: 0.118195
[950]	Train's l2: 0.00536725	Test's l2: 0.114097
[1000]	Train's l2: 0.00472066	Test's l2: 0.111787
[1050]	Train's l2: 0.00417715	Test's l2: 0.110171
[1100]	Train's l2: 0.00369346	Test's l2: 0.107048
[1150]	Train's l2: 0.00326762	Test's l2: 0.104639
[1200]	Train's l2: 0.00289847	Test's l2: 0.103328
[1250]	Train's l2: 0.00258124	Test's l2: 0.104057
[1300]	Train's l2: 0.00229657	Test's l2: 0.103554
[1350]	Train's l2: 0.00205067	Test's l2: 0.102891
[1400]	Train's l2: 0.00183246	Test's l2: 0.101237
[1450]	Train's l2: 0.00164186	Test's l2: 0.100237
[1500]	Train's l2: 0.00147402	Test's l2: 0.0988037
[1550]	Train's l2: 0.00132496	Test's l2: 0.0977198
[1600]	Train's l2: 0.00119326	Test's l2: 0.0968302
[1650]	Train's l2: 0.00107388	Test's l2: 0.0961431
[1700]	Train's l2: 0.000970239	Test's l2: 0.0956871
[1750]	Train's l2: 0.000878459	Test's l2: 0.095275
[1800]	Train's l2: 0.000795052	Test's l2: 0.0946709
[1850]	Train's l2: 0.000721914	Test's l2: 0.

[200]	Train's l2: 0.0742994	Test's l2: 0.107627
[250]	Train's l2: 0.053589	Test's l2: 0.0725954
[300]	Train's l2: 0.0414712	Test's l2: 0.0479131
[350]	Train's l2: 0.0333712	Test's l2: 0.037782
[400]	Train's l2: 0.0275169	Test's l2: 0.0305524
[450]	Train's l2: 0.0230531	Test's l2: 0.0291581
[500]	Train's l2: 0.0194282	Test's l2: 0.0266891
[550]	Train's l2: 0.0165008	Test's l2: 0.0245644
[600]	Train's l2: 0.0141418	Test's l2: 0.0217423
[650]	Train's l2: 0.0121737	Test's l2: 0.018622
[700]	Train's l2: 0.0105214	Test's l2: 0.0176585
[750]	Train's l2: 0.00913066	Test's l2: 0.0166352
[800]	Train's l2: 0.0079499	Test's l2: 0.0163536
[850]	Train's l2: 0.00692945	Test's l2: 0.0162581
第99折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.008216467003589861 
------
------
 预测MSE
 0.016039685065663382 
------

[50]	Train's l2: 0.41899	Test's l2: 0.0296411
[100]	Train's l2: 0.203735	Test's l2: 0.0116791
[150]	Train's l2: 0.114594	Test's l2: 0.00122921
[200]	Train's l2: 0.0743235	Test's l2: 1.50395e-06
[250]	Trai

[50]	Train's l2: 0.418648	Test's l2: 0.364509
[100]	Train's l2: 0.203375	Test's l2: 0.762343
第118折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.9511930819089869 
------
------
 预测MSE
 0.024532421283671266 
------

[50]	Train's l2: 0.418932	Test's l2: 0.00243219
[100]	Train's l2: 0.203575	Test's l2: 0.0255424
第119折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.5244027851340799 
------
------
 预测MSE
 1.9438412849939942e-07 
------

[50]	Train's l2: 0.418985	Test's l2: 0.00222108
[100]	Train's l2: 0.203706	Test's l2: 8.1599e-06
[150]	Train's l2: 0.114472	Test's l2: 0.00142137
第120折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.21761012417323125 
------
------
 预测MSE
 1.6198240532680981e-09 
------

[50]	Train's l2: 0.418993	Test's l2: 0.000858458
[100]	Train's l2: 0.203737	Test's l2: 0.00184948
[150]	Train's l2: 0.114561	Test's l2: 0.0050258
第121折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.3327342300877382 
------
------
 预测MSE
 2.588510987910583e-07 
------

[50]	Train's l2: 0.418919	Test's l2: 0.0752782
[100]	Train's l2: 0.20

[850]	Train's l2: 0.00697401	Test's l2: 0.0440121
[900]	Train's l2: 0.0061143	Test's l2: 0.0419541
[950]	Train's l2: 0.00537623	Test's l2: 0.0414121
[1000]	Train's l2: 0.00474652	Test's l2: 0.0403733
[1050]	Train's l2: 0.00419163	Test's l2: 0.0391563
[1100]	Train's l2: 0.00370895	Test's l2: 0.0377543
[1150]	Train's l2: 0.00329042	Test's l2: 0.0366689
[1200]	Train's l2: 0.0029209	Test's l2: 0.035806
[1250]	Train's l2: 0.00260439	Test's l2: 0.0353882
[1300]	Train's l2: 0.00232037	Test's l2: 0.0349751
[1350]	Train's l2: 0.00206843	Test's l2: 0.0334287
[1400]	Train's l2: 0.00184531	Test's l2: 0.0330335
[1450]	Train's l2: 0.00165145	Test's l2: 0.0329243
[1500]	Train's l2: 0.00148318	Test's l2: 0.0327867
[1550]	Train's l2: 0.00133198	Test's l2: 0.0326902
[1600]	Train's l2: 0.00119847	Test's l2: 0.0325663
[1650]	Train's l2: 0.00108228	Test's l2: 0.0328261
[1700]	Train's l2: 0.000978958	Test's l2: 0.0327987
第131折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.0011935961627038845 
------
------
 预测MSE
 0.0

[1350]	Train's l2: 0.00205251	Test's l2: 0.162502
[1400]	Train's l2: 0.00183395	Test's l2: 0.160777
[1450]	Train's l2: 0.00164472	Test's l2: 0.161216
[1500]	Train's l2: 0.00147566	Test's l2: 0.16012
[1550]	Train's l2: 0.00132715	Test's l2: 0.159545
[1600]	Train's l2: 0.00119861	Test's l2: 0.158554
[1650]	Train's l2: 0.00108197	Test's l2: 0.15873
[1700]	Train's l2: 0.000976928	Test's l2: 0.158314
[1750]	Train's l2: 0.000882446	Test's l2: 0.158232
第145折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.001000334507481624 
------
------
 预测MSE
 0.15788772022960437 
------

[50]	Train's l2: 0.419038	Test's l2: 0.00179043
[100]	Train's l2: 0.20367	Test's l2: 0.000115689
[150]	Train's l2: 0.11452	Test's l2: 7.20295e-06
[200]	Train's l2: 0.0742732	Test's l2: 4.10451e-05
[250]	Train's l2: 0.0535568	Test's l2: 0.00114664
第146折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.099155410008297 
------
------
 预测MSE
 9.352160670776017e-09 
------

[50]	Train's l2: 0.418956	Test's l2: 0.383917
[100]	Train's l2: 0.203784	Test's 

[600]	Train's l2: 0.0140369	Test's l2: 0.293085
[650]	Train's l2: 0.012106	Test's l2: 0.296694
第155折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.014568198216446015 
------
------
 预测MSE
 0.29035482954736297 
------

[50]	Train's l2: 0.41891	Test's l2: 0.000840902
[100]	Train's l2: 0.203614	Test's l2: 1.70224e-05
[150]	Train's l2: 0.114471	Test's l2: 1.23523e-05
[200]	Train's l2: 0.074115	Test's l2: 0.000305336


KeyboardInterrupt: 