In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
tqdm.pandas()

from scipy import stats

import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LinearRegression  #线性回归
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor  #K近邻回归
from sklearn.tree import DecisionTreeRegressor     #决策树回归
from sklearn.ensemble import RandomForestRegressor #随机森林回归
from sklearn.svm import SVR  #支持向量回归
import lightgbm as lgb #lightGbm模型

from sklearn.model_selection import train_test_split # 切分数据
from sklearn.metrics import mean_squared_error #评价指标

In [2]:
train_data_file = "data/zhengqi_train.txt"
test_data_file =  "data/zhengqi_test.txt"

train_data = pd.read_csv(train_data_file, sep = '\t', encoding = 'utf-8')
test_data = pd.read_csv(test_data_file, sep = '\t', encoding = 'utf-8')

In [3]:
# 预处理
from sklearn import preprocessing

features_columns = [col for col in train_data.columns if col not in ['target']]
min_max_scaler = preprocessing.MinMaxScaler()

min_max_scaler  = min_max_scaler.fit(train_data[features_columns])
train_data_scaler = min_max_scaler.transform(train_data[features_columns])
test_data_scaler = min_max_scaler.transform(test_data[features_columns])

train_data_scaler = pd.DataFrame(train_data_scaler)
train_data_scaler.columns = features_columns
test_data_scaler = pd.DataFrame(test_data_scaler)
test_data_scaler.columns = features_columns
train_data_scaler['target'] = train_data['target']

In [4]:
# PCA降维
from sklearn.decomposition import PCA

# 保留16个主成分
pca = PCA(n_components = 16)
new_train_pca_16 = pca.fit_transform(train_data_scaler.iloc[:, 0:-1])
new_test_pca_16 = pca.transform(test_data_scaler)
new_train_pca_16 = pd.DataFrame(new_train_pca_16)
new_test_pca_16 = pd.DataFrame(new_test_pca_16)
new_train_pca_16['target'] = train_data_scaler['target']

new_train_pca_16 = new_train_pca_16.fillna(0)
train = new_train_pca_16[new_test_pca_16.columns]
target = new_train_pca_16['target']
test = test_data_scaler

# 划分验证集
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 0)

## 欠拟合

In [5]:
clf = SGDRegressor(max_iter = 500, tol = 1e-2)
clf.fit(X_train, y_train)
score_train = mean_squared_error(y_train, clf.predict(X_train))
score_val = mean_squared_error(y_val, clf.predict(X_val))
print("SGDRegressor train MSE:   ", score_train)
print("SGDRegressor val MSE:   ", score_val)

SGDRegressor train MSE:    0.1515726845129755
SGDRegressor val MSE:    0.15586174356637902


## 过拟合/正常拟合

In [10]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(4)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)
clf = SGDRegressor(max_iter = 1000, tol = 1e-3)
clf.fit(X_train_poly, y_train)
score_train = mean_squared_error(y_train, clf.predict(X_train_poly))
score_val = mean_squared_error(y_val, clf.predict(X_val_poly))
print("SGDRegressor train MSE:   ", score_train)
print("SGDRegressor val MSE:   ", score_val)

SGDRegressor train MSE:    0.13238208914548613
SGDRegressor val MSE:    0.14297697760646055


## 模型正则化

In [11]:
# L2正则化
poly = PolynomialFeatures(3)
X_train_poly = poly.fit_transform(X_train)
X_val_poly =poly.transform(X_val)
clf = SGDRegressor(max_iter = 1000, tol = 1e-3, penalty = 'l2', alpha = 0.0001)
clf.fit(X_train_poly, y_train)
score_train = mean_squared_error(y_train, clf.predict(X_train_poly))
score_val = mean_squared_error(y_val, clf.predict(X_val_poly))
print("SGDRegressor train MSE:   ", score_train)
print("SGDRegressor test MSE:   ", score_val)

SGDRegressor train MSE:    0.13432411873987127
SGDRegressor test MSE:    0.14280502868587774


In [13]:
# L1正则化
poly = PolynomialFeatures(3)
X_train_poly = poly.fit_transform(X_train)
X_val_poly =poly.transform(X_val)
clf = SGDRegressor(max_iter = 1000, tol = 1e-3, penalty = 'l1', alpha = 0.00001)
clf.fit(X_train_poly, y_train)
score_train = mean_squared_error(y_train, clf.predict(X_train_poly))
score_val = mean_squared_error(y_val, clf.predict(X_val_poly))
print("SGDRegressor train MSE:   ", score_train)
print("SGDRegressor test MSE:   ", score_val)

SGDRegressor train MSE:    0.13434456290733987
SGDRegressor test MSE:    0.14256865012598008


In [14]:
# Elastic Net L1和L2范数加权正则化
poly = PolynomialFeatures(3)
X_train_poly = poly.fit_transform(X_train)
X_val_poly =poly.transform(X_val)
clf = SGDRegressor(max_iter = 1000, tol = 1e-3, penalty = 'elasticnet', l1_ratio = 0.9, alpha = 0.00001)
clf.fit(X_train_poly, y_train)
score_train = mean_squared_error(y_train, clf.predict(X_train_poly))
score_val = mean_squared_error(y_val, clf.predict(X_val_poly))
print("SGDRegressor train MSE:   ", score_train)
print("SGDRegressor test MSE:   ", score_val)

SGDRegressor train MSE:    0.13429286643201233
SGDRegressor test MSE:    0.14224556452390136


## 模型交叉验证

In [15]:
# 简单交叉验证
# train_test_split

In [16]:
# K折交叉验证
from sklearn.model_selection import KFold

kf = KFold(n_splits = 5)
for k, (train_index, val_index) in enumerate(kf.split(train)):
    X_train, X_val = train.values[train_index], train.values[val_index]
    y_train, y_val = target[train_index], target[val_index]
    clf = SGDRegressor(max_iter = 1000, tol = 1e-3)
    clf.fit(X_train, y_train)
    score_train = mean_squared_error(y_train, clf.predict(X_train))
    score_val = mean_squared_error(y_val, clf.predict(X_val))
    print(k, " 折", "SGDRegressor train MSE:   ", score_train)
    print(k, " 折", "SGDRegressor test MSE:   ", score_val, '\n') 

0  折 SGDRegressor train MSE:    0.15000356251049293
0  折 SGDRegressor test MSE:    0.10622087416418229 

1  折 SGDRegressor train MSE:    0.13364067803379637
1  折 SGDRegressor test MSE:    0.1823658908030779 

2  折 SGDRegressor train MSE:    0.14722005039466307
2  折 SGDRegressor test MSE:    0.1334506106367118 

3  折 SGDRegressor train MSE:    0.14065476442376965
3  折 SGDRegressor test MSE:    0.1618515108030737 

4  折 SGDRegressor train MSE:    0.1388734233363064
4  折 SGDRegressor test MSE:    0.1654348638354631 



## 模型超参空间及调参

In [18]:
# 网格搜索
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 0)

rfRegressor = RandomForestRegressor()
parameters = {
    'n_estimators':[50, 100, 200],
    'max_depth':[1, 2, 3],
}

clf = GridSearchCV(rfRegressor, parameters, cv = 5)
clf.fit(X_train, y_train)

score_val = mean_squared_error(y_val, clf.predict(X_val))

print('RandomForestRegressor GridSearchCV test MSE: ', score_val)
sorted(clf.cv_results_.keys())

RandomForestRegressor GridSearchCV test MSE:  0.2569545475330938


['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_max_depth',
 'param_n_estimators',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [24]:
clf.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=50,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [26]:
#随机参数优化
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 0)

rfRegressor = RandomForestRegressor()
parameters = {
    'n_estimators':[50, 100, 200, 300],
    'max_depth':[1, 2, 3, 4, 5],
}

clf = RandomizedSearchCV(rfRegressor, parameters, cv = 5)
clf.fit(X_train, y_train)

score_val = mean_squared_error(y_val, clf.predict(X_val))

print('RandomForestRegressor RandomizedSearchCV test MSE: ', score_val)
sorted(clf.cv_results_.keys())

RandomForestRegressor RandomizedSearchCV test MSE:  0.19751458490309642


['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_max_depth',
 'param_n_estimators',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [28]:
clf.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [29]:
# LGB调参
clf = lgb.LGBMRegressor(num_leaves = 31)

parameters = {
    'learning_rate':[0.01, 0.1, 1],
    'n_estimators': [20, 40],
}

clf = RandomizedSearchCV(clf, parameters, cv = 5)
clf.fit(X_train, y_train)

print('Best parameters found by random search are:', clf.best_params_)
score_val = mean_squared_error(y_val, clf.predict(X_val))
print("LGBMRegressor RandomizedSearchCV test MSE:   ", score_val)

Best parameters found by random search are: {'n_estimators': 40, 'learning_rate': 0.1}
LGBMRegressor RandomizedSearchCV test MSE:    0.1518395195375679


## LGB线下验证

In [54]:
# LGB模型

# 5 折交叉验证
Folds = 5
kf = KFold(n_splits = 5, random_state = 0, shuffle = True)
# 记录训练和预测MSE
MSE_DICT = {
    'train_mse': [],
    'val_mse': [],
}

# 线下训练预测
for i, (train_index, val_index) in enumerate(kf.split(train)):
    lgb_reg = lgb.LGBMRegressor(
        boosting_type = 'gbdt',
        objective = 'regression',
        metric = 'mse',
        train_metric = True,
        n_estimators = 3000,
        early_stopping_rounds = 100,
        n_jobs = -1,
        learning_rate = 0.01,
        max_depth = 4,
        feature_fraction = 0.9,
        feature_fraction_seed = 0,
        bagging_fraction = 0.9, 
        bagging_freq = 2,
        bagging_seed = 0,
        lambda_l1 = 1,
        lambda_l2 = 1,
        verbosity = 1
    )

    # 切分训练集和测试集
    X_train_kf, X_val_kf = train.values[train_index], train.values[val_index]
    y_train_kf, y_val_kf = target[train_index], target[val_index]

    # 训练模型
    lgb_reg.fit(X_train_kf, y_train_kf,
                eval_set = [(X_train_kf, y_train_kf), (X_val_kf, y_val_kf)], 
                eval_names = ['Train', 'Test'], eval_metric = 'mse',
                verbose = 600)
    # 训练集和验证集预测
    y_train_kf_predict = lgb_reg.predict(X_train_kf, num_iteration = lgb_reg.best_iteration_)
    y_val_kf_predict = lgb_reg.predict(X_val_kf, num_iteration = lgb_reg.best_iteration_) 
    
    print('\n第{}折 训练和预测 训练MSE 预测MSE'.format(i))
    train_mse = mean_squared_error(y_train_kf_predict, y_train_kf)
    print('------\t', '训练MSE\t', train_mse, '\t------')
    val_mse = mean_squared_error(y_val_kf_predict, y_val_kf)
    print('------\t', '预测MSE\t', val_mse, '\t------\n')

    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['val_mse'].append(val_mse)

print('训练集平均MSE: ', np.mean(MSE_DICT['train_mse']))
print('验证集平均MSE: ', np.mean(MSE_DICT['val_mse']))

Training until validation scores don't improve for 100 rounds.
[600]	Train's l2: 0.0885543	Test's l2: 0.14705
[1200]	Train's l2: 0.065332	Test's l2: 0.137558
Early stopping, best iteration is:
[1648]	Train's l2: 0.0551673	Test's l2: 0.135952

第0折 训练和预测 训练MSE 预测MSE
------	 训练MSE	 0.05516727388413645 	------
------	 预测MSE	 0.1359523071002833 	------

Training until validation scores don't improve for 100 rounds.
[600]	Train's l2: 0.090066	Test's l2: 0.144457
[1200]	Train's l2: 0.0663808	Test's l2: 0.133415
[1800]	Train's l2: 0.0523241	Test's l2: 0.131299
Early stopping, best iteration is:
[1778]	Train's l2: 0.0528158	Test's l2: 0.131152

第1折 训练和预测 训练MSE 预测MSE
------	 训练MSE	 0.0528157973823114 	------
------	 预测MSE	 0.13115224061373387 	------

Training until validation scores don't improve for 100 rounds.
[600]	Train's l2: 0.0904795	Test's l2: 0.126143
[1200]	Train's l2: 0.0663034	Test's l2: 0.118513
Early stopping, best iteration is:
[1330]	Train's l2: 0.0629847	Test's l2: 0.11819

第2折 