In [2]:
#将kilometer当做类别变量处理试试,异常值用groupby处理,'匿名特征可以进一步处理一下'
## 基础工具
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time
from tqdm import tqdm
import itertools

warnings.filterwarnings('ignore')
%matplotlib inline

## 模型预测的
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

## 数据降维处理的
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA

import lightgbm as lgb
import xgboost as xgb

## 参数搜索和评价的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

import scipy.signal as signal

In [3]:
#处理异常值
def smooth_cols(group,out_value,kind):
    cols = ['power']
    if kind == 'g':
        for col in cols:
            yes_no = (group[col]<out_value).astype('int')
            new = yes_no * group[col]
            group[col] = new.replace(0,group[col].quantile(q=0.995))
        return group
    if kind == 'l':
        for col in cols:
            yes_no = (group[col]>out_value).astype('int')
            new = yes_no * group[col]
            group[col] = new.replace(0,group[col].quantile(q=0.07))
        return group        
def date_proc(x):
    m = int(x[4:6])
    if m == 0:
        m = 1
    return x[:4] + '-' + str(m) + '-' + x[6:]

#定义日期提取函数
def date_tran(df,fea_col):
    for f in tqdm(fea_col):
        df[f] = pd.to_datetime(df[f].astype('str').apply(date_proc))
        df[f + '_year'] = df[f].dt.year
        df[f + '_month'] = df[f].dt.month
        df[f + '_day'] = df[f].dt.day
        df[f + '_dayofweek'] = df[f].dt.dayofweek
    return (df)

#分桶操作
def cut_group(df,cols,num_bins=50):
    for col in cols:
        all_range = int(df[col].max()-df[col].min())
        bin = [i*all_range/num_bins for i in range(all_range)]
        df[col+'_bin'] = pd.cut(df[col], bin, labels=False)
    return df

### count编码
def count_coding(df,fea_col):
    for f in fea_col:
        df[f + '_count'] = df[f].map(df[f].value_counts())
    return(df)
#定义交叉特征统计
def cross_cat_num(df,num_col,cat_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            feat = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max', '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
            })
            df = df.merge(feat, on=f1, how='left')
    return(df)
### 类别特征的二阶交叉
from scipy.stats import entropy
def cross_qua_cat_num(df):
    for f_pair in tqdm([
        ['model', 'brand'], ['model', 'regionCode'], ['brand', 'regionCode']
    ]):
        ### 共现次数
        df['_'.join(f_pair) + '_count'] = df.groupby(f_pair)['SaleID'].transform('count')
        ### n unique、熵
        df = df.merge(df.groupby(f_pair[0], as_index=False)[f_pair[1]].agg({
            '{}_{}_nunique'.format(f_pair[0], f_pair[1]): 'nunique',
            '{}_{}_ent'.format(f_pair[0], f_pair[1]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[0], how='left')
        df = df.merge(df.groupby(f_pair[1], as_index=False)[f_pair[0]].agg({
            '{}_{}_nunique'.format(f_pair[1], f_pair[0]): 'nunique',
            '{}_{}_ent'.format(f_pair[1], f_pair[0]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[1], how='left')
        ### 比例偏好
        df['{}_in_{}_prop'.format(f_pair[0], f_pair[1])] = df['_'.join(f_pair) + '_count'] / df[f_pair[1] + '_count']
        df['{}_in_{}_prop'.format(f_pair[1], f_pair[0])] = df['_'.join(f_pair) + '_count'] / df[f_pair[0] + '_count']
    return (df)
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df


In [4]:
## 通过Pandas对于数据进行读取 (pandas是一个很友好的数据读取函数库)
Train_data = reduce_mem_usage(pd.read_csv('./used_car_train_20200313.csv', sep=' '))
TestA_data = reduce_mem_usage(pd.read_csv('./used_car_testB_20200421.csv', sep=' '))

#Train_data = Train_data[Train_data['price']>100]
#Train_data['price'] = np.log1p(Train_data['price'])
## 输出数据的大小信息
print('Train data shape:',Train_data.shape)
print('TestA data shape:',TestA_data.shape)


#合并数据集
concat_data = pd.concat([Train_data,TestA_data])
concat_data['notRepairedDamage'] = concat_data['notRepairedDamage'].replace('-',0).astype('float16')
concat_data = concat_data.fillna(concat_data.mode().iloc[0,:])
#concat_data.index = range(200000)
#concat_data = concat_data.groupby('bodyType').apply(smooth_cols,out_value=600,kind='g')
#concat_data.index = range(200000)
#concat_data['power'] = np.log(concat_data['power'])
print('concat_data shape:',concat_data.shape)

Memory usage of dataframe is 37200080.00 MB
Memory usage after optimization is: 10200184.00 MB
Decreased by 72.6%
Memory usage of dataframe is 12000080.00 MB
Memory usage after optimization is: 3200184.00 MB
Decreased by 73.3%
Train data shape: (150000, 31)
TestA data shape: (50000, 30)
concat_data shape: (200000, 31)


In [5]:
#截断异常值
concat_data['power'][concat_data['power']>600] = 600
concat_data['power'][concat_data['power']<1] = 1

concat_data['v_13'][concat_data['v_13']>6] = 6
concat_data['v_14'][concat_data['v_14']>4] = 4

In [6]:
for i in ['v_' +str(i) for i in range(14)]:
    for j in ['v_' +str(i) for i in range(14)]:
        concat_data[str(i)+'+'+str(j)] = concat_data[str(i)]+concat_data[str(j)]
for i in ['model','brand', 'bodyType', 'fuelType','gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode']:
    for j in ['v_' +str(i) for i in range(14)]:
        concat_data[str(i)+'*'+str(j)] = concat_data[i]*concat_data[j]    
concat_data.shape

(200000, 353)

In [7]:
#提取日期信息
date_cols = ['regDate', 'creatDate']
concat_data = date_tran(concat_data,date_cols)

100%|██████████| 2/2 [00:00<00:00,  2.83it/s]


In [8]:
data = concat_data.copy()

#count编码
count_list = ['regDate', 'creatDate', 'model', 'brand', 'regionCode','bodyType','fuelType','name','regDate_year', 'regDate_month', 'regDate_day',
       'regDate_dayofweek' , 'creatDate_month','creatDate_day', 'creatDate_dayofweek','kilometer']
       
data = count_coding(data,count_list)

In [9]:
#特征构造
# 使用时间：data['creatDate'] - data['regDate']，反应汽车使用时间，一般来说价格与使用时间成反比
# 不过要注意，数据里有时间出错的格式，所以我们需要 errors='coerce'
data['used_time1'] = (pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce') - 
                            pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')).dt.days
data['used_time2'] = (pd.datetime.now() - pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')).dt.days                        
data['used_time3'] = (pd.datetime.now() - pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce') ).dt.days

#分桶操作
cut_cols = ['power']+['used_time1','used_time2','used_time3']
data = cut_group(data,cut_cols,50)


In [10]:
### 用数值特征对类别特征做统计刻画，随便挑了几个跟price相关性最高的匿名特征
cross_cat = ['model', 'brand','regDate_year']
cross_num = ['v_0','v_3', 'v_4', 'v_8', 'v_12','power']
data = cross_cat_num(data,cross_num,cross_cat)#一阶交叉
#data = cross_qua_cat_num(data)#二阶交叉

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/6 [00:00<?, ?it/s]
 17%|█▋        | 1/6 [00:01<00:08,  1.63s/it]
 33%|███▎      | 2/6 [00:02<00:05,  1.26s/it]
 50%|█████     | 3/6 [00:02<00:02,  1.00it/s]
 67%|██████▋   | 4/6 [00:02<00:01,  1.22it/s]
 83%|████████▎ | 5/6 [00:03<00:00,  1.44it/s]
 33%|███▎      | 1/3 [00:03<00:07,  3.64s/it]
  0%|          | 0/6 [00:00<?, ?it/s]
 17%|█▋        | 1/6 [00:02<00:10,  2.03s/it]
 33%|███▎      | 2/6 [00:02<00:06,  1.54s/it]
 50%|█████     | 3/6 [00:02<00:03,  1.19s/it]
 67%|██████▋   | 4/6 [00:03<00:01,  1.05it/s]
 83%|████████▎ | 5/6 [00:03<00:00,  1.27it/s]
 67%|██████▋   | 2/3 [00:07<00:03,  3.79s/it]
  0%|          | 0/6 [00:00<?, ?it/s]
 17%|█▋        | 1/6 [00:02<00:12,  2.46s/it]
 33%|███▎      | 2/6 [00:02<00:07,  1.84s/it]
 50%|█████     | 3/6 [00:03<00:04,  1.42s/it]
 67%|██████▋   | 4/6 [00:03<00:02,  1.11s/it]
 83%|████████▎ | 5/6 [00:04<00:00,  1.11it/s]
100%|██████████| 3/3 [00:12<00:00,  4.06s/it]


In [11]:
## 选择特征列
numerical_cols = data.columns
#print(numerical_cols)

cat_fea = ['SaleID','offerType','seller']
feature_cols = [col for col in numerical_cols if col not in cat_fea]
feature_cols = [col for col in feature_cols if col not in ['price']]

## 提前特征列，标签列构造训练样本和测试样本
X_data = data.iloc[:len(Train_data),:][feature_cols]
Y_data = Train_data['price']
X_test  = data.iloc[len(Train_data):,:][feature_cols]

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold,KFold
from itertools import product
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=10, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
 
        :param n_splits: the number of splits used in mean encoding
 
        :param target_type: str, 'regression' or 'classification'
 
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """
 
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
 
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
 
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
 
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
 
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
 
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'})
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)
 
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
 
        return nf_train, nf_test, prior, col_avg_y
 
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
 
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
 
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
 
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
 
        return X_new


In [13]:
class_list = ['model','brand','name','regionCode']+date_cols
MeanEnocodeFeature = class_list#声明需要平均数编码的特征
ME = MeanEncoder(MeanEnocodeFeature,target_type='regression') #声明平均数编码的类
X_data = ME.fit_transform(X_data,Y_data)#对训练数据集的X和y进行拟合
#x_train_fav = ME.fit_transform(x_train,y_train_fav)#对训练数据集的X和y进行拟合
X_test = ME.transform(X_test)#对测试集进行编码

In [14]:
X_data['price'] = Train_data['price']

In [15]:
from sklearn.model_selection import KFold

### target encoding目标编码，回归场景相对来说做目标编码的选择更多，不仅可以做均值编码，还可以做标准差编码、中位数编码等
enc_cols = []
stats_default_dict = {
    'max': X_data['price'].max(),
    'min': X_data['price'].min(),
    'median': X_data['price'].median(),
    'mean': X_data['price'].mean(),
    'sum': X_data['price'].sum(),
    'std': X_data['price'].std(),
    'skew': X_data['price'].skew(),
    'kurt': X_data['price'].kurt(),
    'mad': X_data['price'].mad()
}
### 暂且选择这三种编码
enc_stats = ['max','min','mean']
skf = KFold(n_splits=10, shuffle=True, random_state=42)
for f in tqdm(['regionCode','brand','regDate_year','creatDate_year','kilometer','model']):
    enc_dict = {}
    for stat in enc_stats:
        enc_dict['{}_target_{}'.format(f, stat)] = stat
        X_data['{}_target_{}'.format(f, stat)] = 0
        X_test['{}_target_{}'.format(f, stat)] = 0
        enc_cols.append('{}_target_{}'.format(f, stat))
    for i, (trn_idx, val_idx) in enumerate(skf.split(X_data, Y_data)):
        trn_x, val_x = X_data.iloc[trn_idx].reset_index(drop=True), X_data.iloc[val_idx].reset_index(drop=True)
        enc_df = trn_x.groupby(f, as_index=False)['price'].agg(enc_dict)
        val_x = val_x[[f]].merge(enc_df, on=f, how='left')
        test_x = X_test[[f]].merge(enc_df, on=f, how='left')
        for stat in enc_stats:
            val_x['{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].fillna(stats_default_dict[stat])
            test_x['{}_target_{}'.format(f, stat)] = test_x['{}_target_{}'.format(f, stat)].fillna(stats_default_dict[stat])
            X_data.loc[val_idx, '{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].values 
            X_test['{}_target_{}'.format(f, stat)] += test_x['{}_target_{}'.format(f, stat)].values / skf.n_splits

100%|██████████| 6/6 [00:25<00:00,  4.23s/it]


In [16]:

drop_list = ['regDate', 'creatDate','brand_power_min', 'regDate_year_power_min']
x_train = X_data.drop(drop_list+['price'],axis=1)
x_test = X_test.drop(drop_list,axis=1)
x_train.shape

(150000, 454)

In [17]:
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

In [18]:
from sklearn.preprocessing import MinMaxScaler
#特征归一化
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(pd.concat([x_train,x_test]).values)
all_data = min_max_scaler.transform(pd.concat([x_train,x_test]).values)

In [19]:
from sklearn import decomposition
pca = decomposition.PCA(n_components=146)
all_pca = pca.fit_transform(all_data)
X_pca = all_pca[:len(x_train)]
test = all_pca[len(x_train):]
y = Train_data['price'].values

In [20]:
from keras.layers import Conv1D, Activation, MaxPool1D, Flatten, Dense
from keras.layers import Input, Dense, Concatenate, Reshape, Dropout, merge, Add
def NN_model(input_dim):
    init = keras.initializers.glorot_uniform(seed=1)
    model = keras.models.Sequential()
    model.add(Dense(units=300, input_dim=input_dim, kernel_initializer=init, activation='softplus'))
    #model.add(Dropout(0.2))
    model.add(Dense(units=300, kernel_initializer=init, activation='softplus'))
    #model.add(Dropout(0.2))
    model.add(Dense(units=64, kernel_initializer=init, activation='softplus'))
    model.add(Dense(units=32, kernel_initializer=init, activation='softplus'))
    model.add(Dense(units=8, kernel_initializer=init, activation='softplus'))
    model.add(Dense(units=1))
    return model


In [21]:
from keras.callbacks import Callback, EarlyStopping
class Metric(Callback):
    def __init__(self, model, callbacks, data):
        super().__init__()
        self.model = model
        self.callbacks = callbacks
        self.data = data

    def on_train_begin(self, logs=None):
        for callback in self.callbacks:
            callback.on_train_begin(logs)

    def on_train_end(self, logs=None):
        for callback in self.callbacks:
            callback.on_train_end(logs)

    def on_epoch_end(self, batch, logs=None):
        X_train, y_train = self.data[0][0], self.data[0][1]
        y_pred3 = self.model.predict(X_train)
        y_pred = np.zeros((len(y_pred3), ))
        y_true = np.zeros((len(y_pred3), ))
        for i in range(len(y_pred3)):
            y_pred[i] = y_pred3[i]
        for i in range(len(y_pred3)):
            y_true[i] = y_train[i]
        trn_s = mean_absolute_error(y_true, y_pred)
        logs['trn_score'] = trn_s
        
        X_val, y_val = self.data[1][0], self.data[1][1]
        y_pred3 = self.model.predict(X_val)
        y_pred = np.zeros((len(y_pred3), ))
        y_true = np.zeros((len(y_pred3), ))
        for i in range(len(y_pred3)):
            y_pred[i] = y_pred3[i]
        for i in range(len(y_pred3)):
            y_true[i] = y_val[i]
        val_s = mean_absolute_error(y_true, y_pred)
        logs['val_score'] = val_s
        print('trn_score', trn_s, 'val_score', val_s)

        for callback in self.callbacks:
            callback.on_epoch_end(batch, logs)

In [22]:
import keras.backend as K
from keras.callbacks import LearningRateScheduler
  
def scheduler(epoch):
    # 每隔100个epoch，学习率减小为原来的1/10
    if epoch % 20 == 0 and epoch != 0:
        lr = K.get_value(model.optimizer.lr)
        K.set_value(model.optimizer.lr, lr * 0.6)
        print("lr changed to {}".format(lr * 0.6))
    return K.get_value(model.optimizer.lr)
reduce_lr = LearningRateScheduler(scheduler)
#model.fit(train_x, train_y, batch_size=32, epochs=5, callbacks=[reduce_lr])

In [26]:
n_splits = 6
kf = KFold(n_splits=n_splits, shuffle=True)

import keras 

b_size = 2000
max_epochs = 145
oof_pred = np.zeros((len(X_pca), ))

sub = pd.read_csv('./used_car_testB_20200421.csv',sep = ' ')[['SaleID']].copy()
sub['price'] = 0

avg_mae = 0
for fold, (trn_idx, val_idx) in enumerate(kf.split(X_pca, y)):
    print('fold:', fold)
    X_train, y_train = X_pca[trn_idx], y[trn_idx]
    X_val, y_val = X_pca[val_idx], y[val_idx]
    
    model = NN_model(X_train.shape[1])
    simple_adam = keras.optimizers.Adam(lr = 0.015)
    
    model.compile(loss='mae', optimizer=simple_adam,metrics=['mae'])
    es = EarlyStopping(monitor='val_score', patience=10, verbose=2, mode='min', restore_best_weights=True,)
    es.set_model(model)
    metric = Metric(model, [es], [(X_train, y_train), (X_val, y_val)])
    model.fit(X_train, y_train, batch_size=b_size, epochs=max_epochs, 
              validation_data = (X_val, y_val),
              callbacks=[reduce_lr], shuffle=True, verbose=2)
    y_pred3 = model.predict(X_val)
    y_pred = np.zeros((len(y_pred3), ))
    sub['price'] += model.predict(test).reshape(-1,)/n_splits
    for i in range(len(y_pred3)):
        y_pred[i] = y_pred3[i]
        
    oof_pred[val_idx] = y_pred
    val_mae = mean_absolute_error(y[val_idx], y_pred)
    avg_mae += val_mae/n_splits
    print()
    print('val_mae is:{}'.format(val_mae))
    print()
mean_absolute_error(y, oof_pred)

fold: 0
Epoch 1/145
63/63 - 1s - loss: 2569.0168 - mae: 2569.0168 - val_loss: 904.7879 - val_mae: 904.7879
Epoch 2/145
63/63 - 1s - loss: 797.3423 - mae: 797.3423 - val_loss: 702.4642 - val_mae: 702.4642
Epoch 3/145
63/63 - 1s - loss: 678.4730 - mae: 678.4730 - val_loss: 608.1737 - val_mae: 608.1737
Epoch 4/145
63/63 - 1s - loss: 672.4891 - mae: 672.4891 - val_loss: 676.0610 - val_mae: 676.0610
Epoch 5/145
63/63 - 1s - loss: 569.9478 - mae: 569.9478 - val_loss: 536.1880 - val_mae: 536.1880
Epoch 6/145
63/63 - 1s - loss: 562.4293 - mae: 562.4293 - val_loss: 793.1977 - val_mae: 793.1977
Epoch 7/145
63/63 - 1s - loss: 562.5613 - mae: 562.5613 - val_loss: 513.3967 - val_mae: 513.3967
Epoch 8/145
63/63 - 1s - loss: 634.1051 - mae: 634.1051 - val_loss: 703.5747 - val_mae: 703.5747
Epoch 9/145
63/63 - 1s - loss: 544.2729 - mae: 544.2729 - val_loss: 570.0464 - val_mae: 570.0464
Epoch 10/145
63/63 - 1s - loss: 544.3489 - mae: 544.3489 - val_loss: 597.1138 - val_mae: 597.1138
Epoch 11/145
63/63 

Epoch 84/145
63/63 - 1s - loss: 374.7967 - mae: 374.7967 - val_loss: 421.0315 - val_mae: 421.0315
Epoch 85/145
63/63 - 1s - loss: 373.9765 - mae: 373.9765 - val_loss: 420.3019 - val_mae: 420.3019
Epoch 86/145
63/63 - 1s - loss: 373.0105 - mae: 373.0105 - val_loss: 420.4583 - val_mae: 420.4583
Epoch 87/145
63/63 - 1s - loss: 373.1844 - mae: 373.1844 - val_loss: 420.6867 - val_mae: 420.6867
Epoch 88/145
63/63 - 1s - loss: 373.8559 - mae: 373.8559 - val_loss: 422.5433 - val_mae: 422.5433
Epoch 89/145
63/63 - 1s - loss: 372.9210 - mae: 372.9210 - val_loss: 420.4937 - val_mae: 420.4937
Epoch 90/145
63/63 - 1s - loss: 373.0705 - mae: 373.0705 - val_loss: 427.4324 - val_mae: 427.4324
Epoch 91/145
63/63 - 1s - loss: 373.0430 - mae: 373.0430 - val_loss: 418.8100 - val_mae: 418.8100
Epoch 92/145
63/63 - 1s - loss: 371.2543 - mae: 371.2543 - val_loss: 426.1605 - val_mae: 426.1605
Epoch 93/145
63/63 - 1s - loss: 373.4494 - mae: 373.4494 - val_loss: 431.3881 - val_mae: 431.3881
Epoch 94/145
63/63 -

Epoch 21/145
lr changed to 0.008999999798834323
63/63 - 1s - loss: 446.3428 - mae: 446.3428 - val_loss: 493.9195 - val_mae: 493.9195
Epoch 22/145
63/63 - 1s - loss: 470.1374 - mae: 470.1374 - val_loss: 454.8164 - val_mae: 454.8164
Epoch 23/145
63/63 - 1s - loss: 444.0622 - mae: 444.0622 - val_loss: 471.0299 - val_mae: 471.0299
Epoch 24/145
63/63 - 1s - loss: 439.6766 - mae: 439.6766 - val_loss: 449.0094 - val_mae: 449.0094
Epoch 25/145
63/63 - 1s - loss: 439.9623 - mae: 439.9623 - val_loss: 452.9448 - val_mae: 452.9448
Epoch 26/145
63/63 - 1s - loss: 434.9466 - mae: 434.9466 - val_loss: 451.9504 - val_mae: 451.9504
Epoch 27/145
63/63 - 1s - loss: 438.5275 - mae: 438.5275 - val_loss: 447.3272 - val_mae: 447.3272
Epoch 28/145
63/63 - 1s - loss: 466.4695 - mae: 466.4695 - val_loss: 472.4979 - val_mae: 472.4979
Epoch 29/145
63/63 - 1s - loss: 457.0103 - mae: 457.0103 - val_loss: 467.0770 - val_mae: 467.0770
Epoch 30/145
63/63 - 1s - loss: 433.0109 - mae: 433.0109 - val_loss: 443.8711 - val

Epoch 103/145
63/63 - 1s - loss: 371.7419 - mae: 371.7419 - val_loss: 416.1692 - val_mae: 416.1692
Epoch 104/145
63/63 - 1s - loss: 371.8199 - mae: 371.8199 - val_loss: 414.7339 - val_mae: 414.7339
Epoch 105/145
63/63 - 1s - loss: 370.6423 - mae: 370.6423 - val_loss: 416.0244 - val_mae: 416.0244
Epoch 106/145
63/63 - 1s - loss: 370.9224 - mae: 370.9224 - val_loss: 416.5233 - val_mae: 416.5233
Epoch 107/145
63/63 - 1s - loss: 370.5094 - mae: 370.5094 - val_loss: 415.2392 - val_mae: 415.2392
Epoch 108/145
63/63 - 1s - loss: 370.6679 - mae: 370.6679 - val_loss: 416.3966 - val_mae: 416.3966
Epoch 109/145
63/63 - 1s - loss: 370.8288 - mae: 370.8288 - val_loss: 415.1439 - val_mae: 415.1439
Epoch 110/145
63/63 - 1s - loss: 369.5905 - mae: 369.5905 - val_loss: 415.3889 - val_mae: 415.3889
Epoch 111/145
63/63 - 1s - loss: 369.5031 - mae: 369.5031 - val_loss: 415.9321 - val_mae: 415.9321
Epoch 112/145
63/63 - 1s - loss: 369.2960 - mae: 369.2960 - val_loss: 415.2680 - val_mae: 415.2680
Epoch 113/

Epoch 40/145
63/63 - 1s - loss: 442.1925 - mae: 442.1925 - val_loss: 456.8714 - val_mae: 456.8714
Epoch 41/145
lr changed to 0.005399999767541885
63/63 - 1s - loss: 406.6389 - mae: 406.6389 - val_loss: 446.1329 - val_mae: 446.1329
Epoch 42/145
63/63 - 1s - loss: 403.1006 - mae: 403.1006 - val_loss: 439.0726 - val_mae: 439.0726
Epoch 43/145
63/63 - 1s - loss: 400.3065 - mae: 400.3065 - val_loss: 438.6960 - val_mae: 438.6960
Epoch 44/145
63/63 - 1s - loss: 400.9860 - mae: 400.9860 - val_loss: 442.9516 - val_mae: 442.9516
Epoch 45/145
63/63 - 1s - loss: 416.7345 - mae: 416.7345 - val_loss: 437.0114 - val_mae: 437.0114
Epoch 46/145
63/63 - 1s - loss: 399.0310 - mae: 399.0310 - val_loss: 436.9010 - val_mae: 436.9010
Epoch 47/145
63/63 - 1s - loss: 399.5738 - mae: 399.5738 - val_loss: 440.8683 - val_mae: 440.8683
Epoch 48/145
63/63 - 1s - loss: 399.4825 - mae: 399.4825 - val_loss: 443.8055 - val_mae: 443.8055
Epoch 49/145
63/63 - 1s - loss: 408.4734 - mae: 408.4734 - val_loss: 440.3185 - val

Epoch 122/145
63/63 - 1s - loss: 356.5604 - mae: 356.5604 - val_loss: 427.7741 - val_mae: 427.7741
Epoch 123/145
63/63 - 1s - loss: 356.4935 - mae: 356.4935 - val_loss: 428.4892 - val_mae: 428.4892
Epoch 124/145
63/63 - 1s - loss: 356.4485 - mae: 356.4485 - val_loss: 428.7938 - val_mae: 428.7938
Epoch 125/145
63/63 - 1s - loss: 356.3582 - mae: 356.3582 - val_loss: 428.1001 - val_mae: 428.1001
Epoch 126/145
63/63 - 1s - loss: 356.3052 - mae: 356.3052 - val_loss: 428.5671 - val_mae: 428.5671
Epoch 127/145
63/63 - 1s - loss: 355.4116 - mae: 355.4116 - val_loss: 428.3115 - val_mae: 428.3115
Epoch 128/145
63/63 - 1s - loss: 355.6132 - mae: 355.6132 - val_loss: 427.6711 - val_mae: 427.6711
Epoch 129/145
63/63 - 1s - loss: 355.2769 - mae: 355.2769 - val_loss: 427.9124 - val_mae: 427.9124
Epoch 130/145
63/63 - 1s - loss: 355.5967 - mae: 355.5967 - val_loss: 428.7119 - val_mae: 428.7119
Epoch 131/145
63/63 - 1s - loss: 354.8536 - mae: 354.8536 - val_loss: 428.6156 - val_mae: 428.6156
Epoch 132/

Epoch 59/145
63/63 - 1s - loss: 402.7613 - mae: 402.7613 - val_loss: 426.9366 - val_mae: 426.9366
Epoch 60/145
63/63 - 1s - loss: 401.6692 - mae: 401.6692 - val_loss: 426.2747 - val_mae: 426.2747
Epoch 61/145
lr changed to 0.0032399998046457766
63/63 - 1s - loss: 390.3248 - mae: 390.3248 - val_loss: 422.5131 - val_mae: 422.5131
Epoch 62/145
63/63 - 1s - loss: 391.9995 - mae: 391.9995 - val_loss: 422.8121 - val_mae: 422.8121
Epoch 63/145
63/63 - 1s - loss: 390.2666 - mae: 390.2666 - val_loss: 425.0165 - val_mae: 425.0165
Epoch 64/145
63/63 - 1s - loss: 389.3222 - mae: 389.3222 - val_loss: 424.4386 - val_mae: 424.4386
Epoch 65/145
63/63 - 1s - loss: 392.3930 - mae: 392.3930 - val_loss: 429.6477 - val_mae: 429.6477
Epoch 66/145
63/63 - 1s - loss: 388.9505 - mae: 388.9505 - val_loss: 423.9532 - val_mae: 423.9532
Epoch 67/145
63/63 - 1s - loss: 387.6384 - mae: 387.6384 - val_loss: 421.8427 - val_mae: 421.8427
Epoch 68/145
63/63 - 1s - loss: 389.5551 - mae: 389.5551 - val_loss: 425.5286 - va

Epoch 141/145
lr changed to 0.0004199039773084223
63/63 - 1s - loss: 362.2347 - mae: 362.2347 - val_loss: 417.0142 - val_mae: 417.0142
Epoch 142/145
63/63 - 1s - loss: 362.2676 - mae: 362.2676 - val_loss: 417.0424 - val_mae: 417.0424
Epoch 143/145
63/63 - 1s - loss: 362.4232 - mae: 362.4232 - val_loss: 417.0135 - val_mae: 417.0135
Epoch 144/145
63/63 - 1s - loss: 362.2274 - mae: 362.2274 - val_loss: 418.0527 - val_mae: 418.0527
Epoch 145/145
63/63 - 1s - loss: 361.8947 - mae: 361.8947 - val_loss: 417.3794 - val_mae: 417.3794

val_mae is:417.37943368774415

fold: 4
Epoch 1/145
63/63 - 1s - loss: 2515.1609 - mae: 2515.1609 - val_loss: 888.0394 - val_mae: 888.0394
Epoch 2/145
63/63 - 1s - loss: 796.9133 - mae: 796.9133 - val_loss: 681.8772 - val_mae: 681.8772
Epoch 3/145
63/63 - 1s - loss: 707.0456 - mae: 707.0456 - val_loss: 624.8246 - val_mae: 624.8246
Epoch 4/145
63/63 - 1s - loss: 613.5575 - mae: 613.5575 - val_loss: 650.8071 - val_mae: 650.8071
Epoch 5/145
63/63 - 1s - loss: 595.8278

Epoch 78/145
63/63 - 1s - loss: 384.4962 - mae: 384.4962 - val_loss: 419.4508 - val_mae: 419.4508
Epoch 79/145
63/63 - 1s - loss: 385.3644 - mae: 385.3644 - val_loss: 416.0286 - val_mae: 416.0286
Epoch 80/145
63/63 - 1s - loss: 383.3365 - mae: 383.3365 - val_loss: 415.3479 - val_mae: 415.3479
Epoch 81/145
lr changed to 0.0019439998548477888
63/63 - 1s - loss: 378.7584 - mae: 378.7584 - val_loss: 415.2724 - val_mae: 415.2724
Epoch 82/145
63/63 - 1s - loss: 378.5179 - mae: 378.5179 - val_loss: 414.5436 - val_mae: 414.5436
Epoch 83/145
63/63 - 1s - loss: 378.1747 - mae: 378.1747 - val_loss: 414.3937 - val_mae: 414.3937
Epoch 84/145
63/63 - 1s - loss: 377.1210 - mae: 377.1210 - val_loss: 413.6130 - val_mae: 413.6130
Epoch 85/145
63/63 - 1s - loss: 378.2303 - mae: 378.2303 - val_loss: 417.7773 - val_mae: 417.7773
Epoch 86/145
63/63 - 1s - loss: 376.7776 - mae: 376.7776 - val_loss: 415.0431 - val_mae: 415.0431
Epoch 87/145
63/63 - 1s - loss: 377.0661 - mae: 377.0661 - val_loss: 413.8941 - va

Epoch 15/145
63/63 - 1s - loss: 501.1956 - mae: 501.1956 - val_loss: 466.8242 - val_mae: 466.8242
Epoch 16/145
63/63 - 1s - loss: 507.7339 - mae: 507.7339 - val_loss: 481.3199 - val_mae: 481.3199
Epoch 17/145
63/63 - 1s - loss: 505.5224 - mae: 505.5224 - val_loss: 648.6382 - val_mae: 648.6382
Epoch 18/145
63/63 - 1s - loss: 536.9136 - mae: 536.9136 - val_loss: 457.7835 - val_mae: 457.7835
Epoch 19/145
63/63 - 1s - loss: 480.5285 - mae: 480.5284 - val_loss: 472.1486 - val_mae: 472.1486
Epoch 20/145
63/63 - 1s - loss: 511.1381 - mae: 511.1381 - val_loss: 459.7254 - val_mae: 459.7254
Epoch 21/145
lr changed to 0.008999999798834323
63/63 - 1s - loss: 448.8853 - mae: 448.8853 - val_loss: 441.4634 - val_mae: 441.4634
Epoch 22/145
63/63 - 1s - loss: 448.6555 - mae: 448.6555 - val_loss: 436.9794 - val_mae: 436.9794
Epoch 23/145
63/63 - 1s - loss: 445.1579 - mae: 445.1579 - val_loss: 442.0049 - val_mae: 442.0049
Epoch 24/145
63/63 - 1s - loss: 465.0173 - mae: 465.0173 - val_loss: 432.8260 - val

Epoch 98/145
63/63 - 1s - loss: 372.5159 - mae: 372.5159 - val_loss: 408.2253 - val_mae: 408.2253
Epoch 99/145
63/63 - 1s - loss: 373.3732 - mae: 373.3732 - val_loss: 409.6261 - val_mae: 409.6261
Epoch 100/145
63/63 - 1s - loss: 373.4946 - mae: 373.4946 - val_loss: 409.1254 - val_mae: 409.1254
Epoch 101/145
lr changed to 0.0011663999408483504
63/63 - 1s - loss: 369.5492 - mae: 369.5492 - val_loss: 407.1517 - val_mae: 407.1517
Epoch 102/145
63/63 - 1s - loss: 369.1087 - mae: 369.1087 - val_loss: 408.9312 - val_mae: 408.9312
Epoch 103/145
63/63 - 1s - loss: 368.3390 - mae: 368.3390 - val_loss: 406.6789 - val_mae: 406.6789
Epoch 104/145
63/63 - 1s - loss: 368.4200 - mae: 368.4200 - val_loss: 406.4166 - val_mae: 406.4166
Epoch 105/145
63/63 - 1s - loss: 367.7512 - mae: 367.7512 - val_loss: 406.4414 - val_mae: 406.4414
Epoch 106/145
63/63 - 1s - loss: 367.8327 - mae: 367.8327 - val_loss: 406.5229 - val_mae: 406.5229
Epoch 107/145
63/63 - 1s - loss: 368.7365 - mae: 368.7365 - val_loss: 413.1

416.2007160488721

In [27]:
sub.to_csv('nn_sub_{}_{}.csv'.format('mae', sub['price'].mean()), index=False)