In [0]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
from tqdm import tqdm_notebook

# Notebook-wide constants: plot size tuple and the base random seed.
FIGSIZE = (20, 10)
SEED = 17

In [0]:
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import cross_val_score, train_test_split, KFold, StratifiedKFold

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, make_scorer


from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# pool of regressors
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# nn
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.optimizers import Adam
from keras import callbacks


from sklearn.base import clone

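The solution is a stacking ensemble whose base models are:
- 7 neural-network learners: one network trained on the full feature set and six `ModelSub` wrappers configured with different column-sampling rates (0.1–0.4)
- 4 gradient-boosted tree models (1 XGBoost, 3 LightGBM).

Ridge regression is used as the meta-estimator.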

### Split the data

In [0]:
# Name of the column that is separated from the features and used as the label.
target = "Energy_consumption"

In [0]:
# Read the raw train / test tables.
train_initial = pd.read_csv('../data/train.csv')
test_initial = pd.read_csv('../data/test.csv')

# Feature frames: 'Id' is dropped from both tables, the label only from train.
X = train_initial.drop(['Id', target], axis=1)
X_test = test_initial.drop('Id', axis=1)

# Label as a column vector of shape (n, 1).
y = train_initial[target].values.reshape(-1, 1)

# Shuffled 75/25 hold-out split, reproducible through SEED.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=.25,
    shuffle=True,
    random_state=SEED,
)

In [0]:
def _encode_categoricals(frame, columns, encoder, fit=False):
    """
    One-hot encode `columns` of `frame` and append them to the other columns.

    frame -- DataFrame holding both categorical and non-categorical columns
    columns -- labels of the categorical columns to encode
    encoder -- OneHotEncoder; fitted here when `fit` is True, otherwise it
               must already be fitted
    fit -- whether to fit the encoder on `frame` before transforming

    Returns (combined, encoded): `encoded` is the dense one-hot DataFrame and
    `combined` is the remaining columns (index reset) followed by `encoded`.
    """
    raw = frame[columns]
    sparse = encoder.fit_transform(raw) if fit else encoder.transform(raw)
    encoded = pd.DataFrame(sparse.toarray())
    # Reset the index so the positional concat lines up with `encoded`,
    # which always carries a fresh RangeIndex.
    remaining = frame.drop(columns, axis=1).reset_index(drop=True)
    return pd.concat([remaining, encoded], axis=1), encoded


# --- hold-out split: encoder fitted on the training part only ---
catcols = X_train.select_dtypes(include='object').columns
# handle_unknown='ignore': a category that only occurs in the validation/test
# rows is encoded as all zeros instead of raising at transform time.
onehot = OneHotEncoder(handle_unknown='ignore')

X_train, train_trans = _encode_categoricals(X_train, catcols, onehot, fit=True)
X_valid, test_trans = _encode_categoricals(X_valid, catcols, onehot)


# --- full training set + test set: encoder refitted on all training rows ---
catcols = X.select_dtypes(include='object').columns
onehot = OneHotEncoder(handle_unknown='ignore')

X_train_full, train_trans = _encode_categoricals(X, catcols, onehot, fit=True)
X_test, test_trans = _encode_categoricals(X_test, catcols, onehot)

In [0]:
# Standardise the features: the scaler statistics are learned on the full
# training matrix and reused unchanged for the test matrix.
scaler = StandardScaler()
scaler.fit(X_train_full.values)
X_array = scaler.transform(X_train_full.values)
X_test_array = scaler.transform(X_test.values)

### Utils

In [0]:
def prediction_cluster_folds(train_X, train_y, test_X, model,
                             n_clusters=4, n_splits=5, type_model=1,
                             seed=SEED):
    """
    Cross-validated prediction with folds stratified by target clusters.

    The target is clustered with KMeans and the cluster labels serve as the
    strata of a shuffled StratifiedKFold.

        train_X -- ndarray
        train_y -- ndarray with (n, 1) shape
        test_X -- ndarray scored by every fold's model
        model -- estimator; with type_model == 1 it is cloned per fold and
                 fitted as fit(X, y), otherwise the same object is refitted
                 as fit(X, y, X_val, y_val)
        n_clusters -- number of KMeans clusters built on the target
        n_splits -- number of folds
        type_model -- selects the fitting protocol described above
        seed -- random state of the fold splitter

    Returns (predictions_holdout, scores, predictions_val): fold-averaged
    predictions for test_X with shape (m, 1), the per-fold MSE list, and the
    out-of-fold predictions with shape (n, 1).
    """
    # Note: the clustering is seeded with the module-level SEED, not `seed`.
    strata = KMeans(n_clusters=n_clusters, random_state=SEED).fit(train_y).predict(train_y)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    out_of_fold = np.zeros((train_X.shape[0], 1))
    holdout_total = np.zeros((test_X.shape[0], 1))
    fold_mse = []

    for fit_idx, held_idx in splitter.split(train_X, strata):
        fit_X, fit_y = train_X[fit_idx], np.squeeze(train_y[fit_idx])
        held_X, held_y = train_X[held_idx], np.squeeze(train_y[held_idx])

        if type_model == 1:
            estimator = clone(model)
            fitted = estimator.fit(fit_X, fit_y)
        else:
            estimator = model
            fitted = estimator.fit(fit_X, fit_y, held_X, held_y)

        held_pred = fitted.predict(held_X).reshape(-1, 1)
        test_pred = estimator.predict(test_X).reshape(-1, 1)

        out_of_fold[held_idx] = held_pred
        fold_mse.append(mean_squared_error(held_y, held_pred))
        holdout_total += test_pred

    holdout_total /= splitter.n_splits
    return holdout_total, fold_mse, out_of_fold

In [0]:
def prediction_cluster_folds_ensemble(train_X, train_y, test_X, models, 
                                      n_clusters=4,n_splits=5,
                                     seed=SEED):
    """
    Cross-validated prediction of a weighted average of several models.

    Folds are stratified by KMeans clusters of the target. Inside each fold
    every model is fitted and its predictions are combined as a weighted
    average using the per-model weights.

        train_X -- ndarray
        train_y -- ndarray with (n, 1) shape
        test_X -- ndarray scored by every fold's models
        models -- re-iterable of (type_model, alpha, model) triples;
                  type_model == 1 models are cloned per fold and fitted as
                  fit(X, y), others are refitted in place as
                  fit(X, y, X_val, y_val); alpha is the averaging weight
        n_clusters -- number of KMeans clusters built on the target
        n_splits -- number of folds
        seed -- random state of the fold splitter

    Returns (predictions_holdout, scores, predictions_val): fold-averaged
    ensemble predictions for test_X with shape (m, 1), the per-fold MSE list,
    and the out-of-fold ensemble predictions with shape (n, 1).

    Raises ValueError when the weights of `models` sum to zero.
    """
    # Note: the clustering is seeded with the module-level SEED, not `seed`.
    kmeans = KMeans(n_clusters=n_clusters, random_state=SEED)
    predicted_target = kmeans.fit(train_y).predict(train_y)
    
    predictions_holdout = np.zeros((test_X.shape[0], 1))
    predictions_val = np.zeros((train_X.shape[0], 1))

    scores = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for (tr_ind, val_ind) in tqdm_notebook(skf.split(train_X, predicted_target), 
                                           total=skf.n_splits):
        X_train_fold = train_X[tr_ind]
        y_train_fold = np.squeeze(train_y[tr_ind])

        X_valid_fold = train_X[val_ind]
        y_valid_fold = np.squeeze(train_y[val_ind])
        
        # weighted-average accumulators for this fold
        pred_test_inline = np.zeros((test_X.shape[0], 1))
        pred_val_inline = np.zeros((X_valid_fold.shape[0], 1))
        weight_sum = 0
        for (type_model, alpha, model) in models:
            if type_model == 1:
                model_fold = clone(model)
                fitted = model_fold.fit(X_train_fold, y_train_fold)
            else:
                model_fold = model
                fitted = model_fold.fit(X_train_fold, y_train_fold,
                                        X_valid_fold, y_valid_fold)

            # Force column vectors in both branches: a 1-D prediction cannot
            # be accumulated in place into the (n, 1) buffers.
            pred_val_inline += alpha * np.reshape(fitted.predict(X_valid_fold), (-1, 1))
            pred_test_inline += alpha * np.reshape(model_fold.predict(test_X), (-1, 1))
            weight_sum += alpha

        if weight_sum == 0:
            raise ValueError("`models` must contain at least one model and "
                             "its weights must not sum to zero")

        pred_test_inline /= weight_sum
        pred_val_inline /= weight_sum

        predictions_val[val_ind] = pred_val_inline
        
        scores.append(mean_squared_error(y_valid_fold, pred_val_inline))
        predictions_holdout += pred_test_inline
        
    
    predictions_holdout /= skf.n_splits
    return predictions_holdout, scores, predictions_val

In [0]:
class NN:
    """
    Small fully connected Keras regressor with a fit/predict interface.

    Unlike scikit-learn estimators, `fit` also takes validation data, which
    is monitored by the early-stopping and learning-rate callbacks.
    """

    def __init__(self, input_shape, epochs, batch_size, dropout=.2):
        """Store the hyper-parameters; the network itself is built in `fit`."""
        self.dropout = dropout
        self.batch_size = batch_size
        self.epochs = epochs
        self.input_shape = input_shape

    def _init_model(self):
        """Build and compile a fresh 256 -> 128 -> 1 network with MSE loss."""
        inputs = Input(shape=self.input_shape)
        hidden = Dense(256, activation='relu')(inputs)
        hidden = Dropout(self.dropout)(hidden)
        hidden = Dense(128, activation='relu')(hidden)
        # the second dropout layer uses half of the configured rate
        hidden = Dropout(.5 * self.dropout)(hidden)
        output = Dense(1)(hidden)

        network = Model(inputs=inputs, outputs=output)
        network.compile(optimizer=Adam(lr=1e-3), loss=['mse'])
        return network

    def fit(self, train_X, train_y, val_X, val_y):
        """
        Train a newly built network, replacing any previously trained one.

        val_X / val_y are passed as validation data; `val_loss` drives early
        stopping (best weights restored) and the learning-rate reduction.
        Returns self.
        """
        self.model = self._init_model()

        early_stop = callbacks.EarlyStopping(
            monitor='val_loss', min_delta=0.0001, patience=50, verbose=0,
            mode='auto', restore_best_weights=True)
        reduce_lr = callbacks.ReduceLROnPlateau(
            monitor='val_loss', factor=0.5, patience=10, min_lr=1e-6,
            mode='auto', verbose=0)

        self.model.fit(
            train_X, train_y,
            validation_data=(val_X, val_y),
            epochs=self.epochs,
            batch_size=self.batch_size,
            callbacks=[early_stop, reduce_lr],
            verbose=False,
        )
        return self

    def predict(self, test_X):
        """Return the network output for test_X as an (n, 1) array."""
        flat = self.model.predict(test_X).flatten()
        return flat.reshape(-1, 1)

In [0]:
def meta_transformer(train_X, train_y, test_X, 
                       models, n_clusters=4,n_splits=5, seed=SEED):
    """
    Build stacking meta-features: one column of predictions per model.

    Folds are stratified by KMeans clusters of the target. For every model
    the training column holds out-of-fold predictions and the test column
    holds the predictions for test_X averaged over the folds.

        train_X -- ndarray
        train_y -- ndarray with (n, 1) shape
        test_X -- ndarray
        models -- re-iterable sequence of (type_model, model) pairs;
                  type_model == 1 models are cloned per fold and fitted as
                  fit(X, y), type_model == 0 models are refitted in place as
                  fit(X, y, X_val, y_val)
        n_clusters -- number of KMeans clusters built on the target
        n_splits -- number of folds
        seed -- random state of the fold splitter

    Returns (meta_matrix_train, meta_matrix_test) with shapes
    (n_train, n_models) and (n_test, n_models).

    Raises ValueError if a type_model other than 0 or 1 is supplied.
    """
    # Fail before any training: an unrecognised type would otherwise leave
    # an all-zero meta-feature column without any warning.
    unknown_types = [type_model for (type_model, _) in models
                     if type_model not in (0, 1)]
    if unknown_types:
        raise ValueError("type_model must be 0 or 1, got: %r" % (unknown_types,))

    # Note: the clustering is seeded with the module-level SEED, not `seed`.
    kmeans = KMeans(n_clusters=n_clusters, random_state=SEED)
    predicted_target = kmeans.fit(train_y).predict(train_y)
    
    num_models = len(models)
    meta_matrix_train = np.zeros((train_X.shape[0], num_models))
    meta_matrix_test = np.zeros((test_X.shape[0], num_models))

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for (tr_ind, val_ind) in tqdm_notebook(skf.split(train_X, predicted_target), 
                                           total=skf.n_splits):
        X_train_fold = train_X[tr_ind]
        y_train_fold = np.squeeze(train_y[tr_ind])

        X_valid_fold = train_X[val_ind]
        y_valid_fold = np.squeeze(train_y[val_ind])
        
        for i,(type_model, model) in enumerate(models):
            if type_model == 1:
                model_fold = clone(model)
                fitted = model_fold.fit(X_train_fold, y_train_fold)
            else:
                model_fold = model
                fitted = model_fold.fit(X_train_fold, y_train_fold,
                                        X_valid_fold, y_valid_fold)

            meta_matrix_train[val_ind, i] = np.ravel(fitted.predict(X_valid_fold))
            # Accumulate over folds; assigning here would keep only the
            # prediction of the last fold's model.
            meta_matrix_test[:, i] += np.ravel(model_fold.predict(test_X))

    meta_matrix_test /= skf.n_splits
    return meta_matrix_train, meta_matrix_test

def meta_transform_mean(train_X, train_y, test_X, models,
                        num_iterations=10,
                        n_clusters=4,n_splits=7, seed=SEED):
    """
    Average the meta-features of `meta_transformer` over several fold seeds.

    `meta_transformer` is run `num_iterations` times with splitter seeds
    seed, seed + 1, ... and the resulting matrices are averaged.

        train_X -- ndarray
        train_y -- ndarray with (n, 1) shape
        test_X -- ndarray
        models -- sequence of (type_model, model) pairs, passed through
        num_iterations -- number of repetitions (must be >= 1)
        n_clusters, n_splits -- forwarded to `meta_transformer`
        seed -- base seed of the fold splitter

    Returns (train_meta, test_meta) with shapes (n_train, n_models) and
    (n_test, n_models).

    Raises ValueError if num_iterations is smaller than 1.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    train_meta = np.zeros((train_X.shape[0], len(models)))
    test_meta = np.zeros((test_X.shape[0], len(models)))
    
    for t in range(num_iterations):
        # Use the arguments of this call rather than notebook-level globals.
        train_meta_curr, test_meta_curr = meta_transformer(train_X, train_y, test_X,
                                                           models, n_clusters=n_clusters,
                                                           n_splits=n_splits,
                                                           seed=seed+t)
        train_meta += train_meta_curr
        test_meta += test_meta_curr
        
    train_meta /= num_iterations
    test_meta /= num_iterations
    
    return train_meta, test_meta

### XGBoost / LightGBM regressors + NN

In [0]:
alpha = 0.5

# Base models as (type_model, estimator) pairs:
#   1 -> estimator fitted as fit(X, y) and cloned per fold
#   0 -> NN wrapper, whose fit also takes the validation fold
zoo_models = (
    [
        (1, XGBRegressor(n_estimators=100, colsample_bytree=.3,
                         random_state=SEED, n_jobs=-1)),
    ]
    + [
        # three LightGBM models differing in column sampling and seed
        (1, LGBMRegressor(n_estimators=100, colsample_bytree=fraction,
                          n_jobs=-1, random_state=SEED + offset))
        for offset, fraction in enumerate((.3, .4, .5), start=1)
    ]
    + [
        (0, NN(input_shape=(X_array.shape[1],), epochs=500,
               batch_size=256, dropout=.2)),
    ]
)

In [0]:
%%time
# Meta-feature matrices of the model zoo, averaged over 5 fold seeds.
train_meta_lgb, test_meta_lgb = meta_transform_mean(
    X_array, y, X_test_array, zoo_models,
    num_iterations=5,
)

### NNs on different subspaces

In [0]:
class ModelSub(object):
    """
    Averages cross-validated predictions of one estimator over several
    random column subsets.

    estimator -- model handed to `prediction_cluster_folds`
    colsample -- fraction of the columns drawn in each iteration
    num_iterations -- number of random subsets to average over
    seed -- base seed for the column sampling and the fold splitter
    type_model -- fitting protocol flag forwarded to
                  `prediction_cluster_folds`
    """

    def __init__(self, estimator, colsample=1., num_iterations=5, seed=17, type_model=1):
        self.colsample = colsample
        self.num_iterations = num_iterations
        self.seed = seed
        self.model = estimator
        self.type_model = type_model

    def _sample_columns(self, num_columns, iteration):
        """Return the distinct column indices used in the given iteration."""
        # Keep at least one column and never ask for more than exist.
        num_features = min(num_columns, max(1, int(self.colsample * num_columns)))
        np.random.seed(self.seed + iteration)
        # replace=False: a column subset must not contain duplicated features.
        return np.random.choice(np.arange(num_columns), num_features, replace=False)

    def _estimator_for(self, num_features):
        """
        Return the estimator to train on `num_features` columns.

        Estimators exposing an `input_shape` attribute (the NN wrapper) are
        shallow-copied with the shape set to the subset width, so the shared
        original is left untouched. Other estimators are returned as is.
        """
        import copy

        if not hasattr(self.model, 'input_shape'):
            return self.model
        resized = copy.copy(self.model)
        resized.input_shape = (num_features,)
        return resized

    def fit_predict(self, train_X, train_y, test_X):
        """
        Return (pred_train, pred_test), both of shape (n, 1): out-of-fold
        training predictions and test predictions, averaged over the
        `num_iterations` column subsets.
        """
        pred_train = np.zeros((train_X.shape[0], 1))
        pred_test = np.zeros((test_X.shape[0], 1))

        for i in range(self.num_iterations):
            # select subset
            columns = self._sample_columns(train_X.shape[1], i)
            train_fold = np.copy(train_X[:, columns])
            test_fold = np.copy(test_X[:, columns])

            # prediction on folds, using the column subset (not the full matrices)
            y_pred_test, _, y_pred_train = prediction_cluster_folds(
                train_fold, train_y, test_fold,
                self._estimator_for(len(columns)),
                n_splits=7,
                type_model=self.type_model,
                seed=10 * self.seed + i)

            assert(y_pred_train.shape == train_y.shape)
            pred_train += y_pred_train
            pred_test += y_pred_test

        pred_train /= self.num_iterations
        pred_test /= self.num_iterations

        return pred_train, pred_test
      

In [0]:
def subsets_transformer(train_X, train_y, test_X, models, seed=SEED):
    """
    Collect the predictions of several `fit_predict` models column by column.

        train_X -- ndarray
        train_y -- ndarray passed through to each model
        test_X -- ndarray
        models -- sequence of objects exposing
                  fit_predict(train_X, train_y, test_X) -> (train_pred, test_pred)
        seed -- accepted for interface symmetry; not used by this function

    Returns (pred_train_mat, pred_test_mat) with one column per model.
    """
    n_models = len(models)
    pred_train_mat = np.zeros((train_X.shape[0], n_models))
    pred_test_mat = np.zeros((test_X.shape[0], n_models))

    progress = tqdm_notebook(enumerate(models), total=n_models)
    for column, sub_model in progress:
        train_pred, test_pred = sub_model.fit_predict(train_X, train_y, test_X)

        pred_train_mat[:, column] = np.squeeze(train_pred)
        pred_test_mat[:, column] = np.squeeze(test_pred)

    return pred_train_mat, pred_test_mat

In [0]:
nn = NN(input_shape=(X_array.shape[1],), epochs=500, batch_size=256)

# One ModelSub per column-sampling rate; all wrap the same NN object and
# get consecutive seeds SEED, SEED + 1, ...
zoo_modelsubs = [
    ModelSub(nn, colsample=fraction, num_iterations=3, type_model=0,
             seed=SEED + offset)
    for offset, fraction in enumerate((.1, .15, .2, .25, .3, .4))
]

In [0]:
%%time
# Meta-feature matrices produced by the ModelSub wrappers (one column each).
train_meta_nns, test_meta_nns = subsets_transformer(
    X_array, y, X_test_array, zoo_modelsubs,
    seed=17,
)

### Stacking

In [0]:
# Both meta-feature sets are NumPy arrays (pd.concat only accepts pandas
# objects), so stack their columns side by side with NumPy.
train_meta_full = np.hstack([np.asarray(train_meta_lgb), np.asarray(train_meta_nns)])
test_meta_full = np.hstack([np.asarray(test_meta_lgb), np.asarray(test_meta_nns)])

In [0]:
# Ridge meta-estimator evaluated with 7 cluster-stratified folds on the
# stacked meta-features; the cell displays the mean fold MSE.
ridge = Ridge(alpha=10, random_state=SEED)
y_pred, scores, _ = prediction_cluster_folds(
    train_meta_full, y, test_meta_full, ridge,
    n_clusters=4,
    n_splits=7,
)
np.mean(scores)

In [0]:
# Submission table: test ids next to the stacked predictions.
submission = pd.DataFrame(test_initial['Id'], columns=['Id']).assign(**{target: y_pred})
submission.to_csv(
    '../data/submissions/xgb_lgb_nn_full_nn_rnd_subs_stacking.csv',
    index=False,
)