# Libraries

In [1]:
import pandas as pd
import numpy as np

# Constants

In [2]:
SEED = 42

# Load the data

In [3]:
train = pd.read_csv('../input/train.csv')

In [4]:
test = pd.read_csv('../input/test.csv')

# Analysis

In [5]:
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [6]:
test.tail()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
4204,8410,aj,h,as,f,d,aa,j,e,0,...,0,0,0,0,0,0,0,0,0,0
4205,8411,t,aa,ai,d,d,aa,j,y,0,...,0,1,0,0,0,0,0,0,0,0
4206,8413,y,v,as,f,d,aa,d,w,0,...,0,0,0,0,0,0,0,0,0,0
4207,8414,ak,v,as,a,d,aa,c,q,0,...,0,0,1,0,0,0,0,0,0,0
4208,8416,t,aa,ai,c,d,aa,g,r,0,...,1,0,0,0,0,0,0,0,0,0


In [7]:
print('Train shape:', train.shape)
print('Test shape:', test.shape)

Train shape: (4209, 378)
Test shape: (4209, 377)


# Feature encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
for column_name in train.columns[2:]:
    label_encoder = LabelEncoder() 
    
    train_column_values = list(train[column_name].values)
    test_column_values = list(test[column_name].values)
    
    label_encoder.fit(train_column_values + test_column_values)
    
    train[column_name] = label_encoder.transform(train_column_values)
    test[column_name] = label_encoder.transform(test_column_values)

In [10]:
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,37,23,20,0,3,27,9,14,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,37,21,22,4,3,31,11,14,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,24,24,38,2,3,30,9,23,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,24,21,38,5,3,30,11,4,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,24,23,38,5,3,14,3,13,...,0,0,0,0,0,0,0,0,0,0


# Feature engineering

In [11]:
from sklearn.decomposition import PCA, FastICA, LatentDirichletAllocation, RandomizedPCA, DictionaryLearning
from sklearn.decomposition import TruncatedSVD, ProjectedGradientNMF, FactorAnalysis, IncrementalPCA

from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

In [25]:
COMPONENT_NUM = 90

decompositions = [
    TruncatedSVD(n_components=COMPONENT_NUM, random_state=SEED),
    PCA(n_components=COMPONENT_NUM, random_state=SEED),
    FastICA(n_components=COMPONENT_NUM, random_state=SEED),
    FactorAnalysis(n_components=COMPONENT_NUM, random_state=SEED),
    GaussianRandomProjection(n_components=COMPONENT_NUM, random_state=SEED),
    SparseRandomProjection(n_components=COMPONENT_NUM, random_state=SEED)
    
    #LatentDirichletAllocation(n_topics=COMPONENT_NUM, random_state=SEED),
    #DictionaryLearning(n_components=COMPONENT_NUM, random_state=SEED),
    #ProjectedGradientNMF(n_components=COMPONENT_NUM, random_state=SEED),
]

decomp_names = ['tSVD', 'PCA', 'ICA', 'GRP', 'SRP']

train_decomp_features = [decompositor.fit_transform(train.drop(["y"], axis=1)) for decompositor in decompositions]
test_decomp_features = [decompositor.transform(test) for decompositor in decompositions]

In [26]:
decomposition_features = []

for i in range(1, COMPONENT_NUM + 1):
    
    for feature_name, train_decomp_feature, test_decomp_feature in zip(
        decomp_names,
        train_decomp_features,
        test_decomp_features
    ):
    
        enumerated_feature_name = feature_name + '_' + str(i)
        train[enumerated_feature_name] = train_decomp_feature[:,i-1]
        test[enumerated_feature_name] = test_decomp_feature[:,i-1]
        
        decomposition_features.append(enumerated_feature_name)

In [27]:
train.head(2)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,tSVD_89,PCA_89,ICA_89,GRP_89,SRP_89,tSVD_90,PCA_90,ICA_90,GRP_90,SRP_90
0,0,130.81,37,23,20,0,3,27,9,14,...,-0.142793,0.154351,0.026768,1.421823e-08,1.991487,0.102304,0.277541,0.031622,-2.417893e-09,-3.707266
1,6,88.53,37,21,22,4,3,31,11,14,...,-0.052571,-0.277157,0.004606,0.6466059,-0.543723,-0.279242,-0.154363,-0.015954,0.6281084,-3.486997


# Target and features

In [28]:
target = 'y'
features = list(train.columns[2:]) + decomposition_features

# Deduplicate features
features = list(set(features))

In [29]:
print(features)

['ICA_33', 'X222', 'X217', 'X125', 'PCA_57', 'PCA_86', 'X276', 'ICA_12', 'tSVD_49', 'X11', 'X59', 'SRP_10', 'X245', 'PCA_8', 'X180', 'PCA_72', 'PCA_85', 'tSVD_74', 'GRP_30', 'X135', 'X283', 'GRP_74', 'PCA_47', 'PCA_6', 'GRP_16', 'X355', 'X307', 'GRP_62', 'GRP_8', 'SRP_48', 'tSVD_33', 'X66', 'X67', 'tSVD_15', 'X279', 'ICA_70', 'X328', 'X218', 'tSVD_3', 'X36', 'X12', 'X293', 'X266', 'X169', 'X383', 'PCA_2', 'GRP_27', 'X201', 'ICA_17', 'X45', 'SRP_83', 'GRP_2', 'X74', 'X379', 'X48', 'X292', 'X211', 'X356', 'ICA_67', 'X190', 'SRP_13', 'SRP_31', 'ICA_84', 'ICA_77', 'X46', 'ICA_26', 'GRP_46', 'SRP_37', 'X237', 'tSVD_31', 'SRP_42', 'tSVD_88', 'tSVD_90', 'X23', 'GRP_75', 'ICA_44', 'X58', 'X63', 'SRP_70', 'GRP_81', 'X57', 'SRP_17', 'X5', 'PCA_21', 'ICA_39', 'PCA_73', 'X318', 'X168', 'X171', 'X226', 'GRP_14', 'PCA_87', 'X30', 'GRP_6', 'X338', 'PCA_42', 'SRP_9', 'tSVD_18', 'X97', 'PCA_1', 'ICA_8', 'X123', 'X137', 'ICA_58', 'X239', 'SRP_76', 'X69', 'GRP_28', 'X288', 'X86', 'SRP_7', 'X144', 'PCA_74

# Preprocessing

In [30]:
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from multiprocessing import Pool

import keras
from keras import backend as K
K.set_image_dim_ordering('tf')

In [31]:
X_train = train[features]
X_test = test[features]
y_train = train[target]

In [32]:
X_train.head()

Unnamed: 0,ICA_33,X222,X217,X125,PCA_57,PCA_86,X276,ICA_12,tSVD_49,X11,...,GRP_17,PCA_44,X343,GRP_43,PCA_33,X265,PCA_29,SRP_33,ICA_1,SRP_81
0,0.007175,0,0,0,-0.397945,-0.391049,0,0.020489,-0.440961,0,...,0.875104,1.365919,0,-3.90188,-0.235816,0,1.264221,4.304135,0.011538,11.086689
1,0.001327,0,0,0,0.139743,-0.030103,0,0.008402,0.043062,0,...,0.547068,0.294278,0,-0.29891,0.407838,1,0.391321,4.730525,0.010972,12.189403
2,-0.007377,0,0,0,-0.538384,-0.197373,1,-0.021325,0.238689,0,...,-1.141674,-0.119599,0,0.023995,-0.229985,0,-0.264811,5.708314,0.014983,12.261091
3,-0.002885,0,0,0,0.151465,0.010289,1,-0.02464,-0.018217,0,...,-0.683844,-0.144007,0,-0.53896,-0.218439,0,0.37236,4.351914,0.01891,13.069976
4,0.005262,0,0,0,-0.004753,-0.130202,1,-0.03675,-0.287441,0,...,-0.512981,-0.099142,0,-0.193949,-0.230852,0,-0.186548,3.517953,0.019403,9.374473


In [33]:
print(X_train.shape, X_test.shape)

(4209, 826) (4209, 826)


In [34]:
# def square(x):
#     return x**2

# def sqrt(x):
#     return np.sqrt(x) if x > 0 else 0.0

# def log(x):
#     return np.log(x + K.epsilon()) if x > 0 else 0.0

# modifers = {
#     '^2': square,
#     'sqrt': sqrt,
#     'log': log,
# }

In [35]:
# pool = Pool(8)

# for column_name in train_X.columns:
#     for modifer_name, modifer_function in modifers.items():
        
#         features.append(modifer_name)
#         new_feature_name = column_name + '_' + modifer_name
#         train_X[new_feature_name] = pool.map(modifer_function, train_X[column_name])

In [36]:
X_train.tail()

Unnamed: 0,ICA_33,X222,X217,X125,PCA_57,PCA_86,X276,ICA_12,tSVD_49,X11,...,GRP_17,PCA_44,X343,GRP_43,PCA_33,X265,PCA_29,SRP_33,ICA_1,SRP_81
4204,-0.000701,0,0,0,0.287055,-0.437628,0,0.008889,-0.227242,0,...,-0.174765,-0.488023,0,-0.029678,-0.279931,1,0.560058,598.843427,-0.009369,-538.585701
4205,-0.007796,0,0,0,-0.000205,0.117282,0,-0.014185,0.361688,0,...,4.450994,-0.33137,1,-1.310123,0.073278,0,-0.606889,597.918132,0.003342,-533.430236
4206,0.002755,0,0,0,0.116755,0.06444,0,-0.015071,-0.114067,0,...,0.233297,0.023843,0,-0.1363,0.312694,1,-0.241773,598.261778,0.008796,-535.216936
4207,0.005912,0,0,0,0.080635,0.010128,0,-0.009135,0.569498,0,...,-0.402807,0.317965,0,0.255983,-0.12173,1,-0.948929,599.41559,0.012908,-537.601747
4208,-0.003859,0,0,0,-0.030962,0.1984,0,0.01268,0.05514,0,...,0.091159,-0.186822,0,0.131161,-0.062533,1,0.118679,600.789191,0.007152,-536.563154


In [37]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train.as_matrix(), y_train.as_matrix(), test_size=0.1, random_state=SEED
)

In [None]:
# scaler = StandardScaler()

# train_X = np.nan_to_num(train_X)
# val_X = np.nan_to_num(val_X)
# test_X = np.nan_to_num(test_X)

# train_X = scaler.fit_transform(train_X)
# val_X = scaler.transform(val_X)
# test_X = scaler.transform(test_X)

# Model

In [38]:
from sklearn.metrics import log_loss, auc, f1_score, r2_score, mean_squared_error

from keras.layers.advanced_activations import PReLU
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU

from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras.layers import Merge, Conv1D, merge

from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense, BatchNormalization, TimeDistributed, Input
from keras.layers import MaxPooling1D, Lambda, Convolution1D, Flatten, SpatialDropout1D
from keras_tqdm import TQDMNotebookCallback
from keras.layers.merge import Concatenate

from keras.optimizers import Adam, RMSprop, Adamax, Adagrad, Nadam, SGD
from keras.activations import elu, relu, tanh, sigmoid
from keras.constraints import maxnorm

from keras.models import load_model, Model
from keras.wrappers.scikit_learn import KerasRegressor

from xgboost import XGBRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR

In [39]:
def R2(y_true, y_pred):
    SS_res =  K.sum(K.square( y_true - y_pred )) 
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) ) 
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

In [43]:
model = XGBRegressor(
    n_estimators=1190,
    learning_rate=0.005,
    max_depth=2,
    colsample_bytree=0.7,
    reg_alpha=0.1,
    reg_lambda=1.5,
    gamma=0.65,
    seed=SEED,
)

model.fit(X_train, y_train)

y_pred = model.predict(X_val)
print('R2:', r2_score(y_val, y_pred))
print('MSE:', mean_squared_error(y_val, y_pred))

R2: -0.021175544239
MSE: 146.610204844


In [None]:
# N_comp = 90
# model = XGBRegressor(
#     n_estimators=1190,
#     learning_rate=0.005,
#     max_depth=2,
#     colsample_bytree=0.7,
#     reg_alpha=0.1,
#     reg_lambda=1.5,
#     gamma=0.65,
#     seed=SEED,
# )
# R2: 0.631073454312
# MSE: 52.96679571

# model = XGBRegressor(
#     n_estimators=1190,
#     learning_rate=0.005,
#     max_depth=2,
#     colsample_bytree=0.7,
#     reg_alpha=0.1,
#     gamma=0.65,
#     seed=SEED,
# )
# R2: 0.630414534992
# MSE: 53.0613967774

# model = XGBRegressor(
#     n_estimators=1190,
#     learning_rate=0.005,
#     max_depth=2,
#     colsample_bytree=0.7,
#     gamma=0.65,
#     seed=SEED,
# )
# R2: 0.630217910662
# MSE: 53.0896261387

# model = XGBRegressor(
#     n_estimators=1150,
#     learning_rate=0.005,
#     max_depth=2,
#     colsample_bytree=0.7,
#     gamma=0.65,
#     seed=SEED,
# )
# R2: 0.630181562924
# MSE: 53.0948445846

# model = XGBRegressor(
#     n_estimators=1100,
#     learning_rate=0.005,
#     max_depth=2,
#     colsample_bytree=0.7,
#     gamma=0.5,
#     seed=SEED,
# )
# R2: 0.629985455549
# MSE: 53.1229997266

# model = XGBRegressor(
#     n_estimators=1100,
#     learning_rate=0.005,
#     max_depth=2,
#     subsample=0.975,
#     colsample_bytree=0.7,
#     gamma=0.5,
#     seed=SEED,
# )
# R2: 0.629480574972
# MSE: 53.1954854468

# model = XGBRegressor(
#     n_estimators=1100,
#     learning_rate=0.005,
#     max_depth=2,
#     subsample=0.95,
#     colsample_bytree=0.7,
#     gamma=0.5,
#     seed=SEED,
# )
# R2: 0.628850795778
# MSE: 53.2859028654


# model = XGBRegressor(
#     n_estimators=1100,
#     learning_rate=0.005,
#     max_depth=2,
#     subsample=0.85,
#     colsample_bytree=0.7,
#     gamma=0.5,
#     seed=SEED,
# )
# R2: 0.627534740771
# MSE: 53.4748489239

# model = XGBRegressor(
#     n_estimators=1100,
#     learning_rate=0.005,
#     max_depth=2,
#     subsample=0.75,
#     colsample_bytree=0.7,
#     gamma=0.5,
#     seed=SEED,
# )
# R2: 0.625560417155
# MSE: 53.7583026272


# model = XGBRegressor(
#     n_estimators=1100,
#     learning_rate=0.005,
#     max_depth=2,
#     subsample=0.7,
#     colsample_bytree=0.7,
#     gamma=0.5,
#     seed=SEED,
# )
# R2: 0.624128065094
# MSE: 53.9639454575

# model = XGBRegressor(
#     n_estimators=1100,
#     learning_rate=0.005,
#     max_depth=2,
#     subsample=0.65,
#     colsample_bytree=0.7,
#     seed=SEED,
# )
# R2: 0.623189320208
# MSE: 54.0987210901


# model = XGBRegressor(
#     n_estimators=1300,
#     learning_rate=0.005,
#     max_depth=2,
#     subsample=0.65,
#     colsample_bytree=0.7,
#     seed=SEED,
# )
# R2: 0.622057808729
# MSE: 54.2611722286

# model = XGBRegressor(
#     n_estimators=1300,
#     learning_rate=0.005,
#     max_depth=2,
#     subsample=0.65,
#     seed=SEED,
# )
# R2: 0.6222671485
# MSE: 54.2311173111

# model = XGBRegressor(
#     n_estimators=1500,
#     learning_rate=0.005,
#     max_depth=4,
#     subsample=0.6
# )
# R2: 0.598039995632
# MSE: 57.7094104065


# model = XGBRegressor(
#     n_estimators=1300,
#     learning_rate=0.005,
#     max_depth=4,
#     subsample=0.6
# )

# R2: 0.602747796317
# MSE: 57.0335113148

# model = XGBRegressor(
#     n_estimators=1300,
#     learning_rate=0.005,
#     max_depth=4,
#     subsample=0.65,
#     seed=SEED,
# )
# R2: 0.60273292628
# MSE: 57.0356462066

In [None]:
model = Sequential()

model.add(Dense(1024, input_dim=X_train.shape[1]))
model.add(Activation(elu))
model.add(BatchNormalization())
model.add(Dropout(0.35))

model.add(Dense(512))
model.add(Activation(elu))
model.add(BatchNormalization())
model.add(Dropout(0.35))

model.add(Dense(512))
model.add(Activation(elu))
model.add(BatchNormalization())
model.add(Dropout(0.4))

model.add(Dense(256))
model.add(Activation(elu))
model.add(BatchNormalization())
model.add(Dropout(0.45))

model.add(Dense(128))
model.add(Activation(elu))
model.add(BatchNormalization())

model.add(Dense(1, activation='linear'))   

In [None]:
model.compile(
    loss='mean_squared_error',
    optimizer=Adam(0.001, clipnorm=2.5),
    metrics=[R2]
)

model.fit(
    X_train, y_train,
    batch_size=512,
    epochs=18,
    validation_data = (X_val, y_val),
    callbacks=[TQDMNotebookCallback()]
)

In [None]:
# model = Sequential()

# model.add(Dense(512, input_dim=train_X.shape[1]))
# model.add(Activation(elu))
# model.add(BatchNormalization())

# dropout_rate = 0.001
# neuron_decay = 0.99
# base_neurons = 300

# for i in range(50):
#     base_neurons = int(base_neurons * neuron_decay) 
    
#     model.add(Dense(base_neurons))
#     model.add(Activation(elu))
#     model.add(BatchNormalization())
#     model.add(Dropout(dropout_rate * i))
    
#     print('Neurons:', base_neurons, ' dropout:', dropout_rate)
    
    
    
# model.add(Dense(1))

# Submission

In [None]:
try:
    X_test = X_test.as_matrix()
except Exception:
    pass

In [23]:
predictions = model.predict(X_test).ravel()

In [24]:
model_name = 'XGB_MSE_52_966_R2_0_631.h5'
#model.save('../models/' + model_name)

output = pd.DataFrame({'id': test['ID'].copy().values, 'y': predictions})
output.to_csv('../submissions/submission_' + model_name + '.csv', index=False)