In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from IPython.display import display

import warnings
warnings.filterwarnings('ignore')

import os

In [None]:
# Show what is in the competition input directory before loading anything.
os.listdir('../input/santander-value-prediction-challenge')

In [None]:
# Load the raw train and test tables from the competition input directory.
train_df = pd.read_csv('../input/santander-value-prediction-challenge/train.csv')
test_df = pd.read_csv('../input/santander-value-prediction-challenge/test.csv')

In [None]:
# Check and Remove Constant Features
# A feature whose standard deviation on the training set is exactly zero carries
# no signal; "ID" and "target" are never candidates for removal.
colsToRemove = [
    col
    for col in train_df.columns
    if col not in ("ID", "target") and train_df[col].std() == 0
]

# Drop the same columns from both frames so their feature sets stay aligned.
train_df.drop(columns=colsToRemove, inplace=True)
test_df.drop(columns=colsToRemove, inplace=True)

print('removed cols number: ', len(colsToRemove))

In [None]:
# Remove Duplicate Columns
%%time

# Method 1
def duplicate_columns(frame):
    """Return the labels of columns whose values are repeated by a later column.

    Columns are grouped by dtype and only compared within their group. A column
    is reported when some column positioned after it in the same group holds an
    element-wise identical array (per ``np.array_equal``), so the last member of
    every set of identical columns is not reported.

    Args:
        frame: DataFrame to scan.

    Returns:
        list of column labels that have a later identical column.
    """
    dtype_groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []

    for names in dtype_groups.values():
        block = frame[names]
        labels = block.columns
        # Extract each column's values once; the pairwise scan then only
        # compares ready-made arrays.
        arrays = [block.iloc[:, k].values for k in range(len(labels))]

        for pos, current in enumerate(arrays):
            if any(np.array_equal(current, later) for later in arrays[pos + 1:]):
                dups.append(labels[pos])
    return dups

# Collect the columns that have an identical later column (value-based check).
colsToRemove = duplicate_columns(train_df)
print(colsToRemove)

# Method 2 (not executed): `~train_df.columns.duplicated()` only filters repeated
# column *labels*; it does not detect columns with identical values, so it is not
# a substitute for Method 1 above.
# train_df = train_df.loc[:, ~train_df.columns.duplicated()]



In [None]:
# Remove the duplicated columns from both frames so train and test keep the
# same feature set.
train_df.drop(columns=colsToRemove, inplace=True)
test_df.drop(columns=colsToRemove, inplace=True)
print("Removed dupliacte", len(colsToRemove))

In [None]:
# Drop columns that hold fewer than two unique values in train (helper is named drop_sparse)

def drop_sparse(train, test):
    """Drop, in place, every feature with fewer than two unique values in ``train``.

    The "ID" and "target" columns are never examined. Each column removed from
    ``train`` is removed from ``test`` as well.

    Args:
        train: training DataFrame; modified in place.
        test: test DataFrame; modified in place.

    Returns:
        The same ``(train, test)`` objects that were passed in.
    """
    protected = ["ID", "target"]
    # Snapshot the candidate names first: the frames shrink while we iterate.
    candidates = [name for name in train.columns if name not in protected]
    for name in candidates:
        if np.unique(train[name]).size >= 2:
            continue
        train.drop(columns=name, inplace=True)
        test.drop(columns=name, inplace=True)
    return train, test

In [None]:
%%time
# Run the single-value column filter on both frames and rebind the returned pair.
train_df, test_df = drop_sparse(train_df,test_df)

In [None]:
# Force a garbage-collection pass, then report the shape of each frame.
gc.collect()
print(train_df.shape)
print(test_df.shape)

In [None]:
# Feature matrix: every column except the row identifier and the label.
x_train = train_df.drop(["ID","target"],axis=1)
# The models are fitted on log1p(target); predictions are mapped back with expm1.
y_train = np.log1p(train_df['target'].values)

x_test = test_df.drop(['ID'],axis=1)

# Hold out part of the training rows for early stopping / validation. The model
# cells consume dev_x, dev_y, val_x and val_y, which were previously never defined.
# NOTE(review): the 20% hold-out size and seed 42 are choices made here — adjust as needed.
dev_x, val_x, dev_y, val_y = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [None]:
# LightGBM

def run_lgb(train_x,train_y,val_x,val_y,test_x):
    """Train a LightGBM regressor with early stopping and predict on ``test_x``.

    Args:
        train_x, train_y: features and labels used for fitting.
        val_x, val_y: features and labels used for evaluation / early stopping.
        test_x: features to predict on.

    Returns:
        tuple ``(pred_test_y, model, evals_result)`` where ``pred_test_y`` is
        ``expm1`` of the model output at the best iteration, ``model`` is the
        trained booster and ``evals_result`` is the dict filled with the
        per-iteration metric history.
    """
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.004,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        # LightGBM's key is `bagging_freq`; the former "bagging_frequency" was not a
        # recognised parameter, so bagging (and bagging_fraction) never took effect.
        "bagging_freq" : 6,
        "bagging_seed" : 42,
        "verbose" : -1,
        "seed":42
    }
    lgtrain = lgb.Dataset(train_x,label=train_y)
    lgval = lgb.Dataset(val_x,label=val_y)
    evals_result = {}

    # Early stopping, periodic logging and metric recording are passed as callbacks:
    # the equivalent `train()` keyword arguments were removed in LightGBM 4.
    # `log_evaluation` is the current name of the logging callback; older releases
    # expose it as `print_evaluation`.
    log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation
    callbacks = [
        lgb.early_stopping(100),
        log_evaluation(150),
        lgb.record_evaluation(evals_result),
    ]
    model = lgb.train(params,lgtrain,5000,valid_sets=[lgtrain,lgval],
                      callbacks=callbacks)

    # Undo the log1p transform applied to the training target.
    pred_test_y = np.expm1(model.predict(test_x,num_iteration=model.best_iteration))
    return pred_test_y, model, evals_result

In [None]:
# Fit LightGBM on the dev data, evaluate on the validation data, and predict x_test.
pred_test,model,evals_result = run_lgb(dev_x,dev_y,val_x,val_y,x_test)

In [None]:
# feature importance

gain = model.feature_importance(importance_type='gain')
featureimp = pd.DataFrame({
    "feature": model.feature_name(),
    "split": model.feature_importance(importance_type="split"),
    # each feature's gain expressed as a percentage of the total gain
    "gain": 100*gain/gain.sum(),
})
# Rank by gain share and show the top 50 rows.
featureimp = featureimp.sort_values(by='gain', ascending=False)
print(featureimp.head(50))

In [None]:
# XGB Modeling

def run_xgb(train_x,train_y,val_x,val_y,test_x):
    """Train an XGBoost regressor with early stopping and predict on ``test_x``.

    Args:
        train_x, train_y: features and labels used for fitting.
        val_x, val_y: features and labels used for evaluation / early stopping.
        test_x: features to predict on.

    Returns:
        tuple ``(xgb_pred_y, model_xgb)`` where ``xgb_pred_y`` is ``expm1`` of the
        model output and ``model_xgb`` is the trained booster.
    """
    # NOTE(review): 'reg:linear', 'silent', `ntree_limit` and `best_ntree_limit` are
    # deprecated or removed in newer XGBoost releases — confirm against the installed
    # version before upgrading.
    params = {"objective": 'reg:linear',
             'eval_metric' : 'rmse',
             'eta' : 0.001,
             'max_depth' : 10,
             'subsample' : 0.6,
             'colsample_bytree' : 0.6,
             'alpha' : 0.001,
             # The native `xgb.train` API reads its RNG seed from `seed`; the former
             # key 'randome_state' was a typo, so the intended seed was never applied.
             'seed' : 42,
             'silent' : True}

    tr_data = xgb.DMatrix(train_x,train_y)
    va_data = xgb.DMatrix(val_x,val_y)

    # The last entry of the watchlist is the one early stopping monitors.
    watchlist = [(tr_data,'train'),(va_data,'valid')]

    model_xgb = xgb.train(params,tr_data,2000,watchlist,maximize=False,
                         early_stopping_rounds=100,verbose_eval=100)
    dtest = xgb.DMatrix(test_x)
    # Undo the log1p transform applied to the training target.
    xgb_pred_y = np.expm1(model_xgb.predict(dtest,ntree_limit=model_xgb.best_ntree_limit))

    return xgb_pred_y,model_xgb
    
    

In [None]:
# Fit XGBoost on the dev data, evaluate on the validation data, and predict x_test.
pred_test_xgb,model_xgb = run_xgb(dev_x,dev_y,val_x,val_y,x_test)

In [None]:
# Catboost

# Configure (but do not yet fit) the CatBoost regressor with RMSE as its eval metric.
# NOTE(review): od_type='Iter' with od_wait=20 presumably enables the overfitting
# detector with a 20-iteration patience — confirm against the CatBoost docs.
cb_model = CatBoostRegressor(iterations=500,
                             learning_rate=0.05,
                             depth=10,
                             eval_metric='RMSE',
                             random_seed = 42,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50,
                             od_wait=20)

In [None]:
# Fit CatBoost on the same dev/validation data the LightGBM and XGBoost cells use
# (`dev_x` / `val_x`; the capitalised `dev_X` / `val_X` names do not exist).
cb_model.fit(dev_x, dev_y,
             eval_set=(val_x, val_y),
             use_best_model=True,
             verbose=50)

In [None]:
# Predict on the test features (`x_test`, as defined when the model inputs are built)
# and undo the log1p transform applied to the training target.
pred_test_cat = np.expm1(cb_model.predict(x_test))


In [None]:
# Combine Predictions

# Read the sample submission from the same competition folder as train.csv / test.csv.
sub = pd.read_csv('../input/santander-value-prediction-challenge/sample_submission.csv')

# Wrap each model's predictions in a one-column frame.
sub_lgb = pd.DataFrame({"target": pred_test})
sub_xgb = pd.DataFrame({"target": pred_test_xgb})
sub_cat = pd.DataFrame({"target": pred_test_cat})

# Weighted blend: 50% LightGBM, 30% XGBoost, 20% CatBoost.
sub["target"] = (sub_lgb["target"] * 0.5 + sub_xgb["target"] * 0.3 + sub_cat["target"] * 0.2)

In [None]:
# Preview the blended submission and write it to CSV without the index column.
print(sub.head())
sub.to_csv('sub_lgb_xgb_cat.csv', index=False)