<h1><center><font size="5">Santander LGBM Baseline Prediction with feature elimination</font></center></h1>


# Load Packages and Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from numba import jit
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import os
import warnings
# Silence every warning for the rest of the session; note that this also
# hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')
# Show what is available under the input directory.
print(os.listdir("../input"))

In [None]:
# Read the train and test CSVs, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/santander-fe-train-and-test/fe_train.csv',
        '../input/santander-fe-train-and-test/fe_test.csv',
    )
)
# Keep the label column under its own name for the modelling cells below.
target = train['target']

In [None]:
# Report the (rows, columns) of both frames, test first.
print(
    "\n".join(
        "{} {}".format(tag, frame.shape)
        for tag, frame in (("Test ", test), ("Train ", train))
    )
)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index that was written
# into the CSVs - confirm against the data source).
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError as `del frame[col]` did.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')
test = test.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Model inputs: every column except the row identifier and the label.
features = [
    name for name in train.columns
    if name != 'ID_code' and name != 'target'
]

In [None]:
# Number of candidate feature columns (all columns except 'ID_code' and 'target').
len(features)

# Feature Elimination

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
rfc=RandomForestClassifier(n_estimators=50, max_depth=2, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
%time
rfc.fit(train[features].values, target)

In [None]:
# One importance value per column the forest was fitted on; should equal len(features).
len(rfc.feature_importances_)

In [None]:
# 230 least important features.
# Build the frame from a dict so 'Importance' stays a float column: stacking
# names and importances into one NumPy array turns every value into a string,
# and sorting those strings is lexicographic ('1e-05' would sort after '0.9').
# mergesort is stable, so features with equal importance keep a deterministic order.
least_imp_230_features_df = (
    pd.DataFrame({
        'Feature Name': features,
        'Importance': rfc.feature_importances_,
    })
    .sort_values('Importance', ascending=False, kind='mergesort')
    .tail(230)
)

In [None]:
# Drop all low-importance columns in one call; dropping them one at a time in
# a loop copies the whole frame once per column.
# errors='ignore' mirrors the original membership test: names that are not
# present in the frame are simply skipped.
train = train.drop(
    columns=least_imp_230_features_df['Feature Name'].tolist(),
    errors='ignore',
)

In [None]:
# Reducing size of data

def reduce_mem_usage(props, verbose=True):
    """Downcast the numeric columns of ``props`` in place to smaller dtypes.

    Columns whose values are all whole numbers are stored in the smallest
    integer dtype that holds their range (unsigned when the minimum is
    non-negative); every other numeric column becomes float32. Non-numeric
    columns are left untouched.

    Missing values are replaced by ``column minimum - 1`` before the integer
    check, because plain integer dtypes cannot hold NaN.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    verbose : bool, default True
        Print the dtype of every processed column before and after the cast.

    Returns
    -------
    (pandas.DataFrame, list)
        The same frame, and the names of the columns whose missing values
        were filled.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in.
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in props.columns:
        values = props[col]
        # Only numeric columns can be downcast; strings, dates, categories, ...
        # are skipped instead of crashing in the arithmetic below.
        if not pd.api.types.is_numeric_dtype(values.dtype):
            continue

        if verbose:
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",values.dtype)

        mx = values.max()
        mn = values.min()

        # Integer does not support NA, therefore, NA needs to be filled.
        # An all-NaN column has no minimum to derive a fill value from and is
        # left as it is (it ends up as float32 below).
        if values.isna().any() and pd.notna(mn):
            NAlist.append(col)
            fill_value = mn - 1
            # Work on a new Series and assign it back at the end: calling
            # fillna(inplace=True) on props[col] is chained assignment and is
            # not guaranteed to modify the frame.
            values = values.fillna(fill_value)
            # The fill value is the new minimum; using the stale one would
            # send a column with minimum 0 down the unsigned path and
            # corrupt the -1 that was just written.
            mn = fill_value

        # A column is integer-like only if EVERY value survives a round trip
        # through int64. Summing the signed differences is not enough:
        # 0.5 and -0.5 cancel and a fractional column would be truncated.
        is_int = False
        if np.isfinite(values).all():
            as_int = values.astype(np.int64)
            is_int = bool((values == as_int).all())

        if is_int:
            candidates = unsigned_types if mn >= 0 else signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= mn and mx <= limits.max:
                    values = values.astype(candidate)
                    break
        else:
            # Make float datatypes 32 bit
            values = values.astype(np.float32)

        props[col] = values

        if verbose:
            print("dtype after: ",props[col].dtype)
            print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [None]:
# Shrink the training frame, then report which columns had gaps filled.
train, NAlist = reduce_mem_usage(train)
print(
    "_________________",
    "",
    "Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ",
    "_________________",
    "",
    NAlist,
    sep="\n",
)

In [None]:
# Remove the same low-importance columns from the test frame in one call
# (a per-column loop copies the whole frame on every iteration).
# errors='ignore' skips names that are not present in the frame.
test = test.drop(
    columns=least_imp_230_features_df['Feature Name'].tolist(),
    errors='ignore',
)

In [None]:
# Shrink the test frame, then report which columns had gaps filled.
test, NAlist = reduce_mem_usage(test)
print(
    "_________________",
    "",
    "Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ",
    "_________________",
    "",
    NAlist,
    sep="\n",
)

In [None]:
# Shapes of both frames after feature elimination and downcasting.
train.shape, test.shape

# Build the Light GBM Model

In [None]:
# LightGBM training parameters, passed unchanged to lgb.train below.
param = dict(
    bagging_freq=5,
    bagging_fraction=0.335,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.041,
    learning_rate=0.0083,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=-1,
)

In [None]:
num_folds = 5
features = [c for c in train.columns if c not in ['ID_code', 'target']]
# Sanity check: every training feature must also be a column of the test frame.
# (Comparing the list to test.columns.values with `==` is not a valid check:
# the test frame carries extra columns such as ID_code, so the lengths differ.)
print(all(c in test.columns for c in features))

# random_state only matters when shuffle=True; recent scikit-learn releases
# raise ValueError if it is set together with shuffle=False, so it is omitted.
# The folds are the same contiguous, unshuffled, stratified splits as before.
folds = StratifiedKFold(n_splits=num_folds, shuffle=False)
# Out-of-fold predictions: one slot per training row.
oof = np.zeros(len(train))
# getVal = np.zeros(len(train))
# Averaged test predictions: one slot per TEST row (not per training label).
predictions = np.zeros(len(test))
# feature_importance_df = pd.DataFrame()

In [None]:
# Train one LightGBM model per fold, collecting out-of-fold predictions for
# the training rows and averaging the per-fold predictions for the test rows.
# split() only needs the number of rows and the labels, so the frames are
# passed directly; `train.values` would first copy the whole frame (including
# the string ID column) into one object array.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
    
    X_train, y_train = train.iloc[trn_idx][features], target.iloc[trn_idx]
    X_valid, y_valid = train.iloc[val_idx][features], target.iloc[val_idx]
    
    print("Fold idx:{}".format(fold_))
    trn_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_valid, label=y_valid)
    
    # NOTE(review): the verbose_eval / early_stopping_rounds keywords are tied
    # to the LightGBM version this notebook was written for; newer releases
    # expect callbacks instead - confirm before upgrading.
    clf = lgb.train(param, trn_data, 10000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 4000)
    # X_valid already holds exactly these rows and columns; no need to slice again.
    oof[val_idx] = clf.predict(X_valid, num_iteration=clf.best_iteration)
    
#     fold_importance_df = pd.DataFrame()
#     fold_importance_df["feature"] = features
#     fold_importance_df["importance"] = clf.feature_importance()
#     fold_importance_df["fold"] = fold_ + 1
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    # Each fold contributes an equal share to the final test prediction.
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

In [None]:
# ROC AUC of the out-of-fold predictions against the training labels.
print(
    "\n >> CV score: {:<8.5f}".format(
        roc_auc_score(y_true=target, y_score=oof)
    )
)

In [None]:
# cols = (feature_importance_df[["feature", "importance"]]
#         .groupby("feature")
#         .mean()
#         .sort_values(by="importance", ascending=False)[:1000].index)
# best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

# plt.figure(figsize=(14,26))
# sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance",ascending=False))
# plt.title('LightGBM Features (averaged over folds)')
# plt.tight_layout()
# plt.savefig('lgbm_importances.png')

# Submission

In [None]:
# Pair each test ID with its averaged prediction and write the submission file.
submission = pd.DataFrame(
    {
        "ID_code": test["ID_code"].values,
        "target": predictions,
    }
)
submission.to_csv("santander_LGBM_baseline_submission.csv", index=False)

In [None]:
# submission.head()